summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-12-30 22:51:49 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-12-30 22:51:49 +0000
commit4ea1b2bdc0a86f135bae4ae95cfc3d45a9416604 (patch)
tree55cbdff00c136a483f5a280c07930635d58c9e3b /src/Text/Pandoc/Readers/HTML.hs
parent7cd9db048b9c29238efd1cecda65264db4223dcd (diff)
Merged 'strict' branch from r324. This adds a '--strict'
option to pandoc, which forces it to stay as close as possible to official Markdown syntax. git-svn-id: https://pandoc.googlecode.com/svn/trunk@347 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs24
1 files changed, 17 insertions, 7 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 2bf75654c..9beaaacff 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -32,7 +32,12 @@ module Text.Pandoc.Readers.HTML (
rawHtmlInline,
rawHtmlBlock,
anyHtmlBlockTag,
- anyHtmlInlineTag
+ anyHtmlInlineTag,
+ anyHtmlTag,
+ anyHtmlEndTag,
+ htmlEndTag,
+ extractTagType,
+ htmlBlockElement
) where
import Text.Regex ( matchRegex, mkRegex )
@@ -78,17 +83,18 @@ inlinesTilEnd tag = try (do
inlines <- manyTill inline (htmlEndTag tag)
return inlines)
--- extract type from a tag: e.g. br from <br>, < br >, </br>, etc.
+-- | Extract type from a tag: e.g. 'br' from '<br>'
extractTagType tag =
case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of
Just [match] -> (map toLower match)
Nothing -> ""
+-- | Parse any HTML tag (closing or opening) and return text of tag
anyHtmlTag = try (do
char '<'
spaces
tag <- many1 alphaNum
- attribs <- htmlAttributes
+ attribs <- htmlAttributes
spaces
ender <- option "" (string "/")
let ender' = if (null ender) then "" else " /"
@@ -150,9 +156,10 @@ htmlRegularAttribute = try (do
(do
a <- many (alphaNum <|> (oneOf "-._:"))
return (a,"")) ]
- return (name, content,
+ return (name, content,
(" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
+-- | Parse an end tag of type 'tag'
htmlEndTag tag = try (do
char '<'
spaces
@@ -174,20 +181,23 @@ anyHtmlInlineTag = try (do
tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
if isInline tag then return tag else fail "not an inline tag")
--- scripts must be treated differently, because they can contain <> etc.
+-- | Parses material between script tags.
+-- Scripts must be treated differently, because they can contain '<>' etc.
htmlScript = try (do
open <- string "<script"
rest <- manyTill anyChar (htmlEndTag "script")
return (open ++ rest ++ "</script>"))
+htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
+
rawHtmlBlock = try (do
notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"])
- body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec,
- definition]
+ body <- htmlBlockElement <|> anyHtmlBlockTag
sp <- (many space)
state <- getState
if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null)
+-- | Parses an HTML comment.
htmlComment = try (do
string "<!--"
comment <- manyTill anyChar (try (string "-->"))