summaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2010-02-02 07:36:55 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2010-02-02 07:36:55 +0000
commit19b0c72dd18050a00dd77bb3dfddd0d0702d157f (patch)
treec120e270496a1ecba0c2badd4b8bc2a59c7d0a93 /src/Text
parent70a7d7b2141472fe558bfc191f259667ed09f911 (diff)
Made HTML reader much more forgiving.
+ Incorporated idea (from HXT) that an element can be closed by an open tag for another element. + Javascript is partially parsed to make sure that a <script> section is not closed by a </script> in a comment or string. + More lenient non-quoted attribute values. Now we accept anything but a space character, quote, or <>. This helps in parsing e.g. www.google.com! + Bare & signs are now parsed as a string. This is a common HTML mistake. + Skip a bare < in malformed HTML. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1825 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs135
1 files changed, 106 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index c0df3aa65..58762c35f 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -50,6 +50,7 @@ import Data.Maybe ( fromMaybe )
import Data.List ( isPrefixOf, isSuffixOf, intercalate )
import Data.Char ( toLower, isAlphaNum )
import Network.URI ( parseURIReference, URI (..) )
+import Control.Monad ( liftM )
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ParserState -- ^ Parser state
@@ -71,7 +72,7 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
"br", "cite", "code", "dfn", "em", "font", "i", "img",
"input", "kbd", "label", "q", "s", "samp", "select",
"small", "span", "strike", "strong", "sub", "sup",
- "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
+ "textarea", "tt", "u", "var"]
-}
blockHtmlTags :: [[Char]]
@@ -80,7 +81,7 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
"h5", "h6", "head", "hr", "html", "isindex", "menu", "noframes",
"noscript", "ol", "p", "pre", "table", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
- "th", "thead", "tr", "script"] ++ eitherBlockOrInline
+ "th", "thead", "tr", "script", "style"]
sanitaryTags :: [[Char]]
sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
@@ -112,6 +113,40 @@ sanitaryAttributes = ["abbr", "accept", "accept-charset",
"summary", "tabindex", "target", "title", "type",
"usemap", "valign", "value", "vspace", "width"]
+-- taken from HXT and extended
+
+closes :: String -> String -> Bool
+"EOF" `closes` _ = True
+_ `closes` "body" = False
+_ `closes` "html" = False
+"a" `closes` "a" = True
+"li" `closes` "li" = True
+"th" `closes` t | t `elem` ["th","td"] = True
+"td" `closes` t | t `elem` ["th","td"] = True
+"tr" `closes` t | t `elem` ["th","td","tr"] = True
+"dt" `closes` t | t `elem` ["dt","dd"] = True
+"dd" `closes` t | t `elem` ["dt","dd"] = True
+"hr" `closes` "p" = True
+"p" `closes` "p" = True
+"meta" `closes` "meta" = True
+"colgroup" `closes` "colgroup" = True
+"form" `closes` "form" = True
+"label" `closes` "label" = True
+"map" `closes` "map" = True
+"object" `closes` "object" = True
+_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
+t `closes` "select" | t /= "option" = True
+"thead" `closes` t | t `elem` ["colgroup"] = True
+"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
+"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
+t `closes` t2 |
+ t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] &&
+ t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div"
+t1 `closes` t2 |
+ t1 `elem` blockHtmlTags &&
+ t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True
+_ `closes` _ = False
+
--
-- HTML utility functions
--
@@ -176,6 +211,19 @@ extractTagType ('<':rest) =
map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
extractTagType _ = ""
+-- Parse any HTML tag (opening or self-closing) and return tag type
+anyOpener :: GenParser Char ParserState [Char]
+anyOpener = try $ do
+ char '<'
+ spaces
+ tag <- many1 alphaNum
+ skipMany htmlAttribute
+ spaces
+ option "" (string "/")
+ spaces
+ char '>'
+ return $ map toLower tag
+
-- | Parse any HTML tag (opening or self-closing) and return text of tag
anyHtmlTag :: GenParser Char ParserState [Char]
anyHtmlTag = try $ do
@@ -257,32 +305,30 @@ htmlRegularAttribute = try $ do
(content, quoteStr) <- choice [ (quoted '\''),
(quoted '"'),
(do
- a <- many (alphaNum <|> (oneOf "-._:"))
+ a <- many (noneOf " \t\n\r\"'<>")
return (a,"")) ]
return (name, content,
(name ++ "=" ++ quoteStr ++ content ++ quoteStr))
-- | Parse an end tag of type 'tag'
-htmlEndTag :: [Char] -> GenParser Char st [Char]
+htmlEndTag :: [Char] -> GenParser Char ParserState [Char]
htmlEndTag tag = try $ do
- char '<'
- spaces
- char '/'
- spaces
- stringAnyCase tag
- spaces
- char '>'
- return $ "</" ++ tag ++ ">"
-
-{-
--- | Returns @True@ if the tag is (or can be) an inline tag.
-isInline :: String -> Bool
-isInline tag = (extractTagType tag) `elem` inlineHtmlTags
--}
+ closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $
+ anyOpener <|> (eof >> return "EOF")
+ if closedByNext
+ then return ""
+ else do char '<'
+ spaces
+ char '/'
+ spaces
+ stringAnyCase tag
+ spaces
+ char '>'
+ return $ "</" ++ tag ++ ">"
-- | Returns @True@ if the tag is (or can be) a block tag.
isBlock :: String -> Bool
-isBlock tag = (extractTagType tag) `elem` blockHtmlTags
+isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline)
anyHtmlBlockTag :: GenParser Char ParserState [Char]
anyHtmlBlockTag = try $ do
@@ -298,18 +344,43 @@ anyHtmlInlineTag = try $ do
-- Scripts must be treated differently, because they can contain '<>' etc.
htmlScript :: GenParser Char ParserState [Char]
htmlScript = try $ do
- open <- string "<script"
- rest <- manyTill anyChar (htmlEndTag "script")
+ lookAhead $ htmlTag "script"
+ open <- anyHtmlTag
+ rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
st <- getState
if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
then return "<!-- unsafe HTML removed -->"
else return $ open ++ rest ++ "</script>"
+scriptChunk :: GenParser Char ParserState [Char]
+scriptChunk = jsComment <|> jsString <|> jsChars
+ where jsComment = jsEndlineComment <|> jsMultilineComment
+ jsString = jsSingleQuoteString <|> jsDoubleQuoteString
+ jsChars = many1 (noneOf "<\"'*/") <|> count 1 anyChar
+ jsEndlineComment = try $ do
+ string "//"
+ res <- manyTill anyChar newline
+ return ("//" ++ res)
+ jsMultilineComment = try $ do
+ string "/*"
+ res <- manyTill anyChar (try $ string "*/")
+ return ("/*" ++ res ++ "*/")
+ jsSingleQuoteString = stringwith '\''
+ jsDoubleQuoteString = stringwith '"'
+ charWithEsc escapable = try $
+ (try $ char '\\' >> oneOf ('\\':escapable) >>= \x -> return ['\\',x])
+ <|> count 1 anyChar
+ stringwith c = try $ do
+ char c
+ res <- liftM concat $ manyTill (charWithEsc [c]) (char c)
+ return (c : (res ++ [c]))
+
-- | Parses material between style tags.
-- Style tags must be treated differently, because they can contain CSS
htmlStyle :: GenParser Char ParserState [Char]
htmlStyle = try $ do
- open <- string "<style"
+ lookAhead $ htmlTag "style"
+ open <- anyHtmlTag
rest <- manyTill anyChar (htmlEndTag "style")
st <- getState
if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
@@ -404,6 +475,14 @@ bodyTitle = try $ do
_ -> fail "not title"
inlinesTilEnd "h1"
+endOfDoc :: GenParser Char ParserState ()
+endOfDoc = try $ do
+ spaces
+ optional (htmlEndTag "body")
+ spaces
+ optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html>
+ eof
+
parseHtml :: GenParser Char ParserState Pandoc
parseHtml = do
sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
@@ -415,11 +494,7 @@ parseHtml = do
spaces
optional bodyTitle -- skip title in body, because it's represented in meta
blocks <- parseBlocks
- spaces
- optional (htmlEndTag "body")
- spaces
- optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
- eof
+ endOfDoc
return $ Pandoc meta blocks
--
@@ -438,6 +513,7 @@ block = choice [ codeBlock
, para
, plain
, rawHtmlBlock'
+ , notFollowedBy' endOfDoc >> char '<' >> return Null
] <?> "block"
--
@@ -586,6 +662,7 @@ inline = choice [ charRef
, link
, image
, rawHtmlInline
+ , char '&' >> return (Str "&") -- common HTML error
] <?> "inline"
code :: GenParser Char ParserState Inline
@@ -599,7 +676,7 @@ code = try $ do
rawHtmlInline :: GenParser Char ParserState Inline
rawHtmlInline = do
- result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag
+ result <- anyHtmlInlineTag <|> htmlComment
state <- getState
if stateParseRaw state then return (HtmlInline result) else return (Str "")
@@ -640,7 +717,7 @@ linebreak :: GenParser Char ParserState Inline
linebreak = htmlTag "br" >> optional newline >> return LineBreak
str :: GenParser Char st Inline
-str = many1 (noneOf "<& \t\n") >>= return . Str
+str = many1 (noneOf "< \t\n&") >>= return . Str
--
-- links and images