Made HTML reader much more forgiving.

+ Incorporated idea (from HXT) that an element can be closed by an open tag for another element. + Javascript is partially parsed to make sure that a <script> section is not closed by a </script> in a comment or string. + More lenient non-quoted attribute values. Now we accept anything but a space character, quote, or <>. This helps in parsing e.g. www.google.com! + Bare & signs are now parsed as a string. This is a common HTML mistake. + Skip a bare < in malformed HTML. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1825 788f1e2b-df1e-0410-8736-df70ead52e1b
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2010-02-02 07:36:55 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2010-02-02 07:36:55 +0000
commit: 19b0c72dd18050a00dd77bb3dfddd0d0702d157f (patch)
tree: c120e270496a1ecba0c2badd4b8bc2a59c7d0a93 /src/Text
parent: 70a7d7b2141472fe558bfc191f259667ed09f911 (diff)
1 files changed, 106 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index c0df3aa65..58762c35f 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -50,6 +50,7 @@ import Data.Maybe ( fromMaybe )
 import Data.List ( isPrefixOf, isSuffixOf, intercalate )
 import Data.Char ( toLower, isAlphaNum )
 import Network.URI ( parseURIReference, URI (..) )
+import Control.Monad ( liftM )
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
 readHtml :: ParserState   -- ^ Parser state
@@ -71,7 +72,7 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
                   "br", "cite", "code", "dfn", "em", "font", "i", "img",
                   "input", "kbd", "label", "q", "s", "samp", "select",
                   "small", "span", "strike", "strong", "sub", "sup",
-                  "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
+                  "textarea", "tt", "u", "var"]
 -}
 
 blockHtmlTags :: [[Char]]
@@ -80,7 +81,7 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
                  "h5", "h6", "head", "hr", "html", "isindex", "menu", "noframes",
                  "noscript", "ol", "p", "pre", "table", "ul", "dd",
                  "dt", "frameset", "li", "tbody", "td", "tfoot",
-                 "th", "thead", "tr", "script"] ++ eitherBlockOrInline
+                 "th", "thead", "tr", "script", "style"]
 
 sanitaryTags :: [[Char]]
 sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
@@ -112,6 +113,40 @@ sanitaryAttributes = ["abbr", "accept", "accept-charset",
                       "summary", "tabindex", "target", "title", "type",
                       "usemap", "valign", "value", "vspace", "width"]
 
+-- taken from HXT and extended
+
+closes :: String -> String -> Bool
+"EOF" `closes` _ = True
+_ `closes` "body" = False
+_ `closes` "html" = False
+"a" `closes` "a" = True
+"li" `closes` "li" = True
+"th" `closes` t | t `elem` ["th","td"] = True
+"td" `closes` t | t `elem` ["th","td"] = True
+"tr" `closes` t | t `elem` ["th","td","tr"] = True
+"dt" `closes` t | t `elem` ["dt","dd"] = True
+"dd" `closes` t | t `elem` ["dt","dd"] = True
+"hr" `closes` "p" = True
+"p" `closes` "p" = True
+"meta" `closes` "meta" = True
+"colgroup" `closes` "colgroup" = True
+"form" `closes` "form" = True
+"label" `closes` "label" = True
+"map" `closes` "map" = True
+"object" `closes` "object" = True
+_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
+t `closes` "select" | t /= "option" = True
+"thead" `closes` t | t `elem` ["colgroup"] = True
+"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
+"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
+t `closes` t2 |
+   t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] &&
+   t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div"
+t1 `closes` t2 |
+   t1 `elem` blockHtmlTags &&
+   t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True
+_ `closes` _ = False
+
 --
 -- HTML utility functions
 --
@@ -176,6 +211,19 @@ extractTagType ('<':rest) =
   map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
 extractTagType _ = ""
 
+-- Parse any HTML tag (opening or self-closing) and return tag type
+anyOpener :: GenParser Char ParserState [Char]
+anyOpener = try $ do
+  char '<'
+  spaces
+  tag <- many1 alphaNum
+  skipMany htmlAttribute
+  spaces
+  option "" (string "/")
+  spaces
+  char '>'
+  return $ map toLower tag 
+
 -- | Parse any HTML tag (opening or self-closing) and return text of tag
 anyHtmlTag :: GenParser Char ParserState [Char]
 anyHtmlTag = try $ do
@@ -257,32 +305,30 @@ htmlRegularAttribute = try $ do
   (content, quoteStr) <- choice [ (quoted '\''), 
                                   (quoted '"'), 
                                   (do
-                                     a <- many (alphaNum <|> (oneOf "-._:"))
+                                     a <- many (noneOf " \t\n\r\"'<>")
                                      return (a,"")) ]
   return (name, content,
           (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
 
 -- | Parse an end tag of type 'tag'
-htmlEndTag :: [Char] -> GenParser Char st [Char]
+htmlEndTag :: [Char] -> GenParser Char ParserState [Char]
 htmlEndTag tag = try $ do
-  char '<'   
-  spaces
-  char '/'
-  spaces
-  stringAnyCase tag
-  spaces
-  char '>'
-  return $ "</" ++ tag ++ ">"
-
-{-
--- | Returns @True@ if the tag is (or can be) an inline tag.
-isInline :: String -> Bool
-isInline tag = (extractTagType tag) `elem` inlineHtmlTags
--}
+  closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $
+                   anyOpener <|> (eof >> return "EOF")
+  if closedByNext
+     then return ""
+     else do char '<'   
+             spaces
+             char '/'
+             spaces
+             stringAnyCase tag
+             spaces
+             char '>'
+             return $ "</" ++ tag ++ ">"
 
 -- | Returns @True@ if the tag is (or can be) a block tag.
 isBlock :: String -> Bool
-isBlock tag = (extractTagType tag) `elem` blockHtmlTags 
+isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline)
 
 anyHtmlBlockTag :: GenParser Char ParserState [Char]
 anyHtmlBlockTag = try $ do
@@ -298,18 +344,43 @@ anyHtmlInlineTag = try $ do
 -- Scripts must be treated differently, because they can contain '<>' etc.
 htmlScript :: GenParser Char ParserState [Char]
 htmlScript = try $ do
-  open <- string "<script"
-  rest <- manyTill anyChar (htmlEndTag "script")
+  lookAhead $ htmlTag "script"
+  open <- anyHtmlTag
+  rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
   st <- getState
   if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
      then return "<!-- unsafe HTML removed -->"
      else return $ open ++ rest ++ "</script>"
 
+scriptChunk :: GenParser Char ParserState [Char]
+scriptChunk = jsComment <|> jsString <|> jsChars
+  where jsComment = jsEndlineComment <|> jsMultilineComment
+        jsString  = jsSingleQuoteString <|> jsDoubleQuoteString
+        jsChars   = many1 (noneOf "<\"'*/") <|> count 1 anyChar
+        jsEndlineComment = try $ do
+           string "//"
+           res <- manyTill anyChar newline
+           return ("//" ++ res)
+        jsMultilineComment = try $ do
+           string "/*"
+           res <- manyTill anyChar (try $ string "*/")
+           return ("/*" ++ res ++ "*/")
+        jsSingleQuoteString = stringwith '\''
+        jsDoubleQuoteString = stringwith '"'
+        charWithEsc escapable = try $
+           (try $ char '\\' >> oneOf ('\\':escapable) >>= \x -> return ['\\',x])
+          <|> count 1 anyChar
+        stringwith c = try $ do
+           char c
+           res <- liftM concat $ manyTill (charWithEsc [c]) (char c)
+           return (c : (res ++ [c]))
+
 -- | Parses material between style tags.
 -- Style tags must be treated differently, because they can contain CSS
 htmlStyle :: GenParser Char ParserState [Char]
 htmlStyle = try $ do
-  open <- string "<style"
+  lookAhead $ htmlTag "style"
+  open <- anyHtmlTag
   rest <- manyTill anyChar (htmlEndTag "style")
   st <- getState
   if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
@@ -404,6 +475,14 @@ bodyTitle = try $ do
        _            -> fail "not title"
   inlinesTilEnd "h1"
 
+endOfDoc :: GenParser Char ParserState ()
+endOfDoc = try $ do
+  spaces
+  optional (htmlEndTag "body")
+  spaces
+  optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html>
+  eof 
+
 parseHtml :: GenParser Char ParserState Pandoc
 parseHtml = do
   sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
@@ -415,11 +494,7 @@ parseHtml = do
   spaces
   optional bodyTitle  -- skip title in body, because it's represented in meta
   blocks <- parseBlocks
-  spaces
-  optional (htmlEndTag "body")
-  spaces
-  optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
-  eof
+  endOfDoc
   return $ Pandoc meta blocks
 
 --
@@ -438,6 +513,7 @@ block = choice [ codeBlock
                , para
                , plain
                , rawHtmlBlock'
+               , notFollowedBy' endOfDoc >> char '<' >> return Null
                ] <?> "block"
 
 --
@@ -586,6 +662,7 @@ inline = choice [ charRef
                 , link
                 , image
                 , rawHtmlInline
+                , char '&' >> return (Str "&") -- common HTML error
                 ] <?> "inline"
 
 code :: GenParser Char ParserState Inline
@@ -599,7 +676,7 @@ code = try $ do
 
 rawHtmlInline :: GenParser Char ParserState Inline
 rawHtmlInline = do
-  result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag
+  result <- anyHtmlInlineTag <|> htmlComment
   state <- getState
   if stateParseRaw state then return (HtmlInline result) else return (Str "")
 
@@ -640,7 +717,7 @@ linebreak :: GenParser Char ParserState Inline
 linebreak = htmlTag "br" >> optional newline >> return LineBreak
 
 str :: GenParser Char st Inline
-str = many1 (noneOf "<& \t\n") >>= return . Str
+str = many1 (noneOf "< \t\n&") >>= return . Str
 
 --
 -- links and images
author	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2010-02-02 07:36:55 +0000
committer	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2010-02-02 07:36:55 +0000
commit	19b0c72dd18050a00dd77bb3dfddd0d0702d157f (patch)
tree	c120e270496a1ecba0c2badd4b8bc2a59c7d0a93 /src/Text
parent	70a7d7b2141472fe558bfc191f259667ed09f911 (diff)