HTML reader: code cleanup + parse <tt> as Code.

Partially resolves Issue #247.
author: John MacFarlane <jgm@berkeley.edu> 2010-07-14 09:39:48 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2010-07-14 09:39:48 -0700
commit: 68e3f83545ab4da7004ee36080e308dafa244aee (patch)
tree: 40fd9031676fb4c003a85b7aba520a63ad1d9a8c /src/Text/Pandoc
parent: e84126a11423e6dc339dc7abbdbcbbde6d61ae05 (diff)
1 files changed, 47 insertions, 34 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 6d54e7349..f47309d3f 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -51,7 +51,7 @@ import Data.Maybe ( fromMaybe )
 import Data.List ( isPrefixOf, isSuffixOf, intercalate )
 import Data.Char ( toLower, isAlphaNum )
 import Network.URI ( parseURIReference, URI (..) )
-import Control.Monad ( liftM )
+import Control.Monad ( liftM, when )
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
 readHtml :: ParserState   -- ^ Parser state
@@ -199,11 +199,11 @@ inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
 
 -- | Parse blocks between open and close tag.
 blocksIn :: String -> GenParser Char ParserState [Block]
-blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
+blocksIn tag = try $ htmlOpenTag tag >> spaces >> blocksTilEnd tag
 
 -- | Parse inlines between open and close tag.
 inlinesIn :: String -> GenParser Char ParserState [Inline]
-inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
+inlinesIn tag = try $ htmlOpenTag tag >> spaces >> inlinesTilEnd tag
 
 -- | Extract type from a tag:  e.g. @br@ from @\<br\>@
 extractTagType :: String -> String
@@ -259,18 +259,33 @@ anyHtmlEndTag = try $ do
      then return $ "<!-- unsafe HTML removed -->"
      else return result 
 
-htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
-htmlTag tag = try $ do
+htmlTag :: Bool
+        -> String
+        -> GenParser Char ParserState (String, [(String, String)])
+htmlTag selfClosing tag = try $ do
   char '<'
   spaces
   stringAnyCase tag
   attribs <- many htmlAttribute
   spaces
-  optional (string "/")
-  spaces
+  -- note: we want to handle both HTML and XHTML,
+  -- so we don't require the /
+  when selfClosing $ optional $ char '/' >> spaces
   char '>'
   return (tag, (map (\(name, content, _) -> (name, content)) attribs))
 
+htmlOpenTag :: String
+             -> GenParser Char ParserState (String, [(String, String)])
+htmlOpenTag = htmlTag False
+
+htmlCloseTag :: String
+             -> GenParser Char ParserState (String, [(String, String)])
+htmlCloseTag = htmlTag False . ('/':)
+
+htmlSelfClosingTag :: String
+                   -> GenParser Char ParserState (String, [(String, String)])
+htmlSelfClosingTag = htmlTag True
+
 -- parses a quoted html attribute value
 quoted :: Char -> GenParser Char st (String, String)
 quoted quoteChar = do
@@ -345,7 +360,7 @@ anyHtmlInlineTag = try $ do
 -- Scripts must be treated differently, because they can contain '<>' etc.
 htmlScript :: GenParser Char ParserState [Char]
 htmlScript = try $ do
-  lookAhead $ htmlTag "script"
+  lookAhead $ htmlOpenTag "script"
   open <- anyHtmlTag
   rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
   st <- getState
@@ -380,7 +395,7 @@ scriptChunk = jsComment <|> jsString <|> jsChars
 -- Style tags must be treated differently, because they can contain CSS
 htmlStyle :: GenParser Char ParserState [Char]
 htmlStyle = try $ do
-  lookAhead $ htmlTag "style"
+  lookAhead $ htmlOpenTag "style"
   open <- anyHtmlTag
   rest <- manyTill anyChar (htmlEndTag "style")
   st <- getState
@@ -412,7 +427,8 @@ rawVerbatimBlock = try $ do
 -- We don't want to parse </body> or </html> as raw HTML, since these
 -- are handled in parseHtml.
 rawHtmlBlock' :: GenParser Char ParserState Block
-rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
+rawHtmlBlock' = do notFollowedBy' (htmlCloseTag "body" <|>
+                                   htmlCloseTag "html")
                    rawHtmlBlock
 
 -- | Parses an HTML comment.
@@ -442,13 +458,13 @@ definition = try $ do
 
 nonTitleNonHead :: GenParser Char ParserState Char
 nonTitleNonHead = try $ do
-  notFollowedBy $ (htmlTag "title" >> return ' ') <|> 
+  notFollowedBy $ (htmlOpenTag "title" >> return ' ') <|> 
                   (htmlEndTag "head" >> return ' ')
   (rawHtmlBlock >> return ' ') <|> anyChar
 
 parseTitle :: GenParser Char ParserState [Inline]
 parseTitle = try $ do
-  (tag, _) <- htmlTag "title"
+  (tag, _) <- htmlOpenTag "title"
   contents <- inlinesTilEnd tag
   spaces
   return contents
@@ -456,7 +472,7 @@ parseTitle = try $ do
 -- parse header and return meta-information (for now, just title)
 parseHead :: GenParser Char ParserState Meta
 parseHead = try $ do
-  htmlTag "head"
+  htmlOpenTag "head"
   spaces
   skipMany nonTitleNonHead
   contents <- option [] parseTitle
@@ -464,13 +480,10 @@ parseHead = try $ do
   htmlEndTag "head"
   return $ Meta contents [] []
 
-skipHtmlTag :: String -> GenParser Char ParserState ()
-skipHtmlTag tag = optional (htmlTag tag)
-
 -- h1 class="title" representation of title in body
 bodyTitle :: GenParser Char ParserState [Inline]
 bodyTitle = try $ do
-  (_, attribs) <- htmlTag "h1"  
+  (_, attribs) <- htmlOpenTag "h1"  
   case (extractAttribute "class" attribs) of
        Just "title" -> return ""
        _            -> fail "not title"
@@ -488,11 +501,11 @@ parseHtml :: GenParser Char ParserState Pandoc
 parseHtml = do
   sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
   spaces
-  skipHtmlTag "html"
+  optional $ htmlOpenTag "html"
   spaces
   meta <- option (Meta [] [] []) parseHead
   spaces
-  skipHtmlTag "body"
+  optional $ htmlOpenTag "body"
   spaces
   optional bodyTitle  -- skip title in body, because it's represented in meta
   blocks <- parseBlocks
@@ -528,7 +541,7 @@ header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
 headerLevel :: Int -> GenParser Char ParserState Block
 headerLevel n = try $ do
     let level = "h" ++ show n
-    htmlTag level
+    htmlOpenTag level
     contents <- inlinesTilEnd level
     return $ Header n (normalizeSpaces contents)
 
@@ -538,7 +551,7 @@ headerLevel n = try $ do
 
 hrule :: GenParser Char ParserState Block
 hrule = try  $ do
-  (_, attribs) <- htmlTag "hr"
+  (_, attribs) <- htmlSelfClosingTag "hr"
   state <- getState
   if not (null attribs) && stateParseRaw state
      then unexpected "attributes in hr" -- parse as raw in this case
@@ -552,7 +565,7 @@ hrule = try  $ do
 -- skipped, because they are not portable to output formats other than HTML.
 codeBlock :: GenParser Char ParserState Block
 codeBlock = try $ do
-    htmlTag "pre" 
+    htmlOpenTag "pre" 
     result <- manyTill 
               (many1 (satisfy (/= '<')) <|> 
                ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
@@ -573,7 +586,7 @@ codeBlock = try $ do
 --
 
 blockQuote :: GenParser Char ParserState Block
-blockQuote = try $ htmlTag "blockquote" >> spaces >> 
+blockQuote = try $ htmlOpenTag "blockquote" >> spaces >> 
                    blocksTilEnd "blockquote" >>= (return . BlockQuote)
 
 --
@@ -585,7 +598,7 @@ list = choice [ bulletList, orderedList, definitionList ] <?> "list"
 
 orderedList :: GenParser Char ParserState Block
 orderedList = try $ do
-  (_, attribs) <- htmlTag "ol"
+  (_, attribs) <- htmlOpenTag "ol"
   (start, style) <- option (1, DefaultStyle) $
                            do failIfStrict
                               let sta = fromMaybe "1" $ 
@@ -610,7 +623,7 @@ orderedList = try $ do
 
 bulletList :: GenParser Char ParserState Block
 bulletList = try $ do
-  htmlTag "ul"
+  htmlOpenTag "ul"
   spaces
   -- note: if they have an <ol> or <ul> not in scope of a <li>,
   -- treat it as a list item, though it's not valid xhtml...
@@ -621,7 +634,7 @@ bulletList = try $ do
 definitionList :: GenParser Char ParserState Block
 definitionList = try $ do
   failIfStrict  -- def lists not part of standard markdown
-  htmlTag "dl"
+  htmlOpenTag "dl"
   spaces
   items <- sepEndBy1 definitionListItem spaces
   htmlEndTag "dl"
@@ -639,7 +652,7 @@ definitionListItem = try $ do
 --
 
 para :: GenParser Char ParserState Block
-para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= 
+para = try $ htmlOpenTag "p" >> inlinesTilEnd "p" >>= 
              return . Para . normalizeSpaces
 
 -- 
@@ -673,8 +686,8 @@ inline = choice [ charRef
 
 code :: GenParser Char ParserState Inline
 code = try $ do 
-  htmlTag "code"
-  result <- manyTill anyChar (htmlEndTag "code")
+  result <- (htmlOpenTag "code" >> manyTill anyChar (htmlEndTag "code"))
+        <|> (htmlOpenTag "tt"   >> manyTill anyChar (htmlEndTag "tt"))
   -- remove internal line breaks, leading and trailing space,
   -- and decode character references
   return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ 
@@ -687,7 +700,7 @@ rawHtmlInline = do
   if stateParseRaw state then return (HtmlInline result) else return (Str "")
 
 betweenTags :: [Char] -> GenParser Char ParserState [Inline]
-betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= 
+betweenTags tag = try $ htmlOpenTag tag >> inlinesTilEnd tag >>= 
                         return . normalizeSpaces
 
 emph :: GenParser Char ParserState Inline
@@ -709,7 +722,7 @@ strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
 spanStrikeout :: GenParser Char ParserState Inline
 spanStrikeout = try $ do
   failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
-  (_, attributes) <- htmlTag "span" 
+  (_, attributes) <- htmlOpenTag "span" 
   result <- case (extractAttribute "class" attributes) of
               Just "strikeout" -> inlinesTilEnd "span"
               _                -> fail "not a strikeout"
@@ -720,7 +733,7 @@ whitespace = many1 space >> return Space
 
 -- hard line break
 linebreak :: GenParser Char ParserState Inline
-linebreak = htmlTag "br" >> optional newline >> return LineBreak
+linebreak = htmlSelfClosingTag "br" >> optional newline >> return LineBreak
 
 str :: GenParser Char st Inline
 str = many1 (noneOf "< \t\n&") >>= return . Str
@@ -741,7 +754,7 @@ extractAttribute name ((attrName, contents):rest) =
 
 link :: GenParser Char ParserState Inline
 link = try $ do
-  (_, attributes) <- htmlTag "a"  
+  (_, attributes) <- htmlOpenTag "a"  
   url <- case (extractAttribute "href" attributes) of
            Just url -> return url
            Nothing  -> fail "no href"
@@ -751,7 +764,7 @@ link = try $ do
 
 image :: GenParser Char ParserState Inline
 image = try $ do
-  (_, attributes) <- htmlTag "img" 
+  (_, attributes) <- htmlSelfClosingTag "img"
   url <- case (extractAttribute "src" attributes) of
            Just url -> return url
            Nothing  -> fail "no src"
author	John MacFarlane <jgm@berkeley.edu>	2010-07-14 09:39:48 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2010-07-14 09:39:48 -0700
commit	68e3f83545ab4da7004ee36080e308dafa244aee (patch)
tree	40fd9031676fb4c003a85b7aba520a63ad1d9a8c /src/Text/Pandoc
parent	e84126a11423e6dc339dc7abbdbcbbde6d61ae05 (diff)