summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs1105
1 files changed, 425 insertions, 680 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index f47309d3f..18e3113d3 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -27,43 +27,397 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Conversion of HTML to 'Pandoc' document.
-}
-module Text.Pandoc.Readers.HTML (
- readHtml,
- rawHtmlInline,
- rawHtmlBlock,
- anyHtmlBlockTag,
- anyHtmlInlineTag,
- anyHtmlTag,
- anyHtmlEndTag,
- htmlEndTag,
- extractTagType,
- htmlBlockElement,
- htmlComment,
- unsanitaryURI
+module Text.Pandoc.Readers.HTML ( readHtml
+ , htmlTag
+ , htmlInBalanced
+ , isInlineTag
+ , isBlockTag
+ , isTextTag
+ , isCommentTag
) where
import Text.ParserCombinators.Parsec
+import Text.ParserCombinators.Parsec.Pos
+import Text.HTML.TagSoup
+import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
+import Text.Pandoc.Builder (text, toList)
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
-import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
-import Data.Maybe ( fromMaybe )
-import Data.List ( isPrefixOf, isSuffixOf, intercalate )
-import Data.Char ( toLower, isAlphaNum )
-import Network.URI ( parseURIReference, URI (..) )
-import Control.Monad ( liftM, when )
+import Data.Maybe ( fromMaybe, isJust )
+import Data.List ( intercalate )
+import Data.Char ( isSpace, isDigit )
+import Control.Monad ( liftM, guard )
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ParserState -- ^ Parser state
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> Pandoc
-readHtml = readWith parseHtml
+readHtml st inp = Pandoc meta blocks
+ where blocks = readWith parseBody st rest
+ tags = canonicalizeTags $
+ parseTagsOptions parseOptions{ optTagPosition = True } inp
+ hasHeader = any (~== TagOpen "head" []) tags
+ (meta, rest) = if hasHeader
+ then parseHeader tags
+ else (Meta [] [] [], tags)
+
+type TagParser = GenParser (Tag String) ParserState
+
+-- TODO - fix this - not every header has a title tag
+parseHeader :: [Tag String] -> (Meta, [Tag String])
+parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest)
+ where (tit,_) = break (~== TagClose "title") $ drop 1 $
+ dropWhile (\t -> not $ t ~== TagOpen "title" []) tags
+ tit' = concatMap fromTagText $ filter isTagText tit
+ tit'' = normalizeSpaces $ toList $ text tit'
+ rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" ||
+ t ~== TagOpen "body" []) tags
+
+parseBody :: TagParser [Block]
+parseBody = liftM concat $ manyTill block eof
+
+block :: TagParser [Block]
+block = choice
+ [ pPara
+ , pHeader
+ , pBlockQuote
+ , pCodeBlock
+ , pList
+ , pHrule
+ , pSimpleTable
+ , pPlain
+ , pRawHtmlBlock
+ ]
+
+renderTags' :: [Tag String] -> String
+renderTags' = renderTagsOptions
+ renderOptions{ optMinimize = (`elem` ["hr","br","img"]) }
+
+pList :: TagParser [Block]
+pList = pBulletList <|> pOrderedList <|> pDefinitionList
+
+pBulletList :: TagParser [Block]
+pBulletList = try $ do
+ pSatisfy (~== TagOpen "ul" [])
+ let nonItem = pSatisfy (\t ->
+ not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
+ not (t ~== TagClose "ul"))
+ -- note: if they have an <ol> or <ul> not in scope of a <li>,
+ -- treat it as a list item, though it's not valid xhtml...
+ skipMany nonItem
+ items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
+ return [BulletList items]
+
+pOrderedList :: TagParser [Block]
+pOrderedList = try $ do
+ TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
+ st <- getState
+ let (start, style) = if stateStrict st
+ then (1, DefaultStyle)
+ else (sta', sty')
+ where sta = fromMaybe "1" $
+ lookup "start" attribs
+ sta' = if all isDigit sta
+ then read sta
+ else 1
+ sty = fromMaybe (fromMaybe "" $
+ lookup "style" attribs) $
+ lookup "class" attribs
+ sty' = case sty of
+ "lower-roman" -> LowerRoman
+ "upper-roman" -> UpperRoman
+ "lower-alpha" -> LowerAlpha
+ "upper-alpha" -> UpperAlpha
+ "decimal" -> Decimal
+ _ -> DefaultStyle
+ let nonItem = pSatisfy (\t ->
+ not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
+ not (t ~== TagClose "ol"))
+ -- note: if they have an <ol> or <ul> not in scope of a <li>,
+ -- treat it as a list item, though it's not valid xhtml...
+ skipMany nonItem
+ items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
+ return [OrderedList (start, style, DefaultDelim) items]
+
+pDefinitionList :: TagParser [Block]
+pDefinitionList = try $ do
+ pSatisfy (~== TagOpen "dl" [])
+ items <- manyTill pDefListItem (pCloses "dl")
+ return [DefinitionList items]
+
+pDefListItem :: TagParser ([Inline],[[Block]])
+pDefListItem = try $ do
+ let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
+ not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
+ terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
+ defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
+ skipMany nonItem
+ let term = intercalate [LineBreak] terms
+ return (term, defs)
+
+pRawTag :: TagParser String
+pRawTag = do
+ tag <- pAnyTag
+ let ignorable x = x `elem` ["html","head","body"]
+ if tagOpen ignorable (const True) tag || tagClose ignorable tag
+ then return []
+ else return $ renderTags' [tag]
+
+pRawHtmlBlock :: TagParser [Block]
+pRawHtmlBlock = do
+ raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
+ state <- getState
+ if stateParseRaw state && not (null raw)
+ then return [RawBlock "html" raw]
+ else return []
+
+pHtmlBlock :: String -> TagParser String
+pHtmlBlock t = try $ do
+ open <- pSatisfy (~== TagOpen t [])
+ contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
+ return $ renderTags' $ [open] ++ contents ++ [TagClose t]
+
+pHeader :: TagParser [Block]
+pHeader = try $ do
+ TagOpen tagtype attr <- pSatisfy $
+ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
+ (const True)
+ let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
+ let level = read (drop 1 tagtype)
+ contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof)
+ return $ if bodyTitle
+ then [] -- skip a representation of the title in the body
+ else [Header level $ normalizeSpaces contents]
+
+pHrule :: TagParser [Block]
+pHrule = do
+ pSelfClosing (=="hr") (const True)
+ return [HorizontalRule]
+
+pSimpleTable :: TagParser [Block]
+pSimpleTable = try $ do
+ TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
+ skipMany pBlank
+ head' <- option [] $ pInTags "th" pTd
+ rows <- many1 $ try $
+ skipMany pBlank >> pInTags "tr" pTd
+ skipMany pBlank
+ TagClose _ <- pSatisfy (~== TagClose "table")
+ let cols = maximum $ map length rows
+ let aligns = replicate cols AlignLeft
+ let widths = replicate cols 0
+ return [Table [] aligns widths head' rows]
+
+pTd :: TagParser [TableCell]
+pTd = try $ do
+ skipMany pBlank
+ res <- pInTags "td" pPlain
+ skipMany pBlank
+ return [res]
+
+pBlockQuote :: TagParser [Block]
+pBlockQuote = do
+ contents <- pInTags "blockquote" block
+ return [BlockQuote contents]
+
+pPlain :: TagParser [Block]
+pPlain = do
+ contents <- liftM (normalizeSpaces . concat) $ many1 inline
+ if null contents
+ then return []
+ else return [Plain contents]
+
+pPara :: TagParser [Block]
+pPara = do
+ contents <- pInTags "p" inline
+ return [Para $ normalizeSpaces contents]
+
+pCodeBlock :: TagParser [Block]
+pCodeBlock = try $ do
+ TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
+ contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
+ let rawText = concatMap fromTagText $ filter isTagText contents
+ -- drop leading newline if any
+ let result' = case rawText of
+ '\n':xs -> xs
+ _ -> rawText
+ -- drop trailing newline if any
+ let result = case reverse result' of
+ '\n':_ -> init result'
+ _ -> result'
+ let attribsId = fromMaybe "" $ lookup "id" attr
+ let attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+ let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+ st <- getState
+ let attribs = if stateStrict st
+ then ("",[],[])
+ else (attribsId, attribsClasses, attribsKV)
+ return [CodeBlock attribs result]
+
+inline :: TagParser [Inline]
+inline = choice
+ [ pTagText
+ , pEmph
+ , pStrong
+ , pSuperscript
+ , pSubscript
+ , pStrikeout
+ , pLineBreak
+ , pLink
+ , pImage
+ , pCode
+ , pRawHtmlInline
+ ]
+
+pLocation :: TagParser ()
+pLocation = do
+ (TagPosition r c) <- pSat isTagPosition
+ setPosition $ newPos "input" r c
+
+pSat :: (Tag String -> Bool) -> TagParser (Tag String)
+pSat f = do
+ pos <- getPosition
+ token show (const pos) (\x -> if f x then Just x else Nothing)
+
+pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
+pSatisfy f = try $ optional pLocation >> pSat f
+
+pAnyTag :: TagParser (Tag String)
+pAnyTag = pSatisfy (const True)
+
+pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool)
+ -> TagParser (Tag String)
+pSelfClosing f g = do
+ open <- pSatisfy (tagOpen f g)
+ optional $ pSatisfy (tagClose f)
+ return open
+
+pEmph :: TagParser [Inline]
+pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
+
+pStrong :: TagParser [Inline]
+pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
+
+pSuperscript :: TagParser [Inline]
+pSuperscript = failIfStrict >> pInlinesInTags "sup" Superscript
+
+pSubscript :: TagParser [Inline]
+pSubscript = failIfStrict >> pInlinesInTags "sub" Subscript
+
+pStrikeout :: TagParser [Inline]
+pStrikeout = do
+ failIfStrict
+ pInlinesInTags "s" Strikeout <|>
+ pInlinesInTags "strike" Strikeout <|>
+ pInlinesInTags "del" Strikeout <|>
+ try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
+ contents <- liftM concat $ manyTill inline (pCloses "span")
+ return [Strikeout contents])
+
+pLineBreak :: TagParser [Inline]
+pLineBreak = do
+ pSelfClosing (=="br") (const True)
+ return [LineBreak]
+
+pLink :: TagParser [Inline]
+pLink = try $ do
+ tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
+ let url = fromAttrib "href" tag
+ let title = fromAttrib "title" tag
+ lab <- liftM concat $ manyTill inline (pCloses "a")
+ return [Link (normalizeSpaces lab) (escapeURI url, title)]
+
+pImage :: TagParser [Inline]
+pImage = do
+ tag <- pSelfClosing (=="img") (isJust . lookup "src")
+ let url = fromAttrib "src" tag
+ let title = fromAttrib "title" tag
+ let alt = fromAttrib "alt" tag
+ return [Image (toList $ text alt) (escapeURI url, title)]
+
+pCode :: TagParser [Inline]
+pCode = try $ do
+ (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
+ result <- manyTill pAnyTag (pCloses open)
+ let ident = fromMaybe "" $ lookup "id" attr
+ let classes = words $ fromMaybe [] $ lookup "class" attr
+ let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr
+ return [Code (ident,classes,rest)
+ $ intercalate " " $ lines $ innerText result]
+
+pRawHtmlInline :: TagParser [Inline]
+pRawHtmlInline = do
+ result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
+ state <- getState
+ if stateParseRaw state
+ then return [RawInline "html" $ renderTags' [result]]
+ else return []
+
+pInlinesInTags :: String -> ([Inline] -> Inline)
+ -> TagParser [Inline]
+pInlinesInTags tagtype f = do
+ contents <- pInTags tagtype inline
+ return [f contents]
+
+pInTags :: String -> TagParser [a]
+ -> TagParser [a]
+pInTags tagtype parser = try $ do
+ pSatisfy (~== TagOpen tagtype [])
+ liftM concat $ manyTill parser (pCloses tagtype <|> eof)
+
+pCloses :: String -> TagParser ()
+pCloses tagtype = try $ do
+ t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
+ case t of
+ (TagClose t') | t' == tagtype -> pAnyTag >> return ()
+ (TagOpen t' _) | t' `closes` tagtype -> return ()
+ (TagClose "ul") | tagtype == "li" -> return ()
+ (TagClose "ol") | tagtype == "li" -> return ()
+ (TagClose "dl") | tagtype == "li" -> return ()
+ _ -> pzero
+
+pTagText :: TagParser [Inline]
+pTagText = try $ do
+ (TagText str) <- pSatisfy isTagText
+ st <- getState
+ case runParser (many pTagContents) st "text" str of
+ Left _ -> fail $ "Could not parse `" ++ str ++ "'"
+ Right result -> return result
+
+pBlank :: TagParser ()
+pBlank = try $ do
+ (TagText str) <- pSatisfy isTagText
+ guard $ all isSpace str
+
+pTagContents :: GenParser Char ParserState Inline
+pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
+
+pStr :: GenParser Char ParserState Inline
+pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c)
+
+isSpecial :: Char -> Bool
+isSpecial '"' = True
+isSpecial '\'' = True
+isSpecial '.' = True
+isSpecial '-' = True
+isSpecial '\8216' = True
+isSpecial '\8217' = True
+isSpecial '\8220' = True
+isSpecial '\8221' = True
+isSpecial _ = False
+
+pSymbol :: GenParser Char ParserState Inline
+pSymbol = satisfy isSpecial >>= return . Str . (:[])
+
+pSpace :: GenParser Char ParserState Inline
+pSpace = many1 (satisfy isSpace) >> return Space
--
-- Constants
--
-eitherBlockOrInline :: [[Char]]
+eitherBlockOrInline :: [String]
eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
"map", "area", "object"]
@@ -76,57 +430,41 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
"textarea", "tt", "u", "var"]
-}
-blockHtmlTags :: [[Char]]
+blockHtmlTags :: [String]
blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
"dl", "fieldset", "form", "h1", "h2", "h3", "h4",
- "h5", "h6", "head", "hr", "html", "isindex", "menu", "noframes",
- "noscript", "ol", "p", "pre", "table", "ul", "dd",
+ "h5", "h6", "head", "hr", "html", "isindex", "menu",
+ "noframes", "noscript", "ol", "p", "pre", "table", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
-sanitaryTags :: [[Char]]
-sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
- "blockquote", "br", "button", "caption", "center",
- "cite", "code", "col", "colgroup", "dd", "del", "dfn",
- "dir", "div", "dl", "dt", "em", "fieldset", "font",
- "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
- "i", "img", "input", "ins", "kbd", "label", "legend",
- "li", "map", "menu", "ol", "optgroup", "option", "p",
- "pre", "q", "s", "samp", "select", "small", "span",
- "strike", "strong", "sub", "sup", "table", "tbody",
- "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
- "u", "ul", "var"]
-
-sanitaryAttributes :: [[Char]]
-sanitaryAttributes = ["abbr", "accept", "accept-charset",
- "accesskey", "action", "align", "alt", "axis",
- "border", "cellpadding", "cellspacing", "char",
- "charoff", "charset", "checked", "cite", "class",
- "clear", "cols", "colspan", "color", "compact",
- "coords", "datetime", "dir", "disabled",
- "enctype", "for", "frame", "headers", "height",
- "href", "hreflang", "hspace", "id", "ismap",
- "label", "lang", "longdesc", "maxlength", "media",
- "method", "multiple", "name", "nohref", "noshade",
- "nowrap", "prompt", "readonly", "rel", "rev",
- "rows", "rowspan", "rules", "scope", "selected",
- "shape", "size", "span", "src", "start",
- "summary", "tabindex", "target", "title", "type",
- "usemap", "valign", "value", "vspace", "width"]
+isInlineTag :: Tag String -> Bool
+isInlineTag t = tagOpen (`notElem` blockHtmlTags) (const True) t ||
+ tagClose (`notElem` blockHtmlTags) t ||
+ tagComment (const True) t
+
+isBlockTag :: Tag String -> Bool
+isBlockTag t = tagOpen (`elem` blocktags) (const True) t ||
+ tagClose (`elem` blocktags) t ||
+ tagComment (const True) t
+ where blocktags = blockHtmlTags ++ eitherBlockOrInline
+
+isTextTag :: Tag String -> Bool
+isTextTag = tagText (const True)
+
+isCommentTag :: Tag String -> Bool
+isCommentTag = tagComment (const True)
-- taken from HXT and extended
closes :: String -> String -> Bool
-"EOF" `closes` _ = True
_ `closes` "body" = False
_ `closes` "html" = False
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
-"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
-"dd" `closes` t | t `elem` ["dt","dd"] = True
"hr" `closes` "p" = True
"p" `closes` "p" = True
"meta" `closes` "meta" = True
@@ -148,627 +486,34 @@ t1 `closes` t2 |
t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True
_ `closes` _ = False
---
--- HTML utility functions
---
-
--- | Returns @True@ if sanitization is specified and the specified tag is
--- not on the sanitized tag list.
-unsanitaryTag :: [Char]
- -> GenParser tok ParserState Bool
-unsanitaryTag tag = do
- st <- getState
- return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
-
--- | returns @True@ if sanitization is specified and the specified attribute
--- is not on the sanitized attribute list.
-unsanitaryAttribute :: ([Char], String, t)
- -> GenParser tok ParserState Bool
-unsanitaryAttribute (attr, val, _) = do
- st <- getState
- return $ stateSanitizeHTML st &&
- (attr `notElem` sanitaryAttributes ||
- (attr `elem` ["href","src"] && unsanitaryURI val))
-
--- | Returns @True@ if the specified URI is potentially a security risk.
-unsanitaryURI :: String -> Bool
-unsanitaryURI u =
- let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
- "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
- "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
- "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
- "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
- "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
- "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
- "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
- "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
- "snews:", "webcal:", "ymsgr:"]
- in case parseURIReference (escapeURI u) of
- Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
- Nothing -> True
-
--- | Read blocks until end tag.
-blocksTilEnd :: String -> GenParser Char ParserState [Block]
-blocksTilEnd tag = do
- blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
- return $ filter (/= Null) blocks
-
--- | Read inlines until end tag.
-inlinesTilEnd :: String -> GenParser Char ParserState [Inline]
-inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
-
--- | Parse blocks between open and close tag.
-blocksIn :: String -> GenParser Char ParserState [Block]
-blocksIn tag = try $ htmlOpenTag tag >> spaces >> blocksTilEnd tag
-
--- | Parse inlines between open and close tag.
-inlinesIn :: String -> GenParser Char ParserState [Inline]
-inlinesIn tag = try $ htmlOpenTag tag >> spaces >> inlinesTilEnd tag
-
--- | Extract type from a tag: e.g. @br@ from @\<br\>@
-extractTagType :: String -> String
-extractTagType ('<':rest) =
- let isSpaceOrSlash c = c `elem` "/ \n\t" in
- map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
-extractTagType _ = ""
-
--- Parse any HTML tag (opening or self-closing) and return tag type
-anyOpener :: GenParser Char ParserState [Char]
-anyOpener = try $ do
- char '<'
- spaces
- tag <- many1 alphaNum
- skipMany htmlAttribute
- spaces
- option "" (string "/")
- spaces
- char '>'
- return $ map toLower tag
-
--- | Parse any HTML tag (opening or self-closing) and return text of tag
-anyHtmlTag :: GenParser Char ParserState [Char]
-anyHtmlTag = try $ do
- char '<'
- spaces
- tag <- many1 alphaNum
- attribs <- many htmlAttribute
- spaces
- ender <- option "" (string "/")
- let ender' = if null ender then "" else " /"
- spaces
- char '>'
- let result = "<" ++ tag ++
- concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
- unsanitary <- unsanitaryTag tag
- if unsanitary
- then return $ "<!-- unsafe HTML removed -->"
- else return result
-
-anyHtmlEndTag :: GenParser Char ParserState [Char]
-anyHtmlEndTag = try $ do
- char '<'
- spaces
- char '/'
- spaces
- tag <- many1 alphaNum
- spaces
- char '>'
- let result = "</" ++ tag ++ ">"
- unsanitary <- unsanitaryTag tag
- if unsanitary
- then return $ "<!-- unsafe HTML removed -->"
- else return result
-
-htmlTag :: Bool
- -> String
- -> GenParser Char ParserState (String, [(String, String)])
-htmlTag selfClosing tag = try $ do
- char '<'
- spaces
- stringAnyCase tag
- attribs <- many htmlAttribute
- spaces
- -- note: we want to handle both HTML and XHTML,
- -- so we don't require the /
- when selfClosing $ optional $ char '/' >> spaces
- char '>'
- return (tag, (map (\(name, content, _) -> (name, content)) attribs))
-
-htmlOpenTag :: String
- -> GenParser Char ParserState (String, [(String, String)])
-htmlOpenTag = htmlTag False
-
-htmlCloseTag :: String
- -> GenParser Char ParserState (String, [(String, String)])
-htmlCloseTag = htmlTag False . ('/':)
-
-htmlSelfClosingTag :: String
- -> GenParser Char ParserState (String, [(String, String)])
-htmlSelfClosingTag = htmlTag True
-
--- parses a quoted html attribute value
-quoted :: Char -> GenParser Char st (String, String)
-quoted quoteChar = do
- result <- between (char quoteChar) (char quoteChar)
- (many (noneOf [quoteChar]))
- return (result, [quoteChar])
-
-nullAttribute :: ([Char], [Char], [Char])
-nullAttribute = ("", "", "")
-
-htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
-htmlAttribute = do
- attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
- unsanitary <- unsanitaryAttribute attr
- if unsanitary
- then return nullAttribute
- else return attr
-
--- minimized boolean attribute
-htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
-htmlMinimizedAttribute = try $ do
- many1 space
- name <- many1 (choice [letter, oneOf ".-_:"])
- return (name, name, name)
-
-htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char])
-htmlRegularAttribute = try $ do
- many1 space
- name <- many1 (choice [letter, oneOf ".-_:"])
- spaces
- char '='
- spaces
- (content, quoteStr) <- choice [ (quoted '\''),
- (quoted '"'),
- (do
- a <- many (noneOf " \t\n\r\"'<>")
- return (a,"")) ]
- return (name, content,
- (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
-
--- | Parse an end tag of type 'tag'
-htmlEndTag :: [Char] -> GenParser Char ParserState [Char]
-htmlEndTag tag = try $ do
- closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $
- anyOpener <|> (eof >> return "EOF")
- if closedByNext
- then return ""
- else do char '<'
- spaces
- char '/'
- spaces
- stringAnyCase tag
- spaces
- char '>'
- return $ "</" ++ tag ++ ">"
-
--- | Returns @True@ if the tag is (or can be) a block tag.
-isBlock :: String -> Bool
-isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline)
-
-anyHtmlBlockTag :: GenParser Char ParserState [Char]
-anyHtmlBlockTag = try $ do
- tag <- anyHtmlTag <|> anyHtmlEndTag
- if isBlock tag then return tag else fail "not a block tag"
-
-anyHtmlInlineTag :: GenParser Char ParserState [Char]
-anyHtmlInlineTag = try $ do
- tag <- anyHtmlTag <|> anyHtmlEndTag
- if not (isBlock tag) then return tag else fail "not an inline tag"
-
--- | Parses material between script tags.
--- Scripts must be treated differently, because they can contain '<>' etc.
-htmlScript :: GenParser Char ParserState [Char]
-htmlScript = try $ do
- lookAhead $ htmlOpenTag "script"
- open <- anyHtmlTag
- rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
- st <- getState
- if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
- then return "<!-- unsafe HTML removed -->"
- else return $ open ++ rest ++ "</script>"
-
-scriptChunk :: GenParser Char ParserState [Char]
-scriptChunk = jsComment <|> jsString <|> jsChars
- where jsComment = jsEndlineComment <|> jsMultilineComment
- jsString = jsSingleQuoteString <|> jsDoubleQuoteString
- jsChars = many1 (noneOf "<\"'*/") <|> count 1 anyChar
- jsEndlineComment = try $ do
- string "//"
- res <- manyTill anyChar newline
- return ("//" ++ res)
- jsMultilineComment = try $ do
- string "/*"
- res <- manyTill anyChar (try $ string "*/")
- return ("/*" ++ res ++ "*/")
- jsSingleQuoteString = stringwith '\''
- jsDoubleQuoteString = stringwith '"'
- charWithEsc escapable = try $
- (try $ char '\\' >> oneOf ('\\':escapable) >>= \x -> return ['\\',x])
- <|> count 1 anyChar
- stringwith c = try $ do
- char c
- res <- liftM concat $ manyTill (charWithEsc [c]) (char c)
- return (c : (res ++ [c]))
-
--- | Parses material between style tags.
--- Style tags must be treated differently, because they can contain CSS
-htmlStyle :: GenParser Char ParserState [Char]
-htmlStyle = try $ do
- lookAhead $ htmlOpenTag "style"
- open <- anyHtmlTag
- rest <- manyTill anyChar (htmlEndTag "style")
- st <- getState
- if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
- then return "<!-- unsafe HTML removed -->"
- else return $ open ++ rest ++ "</style>"
-
-htmlBlockElement :: GenParser Char ParserState [Char]
-htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
-
-rawHtmlBlock :: GenParser Char ParserState Block
-rawHtmlBlock = try $ do
- body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag
- state <- getState
- if stateParseRaw state then return (RawHtml body) else return Null
-
--- This is a block whose contents should be passed through verbatim, not interpreted.
-rawVerbatimBlock :: GenParser Char ParserState [Char]
-rawVerbatimBlock = try $ do
- start <- anyHtmlBlockTag
- let tagtype = extractTagType start
- if tagtype `elem` ["pre"]
- then do
- contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar)
- end <- htmlEndTag tagtype
- return $ start ++ contents ++ end
- else fail "Not a verbatim block"
-
--- We don't want to parse </body> or </html> as raw HTML, since these
--- are handled in parseHtml.
-rawHtmlBlock' :: GenParser Char ParserState Block
-rawHtmlBlock' = do notFollowedBy' (htmlCloseTag "body" <|>
- htmlCloseTag "html")
- rawHtmlBlock
-
--- | Parses an HTML comment.
-htmlComment :: GenParser Char st [Char]
-htmlComment = try $ do
- string "<!--"
- comment <- many $ noneOf "-"
- <|> try (char '-' >>~ notFollowedBy (try (char '-' >> char '>')))
- string "-->"
- return $ "<!--" ++ comment ++ "-->"
-
---
--- parsing documents
---
-
-xmlDec :: GenParser Char st [Char]
-xmlDec = try $ do
- string "<?"
- rest <- manyTill anyChar (char '>')
- return $ "<?" ++ rest ++ ">"
-
-definition :: GenParser Char st [Char]
-definition = try $ do
- string "<!"
- rest <- manyTill anyChar (char '>')
- return $ "<!" ++ rest ++ ">"
-
-nonTitleNonHead :: GenParser Char ParserState Char
-nonTitleNonHead = try $ do
- notFollowedBy $ (htmlOpenTag "title" >> return ' ') <|>
- (htmlEndTag "head" >> return ' ')
- (rawHtmlBlock >> return ' ') <|> anyChar
-
-parseTitle :: GenParser Char ParserState [Inline]
-parseTitle = try $ do
- (tag, _) <- htmlOpenTag "title"
- contents <- inlinesTilEnd tag
- spaces
- return contents
-
--- parse header and return meta-information (for now, just title)
-parseHead :: GenParser Char ParserState Meta
-parseHead = try $ do
- htmlOpenTag "head"
- spaces
- skipMany nonTitleNonHead
- contents <- option [] parseTitle
- skipMany nonTitleNonHead
- htmlEndTag "head"
- return $ Meta contents [] []
-
--- h1 class="title" representation of title in body
-bodyTitle :: GenParser Char ParserState [Inline]
-bodyTitle = try $ do
- (_, attribs) <- htmlOpenTag "h1"
- case (extractAttribute "class" attribs) of
- Just "title" -> return ""
- _ -> fail "not title"
- inlinesTilEnd "h1"
-
-endOfDoc :: GenParser Char ParserState ()
-endOfDoc = try $ do
- spaces
- optional (htmlEndTag "body")
- spaces
- optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html>
- eof
-
-parseHtml :: GenParser Char ParserState Pandoc
-parseHtml = do
- sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
- spaces
- optional $ htmlOpenTag "html"
- spaces
- meta <- option (Meta [] [] []) parseHead
- spaces
- optional $ htmlOpenTag "body"
- spaces
- optional bodyTitle -- skip title in body, because it's represented in meta
- blocks <- parseBlocks
- endOfDoc
- return $ Pandoc meta blocks
-
---
--- parsing blocks
---
-
-parseBlocks :: GenParser Char ParserState [Block]
-parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
-
-block :: GenParser Char ParserState Block
-block = choice [ codeBlock
- , header
- , hrule
- , list
- , blockQuote
- , para
- , plain
- , rawHtmlBlock'
- , notFollowedBy' endOfDoc >> char '<' >> return Null
- ] <?> "block"
-
---
--- header blocks
---
-
-header :: GenParser Char ParserState Block
-header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
-
-headerLevel :: Int -> GenParser Char ParserState Block
-headerLevel n = try $ do
- let level = "h" ++ show n
- htmlOpenTag level
- contents <- inlinesTilEnd level
- return $ Header n (normalizeSpaces contents)
-
---
--- hrule block
---
-
-hrule :: GenParser Char ParserState Block
-hrule = try $ do
- (_, attribs) <- htmlSelfClosingTag "hr"
- state <- getState
- if not (null attribs) && stateParseRaw state
- then unexpected "attributes in hr" -- parse as raw in this case
- else return HorizontalRule
-
---
--- code blocks
---
-
--- Note: HTML tags in code blocks (e.g. for syntax highlighting) are
--- skipped, because they are not portable to output formats other than HTML.
-codeBlock :: GenParser Char ParserState Block
-codeBlock = try $ do
- htmlOpenTag "pre"
- result <- manyTill
- (many1 (satisfy (/= '<')) <|>
- ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
- (htmlEndTag "pre")
- let result' = concat result
- -- drop leading newline if any
- let result'' = if "\n" `isPrefixOf` result'
- then drop 1 result'
- else result'
- -- drop trailing newline if any
- let result''' = if "\n" `isSuffixOf` result''
- then init result''
- else result''
- return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result'''
-
---
--- block quotes
---
-
-blockQuote :: GenParser Char ParserState Block
-blockQuote = try $ htmlOpenTag "blockquote" >> spaces >>
- blocksTilEnd "blockquote" >>= (return . BlockQuote)
-
---
--- list blocks
---
-
-list :: GenParser Char ParserState Block
-list = choice [ bulletList, orderedList, definitionList ] <?> "list"
-
-orderedList :: GenParser Char ParserState Block
-orderedList = try $ do
- (_, attribs) <- htmlOpenTag "ol"
- (start, style) <- option (1, DefaultStyle) $
- do failIfStrict
- let sta = fromMaybe "1" $
- lookup "start" attribs
- let sty = fromMaybe (fromMaybe "" $
- lookup "style" attribs) $
- lookup "class" attribs
- let sty' = case sty of
- "lower-roman" -> LowerRoman
- "upper-roman" -> UpperRoman
- "lower-alpha" -> LowerAlpha
- "upper-alpha" -> UpperAlpha
- "decimal" -> Decimal
- _ -> DefaultStyle
- return (read sta, sty')
- spaces
- -- note: if they have an <ol> or <ul> not in scope of a <li>,
- -- treat it as a list item, though it's not valid xhtml...
- items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces
- htmlEndTag "ol"
- return $ OrderedList (start, style, DefaultDelim) items
-
-bulletList :: GenParser Char ParserState Block
-bulletList = try $ do
- htmlOpenTag "ul"
- spaces
- -- note: if they have an <ol> or <ul> not in scope of a <li>,
- -- treat it as a list item, though it's not valid xhtml...
- items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces
- htmlEndTag "ul"
- return $ BulletList items
-
-definitionList :: GenParser Char ParserState Block
-definitionList = try $ do
- failIfStrict -- def lists not part of standard markdown
- htmlOpenTag "dl"
- spaces
- items <- sepEndBy1 definitionListItem spaces
- htmlEndTag "dl"
- return $ DefinitionList items
-
-definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
-definitionListItem = try $ do
- terms <- sepEndBy1 (inlinesIn "dt") spaces
- defs <- sepEndBy1 (blocksIn "dd") spaces
- let term = intercalate [LineBreak] terms
- return (term, defs)
-
---
--- paragraph block
---
-
-para :: GenParser Char ParserState Block
-para = try $ htmlOpenTag "p" >> inlinesTilEnd "p" >>=
- return . Para . normalizeSpaces
-
---
--- plain block
---
-
-plain :: GenParser Char ParserState Block
-plain = many1 inline >>= return . Plain . normalizeSpaces
-
---
--- inline
---
-
-inline :: GenParser Char ParserState Inline
-inline = choice [ charRef
- , strong
- , emph
- , superscript
- , subscript
- , strikeout
- , spanStrikeout
- , code
- , str
- , linebreak
- , whitespace
- , link
- , image
- , rawHtmlInline
- , char '&' >> return (Str "&") -- common HTML error
- ] <?> "inline"
-
-code :: GenParser Char ParserState Inline
-code = try $ do
- result <- (htmlOpenTag "code" >> manyTill anyChar (htmlEndTag "code"))
- <|> (htmlOpenTag "tt" >> manyTill anyChar (htmlEndTag "tt"))
- -- remove internal line breaks, leading and trailing space,
- -- and decode character references
- return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
- intercalate " " $ lines result
-
-rawHtmlInline :: GenParser Char ParserState Inline
-rawHtmlInline = do
- result <- anyHtmlInlineTag <|> htmlComment
- state <- getState
- if stateParseRaw state then return (HtmlInline result) else return (Str "")
-
-betweenTags :: [Char] -> GenParser Char ParserState [Inline]
-betweenTags tag = try $ htmlOpenTag tag >> inlinesTilEnd tag >>=
- return . normalizeSpaces
-
-emph :: GenParser Char ParserState Inline
-emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph
-
-strong :: GenParser Char ParserState Inline
-strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
-
-superscript :: GenParser Char ParserState Inline
-superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
-
-subscript :: GenParser Char ParserState Inline
-subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
-
-strikeout :: GenParser Char ParserState Inline
-strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
- return . Strikeout
-
-spanStrikeout :: GenParser Char ParserState Inline
-spanStrikeout = try $ do
- failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
- (_, attributes) <- htmlOpenTag "span"
- result <- case (extractAttribute "class" attributes) of
- Just "strikeout" -> inlinesTilEnd "span"
- _ -> fail "not a strikeout"
- return $ Strikeout result
-
-whitespace :: GenParser Char st Inline
-whitespace = many1 space >> return Space
-
--- hard line break
-linebreak :: GenParser Char ParserState Inline
-linebreak = htmlSelfClosingTag "br" >> optional newline >> return LineBreak
-
-str :: GenParser Char st Inline
-str = many1 (noneOf "< \t\n&") >>= return . Str
-
---
--- links and images
---
-
--- extract contents of attribute (attribute names are case-insensitive)
-extractAttribute :: [Char] -> [([Char], String)] -> Maybe String
-extractAttribute _ [] = Nothing
-extractAttribute name ((attrName, contents):rest) =
- let name' = map toLower name
- attrName' = map toLower attrName
- in if attrName' == name'
- then Just (decodeCharacterReferences contents)
- else extractAttribute name rest
-
-link :: GenParser Char ParserState Inline
-link = try $ do
- (_, attributes) <- htmlOpenTag "a"
- url <- case (extractAttribute "href" attributes) of
- Just url -> return url
- Nothing -> fail "no href"
- let title = fromMaybe "" $ extractAttribute "title" attributes
- lab <- inlinesTilEnd "a"
- return $ Link (normalizeSpaces lab) (escapeURI url, title)
-
-image :: GenParser Char ParserState Inline
-image = try $ do
- (_, attributes) <- htmlSelfClosingTag "img"
- url <- case (extractAttribute "src" attributes) of
- Just url -> return url
- Nothing -> fail "no src"
- let title = fromMaybe "" $ extractAttribute "title" attributes
- let alt = fromMaybe "" (extractAttribute "alt" attributes)
- return $ Image [Str alt] (escapeURI url, title)
-
+--- parsers for use in markdown, textile readers
+
+-- | Matches a stretch of HTML in balanced tags.
+htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String
+htmlInBalanced f = try $ do
+ (TagOpen t _, tag) <- htmlTag f
+ guard $ '/' `notElem` tag -- not a self-closing tag
+ let nonTagChunk = many1 $ satisfy (/= '<')
+ let stopper = htmlTag (~== TagClose t)
+ let anytag = liftM snd $ htmlTag (const True)
+ contents <- many $ notFollowedBy' stopper >>
+ (nonTagChunk <|> htmlInBalanced (const True) <|> anytag)
+ endtag <- liftM snd stopper
+ return $ tag ++ concat contents ++ endtag
+
+-- | Matches a tag meeting a certain condition.
+htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String)
+htmlTag f = try $ do
+ lookAhead (char '<')
+ (next : _) <- getInput >>= return . canonicalizeTags . parseTags
+ guard $ f next
+ -- advance the parser
+ case next of
+ TagComment s -> do
+ count (length s + 4) anyChar
+ skipMany (satisfy (/='>'))
+ char '>'
+ return (next, "<!--" ++ s ++ "-->")
+ _ -> do
+ rendered <- manyTill anyChar (char '>')
+ return (next, rendered ++ ">")