diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 1105 |
1 files changed, 425 insertions, 680 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index f47309d3f..18e3113d3 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -27,43 +27,397 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of HTML to 'Pandoc' document. -} -module Text.Pandoc.Readers.HTML ( - readHtml, - rawHtmlInline, - rawHtmlBlock, - anyHtmlBlockTag, - anyHtmlInlineTag, - anyHtmlTag, - anyHtmlEndTag, - htmlEndTag, - extractTagType, - htmlBlockElement, - htmlComment, - unsanitaryURI +module Text.Pandoc.Readers.HTML ( readHtml + , htmlTag + , htmlInBalanced + , isInlineTag + , isBlockTag + , isTextTag + , isCommentTag ) where import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Parsec.Pos +import Text.HTML.TagSoup +import Text.HTML.TagSoup.Match import Text.Pandoc.Definition +import Text.Pandoc.Builder (text, toList) import Text.Pandoc.Shared import Text.Pandoc.Parsing -import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) -import Data.Maybe ( fromMaybe ) -import Data.List ( isPrefixOf, isSuffixOf, intercalate ) -import Data.Char ( toLower, isAlphaNum ) -import Network.URI ( parseURIReference, URI (..) ) -import Control.Monad ( liftM, when ) +import Data.Maybe ( fromMaybe, isJust ) +import Data.List ( intercalate ) +import Data.Char ( isSpace, isDigit ) +import Control.Monad ( liftM, guard ) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc -readHtml = readWith parseHtml +readHtml st inp = Pandoc meta blocks + where blocks = readWith parseBody st rest + tags = canonicalizeTags $ + parseTagsOptions parseOptions{ optTagPosition = True } inp + hasHeader = any (~== TagOpen "head" []) tags + (meta, rest) = if hasHeader + then parseHeader tags + else (Meta [] [] [], tags) + +type TagParser = GenParser (Tag String) ParserState + +-- TODO - fix this - not every header has a title tag +parseHeader :: [Tag String] -> (Meta, [Tag String]) +parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest) + where (tit,_) = break (~== TagClose "title") $ drop 1 $ + dropWhile (\t -> not $ t ~== TagOpen "title" []) tags + tit' = concatMap fromTagText $ filter isTagText tit + tit'' = normalizeSpaces $ toList $ text tit' + rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" || + t ~== TagOpen "body" []) tags + +parseBody :: TagParser [Block] +parseBody = liftM concat $ manyTill block eof + +block :: TagParser [Block] +block = choice + [ pPara + , pHeader + , pBlockQuote + , pCodeBlock + , pList + , pHrule + , pSimpleTable + , pPlain + , pRawHtmlBlock + ] + +renderTags' :: [Tag String] -> String +renderTags' = renderTagsOptions + renderOptions{ optMinimize = (`elem` ["hr","br","img"]) } + +pList :: TagParser [Block] +pList = pBulletList <|> pOrderedList <|> pDefinitionList + +pBulletList :: TagParser [Block] +pBulletList = try $ do + pSatisfy (~== TagOpen "ul" []) + let nonItem = pSatisfy (\t -> + not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && + not (t ~== TagClose "ul")) + -- note: if they have an <ol> or <ul> not in scope of a <li>, + -- treat it as a list item, though it's not valid xhtml... + skipMany nonItem + items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") + return [BulletList items] + +pOrderedList :: TagParser [Block] +pOrderedList = try $ do + TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) + st <- getState + let (start, style) = if stateStrict st + then (1, DefaultStyle) + else (sta', sty') + where sta = fromMaybe "1" $ + lookup "start" attribs + sta' = if all isDigit sta + then read sta + else 1 + sty = fromMaybe (fromMaybe "" $ + lookup "style" attribs) $ + lookup "class" attribs + sty' = case sty of + "lower-roman" -> LowerRoman + "upper-roman" -> UpperRoman + "lower-alpha" -> LowerAlpha + "upper-alpha" -> UpperAlpha + "decimal" -> Decimal + _ -> DefaultStyle + let nonItem = pSatisfy (\t -> + not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && + not (t ~== TagClose "ol")) + -- note: if they have an <ol> or <ul> not in scope of a <li>, + -- treat it as a list item, though it's not valid xhtml... + skipMany nonItem + items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") + return [OrderedList (start, style, DefaultDelim) items] + +pDefinitionList :: TagParser [Block] +pDefinitionList = try $ do + pSatisfy (~== TagOpen "dl" []) + items <- manyTill pDefListItem (pCloses "dl") + return [DefinitionList items] + +pDefListItem :: TagParser ([Inline],[[Block]]) +pDefListItem = try $ do + let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && + not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) + terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) + defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) + skipMany nonItem + let term = intercalate [LineBreak] terms + return (term, defs) + +pRawTag :: TagParser String +pRawTag = do + tag <- pAnyTag + let ignorable x = x `elem` ["html","head","body"] + if tagOpen ignorable (const True) tag || tagClose ignorable tag + then return [] + else return $ renderTags' [tag] + +pRawHtmlBlock :: TagParser [Block] +pRawHtmlBlock = do + raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag + state <- getState + if stateParseRaw state && not (null raw) + then return [RawBlock "html" raw] + else return [] + +pHtmlBlock :: String -> TagParser String +pHtmlBlock t = try $ do + open <- pSatisfy (~== TagOpen t []) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) + return $ renderTags' $ [open] ++ contents ++ [TagClose t] + +pHeader :: TagParser [Block] +pHeader = try $ do + TagOpen tagtype attr <- pSatisfy $ + tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) + (const True) + let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] + let level = read (drop 1 tagtype) + contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof) + return $ if bodyTitle + then [] -- skip a representation of the title in the body + else [Header level $ normalizeSpaces contents] + +pHrule :: TagParser [Block] +pHrule = do + pSelfClosing (=="hr") (const True) + return [HorizontalRule] + +pSimpleTable :: TagParser [Block] +pSimpleTable = try $ do + TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) + skipMany pBlank + head' <- option [] $ pInTags "th" pTd + rows <- many1 $ try $ + skipMany pBlank >> pInTags "tr" pTd + skipMany pBlank + TagClose _ <- pSatisfy (~== TagClose "table") + let cols = maximum $ map length rows + let aligns = replicate cols AlignLeft + let widths = replicate cols 0 + return [Table [] aligns widths head' rows] + +pTd :: TagParser [TableCell] +pTd = try $ do + skipMany pBlank + res <- pInTags "td" pPlain + skipMany pBlank + return [res] + +pBlockQuote :: TagParser [Block] +pBlockQuote = do + contents <- pInTags "blockquote" block + return [BlockQuote contents] + +pPlain :: TagParser [Block] +pPlain = do + contents <- liftM (normalizeSpaces . concat) $ many1 inline + if null contents + then return [] + else return [Plain contents] + +pPara :: TagParser [Block] +pPara = do + contents <- pInTags "p" inline + return [Para $ normalizeSpaces contents] + +pCodeBlock :: TagParser [Block] +pCodeBlock = try $ do + TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) + contents <- manyTill pAnyTag (pCloses "pre" <|> eof) + let rawText = concatMap fromTagText $ filter isTagText contents + -- drop leading newline if any + let result' = case rawText of + '\n':xs -> xs + _ -> rawText + -- drop trailing newline if any + let result = case reverse result' of + '\n':_ -> init result' + _ -> result' + let attribsId = fromMaybe "" $ lookup "id" attr + let attribsClasses = words $ fromMaybe "" $ lookup "class" attr + let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + st <- getState + let attribs = if stateStrict st + then ("",[],[]) + else (attribsId, attribsClasses, attribsKV) + return [CodeBlock attribs result] + +inline :: TagParser [Inline] +inline = choice + [ pTagText + , pEmph + , pStrong + , pSuperscript + , pSubscript + , pStrikeout + , pLineBreak + , pLink + , pImage + , pCode + , pRawHtmlInline + ] + +pLocation :: TagParser () +pLocation = do + (TagPosition r c) <- pSat isTagPosition + setPosition $ newPos "input" r c + +pSat :: (Tag String -> Bool) -> TagParser (Tag String) +pSat f = do + pos <- getPosition + token show (const pos) (\x -> if f x then Just x else Nothing) + +pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String) +pSatisfy f = try $ optional pLocation >> pSat f + +pAnyTag :: TagParser (Tag String) +pAnyTag = pSatisfy (const True) + +pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool) + -> TagParser (Tag String) +pSelfClosing f g = do + open <- pSatisfy (tagOpen f g) + optional $ pSatisfy (tagClose f) + return open + +pEmph :: TagParser [Inline] +pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph + +pStrong :: TagParser [Inline] +pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong + +pSuperscript :: TagParser [Inline] +pSuperscript = failIfStrict >> pInlinesInTags "sup" Superscript + +pSubscript :: TagParser [Inline] +pSubscript = failIfStrict >> pInlinesInTags "sub" Subscript + +pStrikeout :: TagParser [Inline] +pStrikeout = do + failIfStrict + pInlinesInTags "s" Strikeout <|> + pInlinesInTags "strike" Strikeout <|> + pInlinesInTags "del" Strikeout <|> + try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) + contents <- liftM concat $ manyTill inline (pCloses "span") + return [Strikeout contents]) + +pLineBreak :: TagParser [Inline] +pLineBreak = do + pSelfClosing (=="br") (const True) + return [LineBreak] + +pLink :: TagParser [Inline] +pLink = try $ do + tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) + let url = fromAttrib "href" tag + let title = fromAttrib "title" tag + lab <- liftM concat $ manyTill inline (pCloses "a") + return [Link (normalizeSpaces lab) (escapeURI url, title)] + +pImage :: TagParser [Inline] +pImage = do + tag <- pSelfClosing (=="img") (isJust . lookup "src") + let url = fromAttrib "src" tag + let title = fromAttrib "title" tag + let alt = fromAttrib "alt" tag + return [Image (toList $ text alt) (escapeURI url, title)] + +pCode :: TagParser [Inline] +pCode = try $ do + (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + result <- manyTill pAnyTag (pCloses open) + let ident = fromMaybe "" $ lookup "id" attr + let classes = words $ fromMaybe [] $ lookup "class" attr + let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr + return [Code (ident,classes,rest) + $ intercalate " " $ lines $ innerText result] + +pRawHtmlInline :: TagParser [Inline] +pRawHtmlInline = do + result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag + state <- getState + if stateParseRaw state + then return [RawInline "html" $ renderTags' [result]] + else return [] + +pInlinesInTags :: String -> ([Inline] -> Inline) + -> TagParser [Inline] +pInlinesInTags tagtype f = do + contents <- pInTags tagtype inline + return [f contents] + +pInTags :: String -> TagParser [a] + -> TagParser [a] +pInTags tagtype parser = try $ do + pSatisfy (~== TagOpen tagtype []) + liftM concat $ manyTill parser (pCloses tagtype <|> eof) + +pCloses :: String -> TagParser () +pCloses tagtype = try $ do + t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag + case t of + (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagOpen t' _) | t' `closes` tagtype -> return () + (TagClose "ul") | tagtype == "li" -> return () + (TagClose "ol") | tagtype == "li" -> return () + (TagClose "dl") | tagtype == "li" -> return () + _ -> pzero + +pTagText :: TagParser [Inline] +pTagText = try $ do + (TagText str) <- pSatisfy isTagText + st <- getState + case runParser (many pTagContents) st "text" str of + Left _ -> fail $ "Could not parse `" ++ str ++ "'" + Right result -> return result + +pBlank :: TagParser () +pBlank = try $ do + (TagText str) <- pSatisfy isTagText + guard $ all isSpace str + +pTagContents :: GenParser Char ParserState Inline +pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol + +pStr :: GenParser Char ParserState Inline +pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) + +isSpecial :: Char -> Bool +isSpecial '"' = True +isSpecial '\'' = True +isSpecial '.' = True +isSpecial '-' = True +isSpecial '\8216' = True +isSpecial '\8217' = True +isSpecial '\8220' = True +isSpecial '\8221' = True +isSpecial _ = False + +pSymbol :: GenParser Char ParserState Inline +pSymbol = satisfy isSpecial >>= return . Str . (:[]) + +pSpace :: GenParser Char ParserState Inline +pSpace = many1 (satisfy isSpace) >> return Space -- -- Constants -- -eitherBlockOrInline :: [[Char]] +eitherBlockOrInline :: [String] eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", "map", "area", "object"] @@ -76,57 +430,41 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "textarea", "tt", "u", "var"] -} -blockHtmlTags :: [[Char]] +blockHtmlTags :: [String] blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", "dl", "fieldset", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "hr", "html", "isindex", "menu", "noframes", - "noscript", "ol", "p", "pre", "table", "ul", "dd", + "h5", "h6", "head", "hr", "html", "isindex", "menu", + "noframes", "noscript", "ol", "p", "pre", "table", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script", "style"] -sanitaryTags :: [[Char]] -sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", - "blockquote", "br", "button", "caption", "center", - "cite", "code", "col", "colgroup", "dd", "del", "dfn", - "dir", "div", "dl", "dt", "em", "fieldset", "font", - "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", - "i", "img", "input", "ins", "kbd", "label", "legend", - "li", "map", "menu", "ol", "optgroup", "option", "p", - "pre", "q", "s", "samp", "select", "small", "span", - "strike", "strong", "sub", "sup", "table", "tbody", - "td", "textarea", "tfoot", "th", "thead", "tr", "tt", - "u", "ul", "var"] - -sanitaryAttributes :: [[Char]] -sanitaryAttributes = ["abbr", "accept", "accept-charset", - "accesskey", "action", "align", "alt", "axis", - "border", "cellpadding", "cellspacing", "char", - "charoff", "charset", "checked", "cite", "class", - "clear", "cols", "colspan", "color", "compact", - "coords", "datetime", "dir", "disabled", - "enctype", "for", "frame", "headers", "height", - "href", "hreflang", "hspace", "id", "ismap", - "label", "lang", "longdesc", "maxlength", "media", - "method", "multiple", "name", "nohref", "noshade", - "nowrap", "prompt", "readonly", "rel", "rev", - "rows", "rowspan", "rules", "scope", "selected", - "shape", "size", "span", "src", "start", - "summary", "tabindex", "target", "title", "type", - "usemap", "valign", "value", "vspace", "width"] +isInlineTag :: Tag String -> Bool +isInlineTag t = tagOpen (`notElem` blockHtmlTags) (const True) t || + tagClose (`notElem` blockHtmlTags) t || + tagComment (const True) t + +isBlockTag :: Tag String -> Bool +isBlockTag t = tagOpen (`elem` blocktags) (const True) t || + tagClose (`elem` blocktags) t || + tagComment (const True) t + where blocktags = blockHtmlTags ++ eitherBlockOrInline + +isTextTag :: Tag String -> Bool +isTextTag = tagText (const True) + +isCommentTag :: Tag String -> Bool +isCommentTag = tagComment (const True) -- taken from HXT and extended closes :: String -> String -> Bool -"EOF" `closes` _ = True _ `closes` "body" = False _ `closes` "html" = False "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True -"td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True -"dd" `closes` t | t `elem` ["dt","dd"] = True "hr" `closes` "p" = True "p" `closes` "p" = True "meta" `closes` "meta" = True @@ -148,627 +486,34 @@ t1 `closes` t2 | t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True _ `closes` _ = False --- --- HTML utility functions --- - --- | Returns @True@ if sanitization is specified and the specified tag is --- not on the sanitized tag list. -unsanitaryTag :: [Char] - -> GenParser tok ParserState Bool -unsanitaryTag tag = do - st <- getState - return $ stateSanitizeHTML st && tag `notElem` sanitaryTags - --- | returns @True@ if sanitization is specified and the specified attribute --- is not on the sanitized attribute list. -unsanitaryAttribute :: ([Char], String, t) - -> GenParser tok ParserState Bool -unsanitaryAttribute (attr, val, _) = do - st <- getState - return $ stateSanitizeHTML st && - (attr `notElem` sanitaryAttributes || - (attr `elem` ["href","src"] && unsanitaryURI val)) - --- | Returns @True@ if the specified URI is potentially a security risk. -unsanitaryURI :: String -> Bool -unsanitaryURI u = - let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:", - "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:", - "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:", - "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:", - "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:", - "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:", - "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:", - "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", - "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", - "snews:", "webcal:", "ymsgr:"] - in case parseURIReference (escapeURI u) of - Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes - Nothing -> True - --- | Read blocks until end tag. -blocksTilEnd :: String -> GenParser Char ParserState [Block] -blocksTilEnd tag = do - blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) - return $ filter (/= Null) blocks - --- | Read inlines until end tag. -inlinesTilEnd :: String -> GenParser Char ParserState [Inline] -inlinesTilEnd tag = manyTill inline (htmlEndTag tag) - --- | Parse blocks between open and close tag. -blocksIn :: String -> GenParser Char ParserState [Block] -blocksIn tag = try $ htmlOpenTag tag >> spaces >> blocksTilEnd tag - --- | Parse inlines between open and close tag. -inlinesIn :: String -> GenParser Char ParserState [Inline] -inlinesIn tag = try $ htmlOpenTag tag >> spaces >> inlinesTilEnd tag - --- | Extract type from a tag: e.g. @br@ from @\<br\>@ -extractTagType :: String -> String -extractTagType ('<':rest) = - let isSpaceOrSlash c = c `elem` "/ \n\t" in - map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest -extractTagType _ = "" - --- Parse any HTML tag (opening or self-closing) and return tag type -anyOpener :: GenParser Char ParserState [Char] -anyOpener = try $ do - char '<' - spaces - tag <- many1 alphaNum - skipMany htmlAttribute - spaces - option "" (string "/") - spaces - char '>' - return $ map toLower tag - --- | Parse any HTML tag (opening or self-closing) and return text of tag -anyHtmlTag :: GenParser Char ParserState [Char] -anyHtmlTag = try $ do - char '<' - spaces - tag <- many1 alphaNum - attribs <- many htmlAttribute - spaces - ender <- option "" (string "/") - let ender' = if null ender then "" else " /" - spaces - char '>' - let result = "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -anyHtmlEndTag :: GenParser Char ParserState [Char] -anyHtmlEndTag = try $ do - char '<' - spaces - char '/' - spaces - tag <- many1 alphaNum - spaces - char '>' - let result = "</" ++ tag ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -htmlTag :: Bool - -> String - -> GenParser Char ParserState (String, [(String, String)]) -htmlTag selfClosing tag = try $ do - char '<' - spaces - stringAnyCase tag - attribs <- many htmlAttribute - spaces - -- note: we want to handle both HTML and XHTML, - -- so we don't require the / - when selfClosing $ optional $ char '/' >> spaces - char '>' - return (tag, (map (\(name, content, _) -> (name, content)) attribs)) - -htmlOpenTag :: String - -> GenParser Char ParserState (String, [(String, String)]) -htmlOpenTag = htmlTag False - -htmlCloseTag :: String - -> GenParser Char ParserState (String, [(String, String)]) -htmlCloseTag = htmlTag False . ('/':) - -htmlSelfClosingTag :: String - -> GenParser Char ParserState (String, [(String, String)]) -htmlSelfClosingTag = htmlTag True - --- parses a quoted html attribute value -quoted :: Char -> GenParser Char st (String, String) -quoted quoteChar = do - result <- between (char quoteChar) (char quoteChar) - (many (noneOf [quoteChar])) - return (result, [quoteChar]) - -nullAttribute :: ([Char], [Char], [Char]) -nullAttribute = ("", "", "") - -htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char]) -htmlAttribute = do - attr <- htmlRegularAttribute <|> htmlMinimizedAttribute - unsanitary <- unsanitaryAttribute attr - if unsanitary - then return nullAttribute - else return attr - --- minimized boolean attribute -htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlMinimizedAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - return (name, name, name) - -htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlRegularAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - spaces - char '=' - spaces - (content, quoteStr) <- choice [ (quoted '\''), - (quoted '"'), - (do - a <- many (noneOf " \t\n\r\"'<>") - return (a,"")) ] - return (name, content, - (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) - --- | Parse an end tag of type 'tag' -htmlEndTag :: [Char] -> GenParser Char ParserState [Char] -htmlEndTag tag = try $ do - closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $ - anyOpener <|> (eof >> return "EOF") - if closedByNext - then return "" - else do char '<' - spaces - char '/' - spaces - stringAnyCase tag - spaces - char '>' - return $ "</" ++ tag ++ ">" - --- | Returns @True@ if the tag is (or can be) a block tag. -isBlock :: String -> Bool -isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline) - -anyHtmlBlockTag :: GenParser Char ParserState [Char] -anyHtmlBlockTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "not a block tag" - -anyHtmlInlineTag :: GenParser Char ParserState [Char] -anyHtmlInlineTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if not (isBlock tag) then return tag else fail "not an inline tag" - --- | Parses material between script tags. --- Scripts must be treated differently, because they can contain '<>' etc. -htmlScript :: GenParser Char ParserState [Char] -htmlScript = try $ do - lookAhead $ htmlOpenTag "script" - open <- anyHtmlTag - rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script") - st <- getState - if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</script>" - -scriptChunk :: GenParser Char ParserState [Char] -scriptChunk = jsComment <|> jsString <|> jsChars - where jsComment = jsEndlineComment <|> jsMultilineComment - jsString = jsSingleQuoteString <|> jsDoubleQuoteString - jsChars = many1 (noneOf "<\"'*/") <|> count 1 anyChar - jsEndlineComment = try $ do - string "//" - res <- manyTill anyChar newline - return ("//" ++ res) - jsMultilineComment = try $ do - string "/*" - res <- manyTill anyChar (try $ string "*/") - return ("/*" ++ res ++ "*/") - jsSingleQuoteString = stringwith '\'' - jsDoubleQuoteString = stringwith '"' - charWithEsc escapable = try $ - (try $ char '\\' >> oneOf ('\\':escapable) >>= \x -> return ['\\',x]) - <|> count 1 anyChar - stringwith c = try $ do - char c - res <- liftM concat $ manyTill (charWithEsc [c]) (char c) - return (c : (res ++ [c])) - --- | Parses material between style tags. --- Style tags must be treated differently, because they can contain CSS -htmlStyle :: GenParser Char ParserState [Char] -htmlStyle = try $ do - lookAhead $ htmlOpenTag "style" - open <- anyHtmlTag - rest <- manyTill anyChar (htmlEndTag "style") - st <- getState - if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</style>" - -htmlBlockElement :: GenParser Char ParserState [Char] -htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] - -rawHtmlBlock :: GenParser Char ParserState Block -rawHtmlBlock = try $ do - body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag - state <- getState - if stateParseRaw state then return (RawHtml body) else return Null - --- This is a block whose contents should be passed through verbatim, not interpreted. -rawVerbatimBlock :: GenParser Char ParserState [Char] -rawVerbatimBlock = try $ do - start <- anyHtmlBlockTag - let tagtype = extractTagType start - if tagtype `elem` ["pre"] - then do - contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar) - end <- htmlEndTag tagtype - return $ start ++ contents ++ end - else fail "Not a verbatim block" - --- We don't want to parse </body> or </html> as raw HTML, since these --- are handled in parseHtml. -rawHtmlBlock' :: GenParser Char ParserState Block -rawHtmlBlock' = do notFollowedBy' (htmlCloseTag "body" <|> - htmlCloseTag "html") - rawHtmlBlock - --- | Parses an HTML comment. -htmlComment :: GenParser Char st [Char] -htmlComment = try $ do - string "<!--" - comment <- many $ noneOf "-" - <|> try (char '-' >>~ notFollowedBy (try (char '-' >> char '>'))) - string "-->" - return $ "<!--" ++ comment ++ "-->" - --- --- parsing documents --- - -xmlDec :: GenParser Char st [Char] -xmlDec = try $ do - string "<?" - rest <- manyTill anyChar (char '>') - return $ "<?" ++ rest ++ ">" - -definition :: GenParser Char st [Char] -definition = try $ do - string "<!" - rest <- manyTill anyChar (char '>') - return $ "<!" ++ rest ++ ">" - -nonTitleNonHead :: GenParser Char ParserState Char -nonTitleNonHead = try $ do - notFollowedBy $ (htmlOpenTag "title" >> return ' ') <|> - (htmlEndTag "head" >> return ' ') - (rawHtmlBlock >> return ' ') <|> anyChar - -parseTitle :: GenParser Char ParserState [Inline] -parseTitle = try $ do - (tag, _) <- htmlOpenTag "title" - contents <- inlinesTilEnd tag - spaces - return contents - --- parse header and return meta-information (for now, just title) -parseHead :: GenParser Char ParserState Meta -parseHead = try $ do - htmlOpenTag "head" - spaces - skipMany nonTitleNonHead - contents <- option [] parseTitle - skipMany nonTitleNonHead - htmlEndTag "head" - return $ Meta contents [] [] - --- h1 class="title" representation of title in body -bodyTitle :: GenParser Char ParserState [Inline] -bodyTitle = try $ do - (_, attribs) <- htmlOpenTag "h1" - case (extractAttribute "class" attribs) of - Just "title" -> return "" - _ -> fail "not title" - inlinesTilEnd "h1" - -endOfDoc :: GenParser Char ParserState () -endOfDoc = try $ do - spaces - optional (htmlEndTag "body") - spaces - optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html> - eof - -parseHtml :: GenParser Char ParserState Pandoc -parseHtml = do - sepEndBy (choice [xmlDec, definition, htmlComment]) spaces - spaces - optional $ htmlOpenTag "html" - spaces - meta <- option (Meta [] [] []) parseHead - spaces - optional $ htmlOpenTag "body" - spaces - optional bodyTitle -- skip title in body, because it's represented in meta - blocks <- parseBlocks - endOfDoc - return $ Pandoc meta blocks - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) - -block :: GenParser Char ParserState Block -block = choice [ codeBlock - , header - , hrule - , list - , blockQuote - , para - , plain - , rawHtmlBlock' - , notFollowedBy' endOfDoc >> char '<' >> return Null - ] <?> "block" - --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" - -headerLevel :: Int -> GenParser Char ParserState Block -headerLevel n = try $ do - let level = "h" ++ show n - htmlOpenTag level - contents <- inlinesTilEnd level - return $ Header n (normalizeSpaces contents) - --- --- hrule block --- - -hrule :: GenParser Char ParserState Block -hrule = try $ do - (_, attribs) <- htmlSelfClosingTag "hr" - state <- getState - if not (null attribs) && stateParseRaw state - then unexpected "attributes in hr" -- parse as raw in this case - else return HorizontalRule - --- --- code blocks --- - --- Note: HTML tags in code blocks (e.g. for syntax highlighting) are --- skipped, because they are not portable to output formats other than HTML. -codeBlock :: GenParser Char ParserState Block -codeBlock = try $ do - htmlOpenTag "pre" - result <- manyTill - (many1 (satisfy (/= '<')) <|> - ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) - (htmlEndTag "pre") - let result' = concat result - -- drop leading newline if any - let result'' = if "\n" `isPrefixOf` result' - then drop 1 result' - else result' - -- drop trailing newline if any - let result''' = if "\n" `isSuffixOf` result'' - then init result'' - else result'' - return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result''' - --- --- block quotes --- - -blockQuote :: GenParser Char ParserState Block -blockQuote = try $ htmlOpenTag "blockquote" >> spaces >> - blocksTilEnd "blockquote" >>= (return . BlockQuote) - --- --- list blocks --- - -list :: GenParser Char ParserState Block -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - (_, attribs) <- htmlOpenTag "ol" - (start, style) <- option (1, DefaultStyle) $ - do failIfStrict - let sta = fromMaybe "1" $ - lookup "start" attribs - let sty = fromMaybe (fromMaybe "" $ - lookup "style" attribs) $ - lookup "class" attribs - let sty' = case sty of - "lower-roman" -> LowerRoman - "upper-roman" -> UpperRoman - "lower-alpha" -> LowerAlpha - "upper-alpha" -> UpperAlpha - "decimal" -> Decimal - _ -> DefaultStyle - return (read sta, sty') - spaces - -- note: if they have an <ol> or <ul> not in scope of a <li>, - -- treat it as a list item, though it's not valid xhtml... - items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces - htmlEndTag "ol" - return $ OrderedList (start, style, DefaultDelim) items - -bulletList :: GenParser Char ParserState Block -bulletList = try $ do - htmlOpenTag "ul" - spaces - -- note: if they have an <ol> or <ul> not in scope of a <li>, - -- treat it as a list item, though it's not valid xhtml... - items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces - htmlEndTag "ul" - return $ BulletList items - -definitionList :: GenParser Char ParserState Block -definitionList = try $ do - failIfStrict -- def lists not part of standard markdown - htmlOpenTag "dl" - spaces - items <- sepEndBy1 definitionListItem spaces - htmlEndTag "dl" - return $ DefinitionList items - -definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) -definitionListItem = try $ do - terms <- sepEndBy1 (inlinesIn "dt") spaces - defs <- sepEndBy1 (blocksIn "dd") spaces - let term = intercalate [LineBreak] terms - return (term, defs) - --- --- paragraph block --- - -para :: GenParser Char ParserState Block -para = try $ htmlOpenTag "p" >> inlinesTilEnd "p" >>= - return . Para . normalizeSpaces - --- --- plain block --- - -plain :: GenParser Char ParserState Block -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- inline --- - -inline :: GenParser Char ParserState Inline -inline = choice [ charRef - , strong - , emph - , superscript - , subscript - , strikeout - , spanStrikeout - , code - , str - , linebreak - , whitespace - , link - , image - , rawHtmlInline - , char '&' >> return (Str "&") -- common HTML error - ] <?> "inline" - -code :: GenParser Char ParserState Inline -code = try $ do - result <- (htmlOpenTag "code" >> manyTill anyChar (htmlEndTag "code")) - <|> (htmlOpenTag "tt" >> manyTill anyChar (htmlEndTag "tt")) - -- remove internal line breaks, leading and trailing space, - -- and decode character references - return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ - intercalate " " $ lines result - -rawHtmlInline :: GenParser Char ParserState Inline -rawHtmlInline = do - result <- anyHtmlInlineTag <|> htmlComment - state <- getState - if stateParseRaw state then return (HtmlInline result) else return (Str "") - -betweenTags :: [Char] -> GenParser Char ParserState [Inline] -betweenTags tag = try $ htmlOpenTag tag >> inlinesTilEnd tag >>= - return . normalizeSpaces - -emph :: GenParser Char ParserState Inline -emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph - -strong :: GenParser Char ParserState Inline -strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong - -superscript :: GenParser Char ParserState Inline -superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript - -subscript :: GenParser Char ParserState Inline -subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript - -strikeout :: GenParser Char ParserState Inline -strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= - return . Strikeout - -spanStrikeout :: GenParser Char ParserState Inline -spanStrikeout = try $ do - failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - (_, attributes) <- htmlOpenTag "span" - result <- case (extractAttribute "class" attributes) of - Just "strikeout" -> inlinesTilEnd "span" - _ -> fail "not a strikeout" - return $ Strikeout result - -whitespace :: GenParser Char st Inline -whitespace = many1 space >> return Space - --- hard line break -linebreak :: GenParser Char ParserState Inline -linebreak = htmlSelfClosingTag "br" >> optional newline >> return LineBreak - -str :: GenParser Char st Inline -str = many1 (noneOf "< \t\n&") >>= return . Str - --- --- links and images --- - --- extract contents of attribute (attribute names are case-insensitive) -extractAttribute :: [Char] -> [([Char], String)] -> Maybe String -extractAttribute _ [] = Nothing -extractAttribute name ((attrName, contents):rest) = - let name' = map toLower name - attrName' = map toLower attrName - in if attrName' == name' - then Just (decodeCharacterReferences contents) - else extractAttribute name rest - -link :: GenParser Char ParserState Inline -link = try $ do - (_, attributes) <- htmlOpenTag "a" - url <- case (extractAttribute "href" attributes) of - Just url -> return url - Nothing -> fail "no href" - let title = fromMaybe "" $ extractAttribute "title" attributes - lab <- inlinesTilEnd "a" - return $ Link (normalizeSpaces lab) (escapeURI url, title) - -image :: GenParser Char ParserState Inline -image = try $ do - (_, attributes) <- htmlSelfClosingTag "img" - url <- case (extractAttribute "src" attributes) of - Just url -> return url - Nothing -> fail "no src" - let title = fromMaybe "" $ extractAttribute "title" attributes - let alt = fromMaybe "" (extractAttribute "alt" attributes) - return $ Image [Str alt] (escapeURI url, title) - +--- parsers for use in markdown, textile readers + +-- | Matches a stretch of HTML in balanced tags. +htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String +htmlInBalanced f = try $ do + (TagOpen t _, tag) <- htmlTag f + guard $ '/' `notElem` tag -- not a self-closing tag + let nonTagChunk = many1 $ satisfy (/= '<') + let stopper = htmlTag (~== TagClose t) + let anytag = liftM snd $ htmlTag (const True) + contents <- many $ notFollowedBy' stopper >> + (nonTagChunk <|> htmlInBalanced (const True) <|> anytag) + endtag <- liftM snd stopper + return $ tag ++ concat contents ++ endtag + +-- | Matches a tag meeting a certain condition. +htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String) +htmlTag f = try $ do + lookAhead (char '<') + (next : _) <- getInput >>= return . canonicalizeTags . parseTags + guard $ f next + -- advance the parser + case next of + TagComment s -> do + count (length s + 4) anyChar + skipMany (satisfy (/='>')) + char '>' + return (next, "<!--" ++ s ++ "-->") + _ -> do + rendered <- manyTill anyChar (char '>') + return (next, rendered ++ ">") |