summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2009-01-24 20:00:26 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2009-01-24 20:00:26 +0000
commit42aca57dee8d88afa5fac512aeb1198102908865 (patch)
tree1c6a98bd226f4fffde6768010715bc1d80e5d168 /src/Text/Pandoc/Readers/HTML.hs
parent39e8d8486693029abfef84c45e85416f7c775280 (diff)
Moved all haskell source to src subdirectory.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1528 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs675
1 files changed, 675 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
new file mode 100644
index 000000000..65e512b5e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -0,0 +1,675 @@
+{-
+Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.HTML
+ Copyright : Copyright (C) 2006-8 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of HTML to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.HTML (
+ readHtml,
+ rawHtmlInline,
+ rawHtmlBlock,
+ anyHtmlBlockTag,
+ anyHtmlInlineTag,
+ anyHtmlTag,
+ anyHtmlEndTag,
+ htmlEndTag,
+ extractTagType,
+ htmlBlockElement,
+ unsanitaryURI
+ ) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
+import Data.Maybe ( fromMaybe )
+import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf, intercalate )
+import Data.Char ( toLower, isAlphaNum )
+import Network.URI ( parseURIReference, URI (..) )
+
+-- | Convert HTML-formatted string to 'Pandoc' document.
+readHtml :: ParserState -- ^ Parser state
+ -> String -- ^ String to parse
+ -> Pandoc
+readHtml = readWith parseHtml
+
+--
+-- Constants
+--
+
+eitherBlockOrInline :: [[Char]]
+eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
+ "map", "area", "object"]
+
+{-
+inlineHtmlTags :: [[Char]]
+inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
+ "br", "cite", "code", "dfn", "em", "font", "i", "img",
+ "input", "kbd", "label", "q", "s", "samp", "select",
+ "small", "span", "strike", "strong", "sub", "sup",
+ "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
+-}
+
+blockHtmlTags :: [[Char]]
+blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
+ "dl", "fieldset", "form", "h1", "h2", "h3", "h4",
+ "h5", "h6", "hr", "html", "isindex", "menu", "noframes",
+ "noscript", "ol", "p", "pre", "table", "ul", "dd",
+ "dt", "frameset", "li", "tbody", "td", "tfoot",
+ "th", "thead", "tr", "script"] ++ eitherBlockOrInline
+
+sanitaryTags :: [[Char]]
+sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
+ "blockquote", "br", "button", "caption", "center",
+ "cite", "code", "col", "colgroup", "dd", "del", "dfn",
+ "dir", "div", "dl", "dt", "em", "fieldset", "font",
+ "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
+ "i", "img", "input", "ins", "kbd", "label", "legend",
+ "li", "map", "menu", "ol", "optgroup", "option", "p",
+ "pre", "q", "s", "samp", "select", "small", "span",
+ "strike", "strong", "sub", "sup", "table", "tbody",
+ "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
+ "u", "ul", "var"]
+
+sanitaryAttributes :: [[Char]]
+sanitaryAttributes = ["abbr", "accept", "accept-charset",
+ "accesskey", "action", "align", "alt", "axis",
+ "border", "cellpadding", "cellspacing", "char",
+ "charoff", "charset", "checked", "cite", "class",
+ "clear", "cols", "colspan", "color", "compact",
+ "coords", "datetime", "dir", "disabled",
+ "enctype", "for", "frame", "headers", "height",
+ "href", "hreflang", "hspace", "id", "ismap",
+ "label", "lang", "longdesc", "maxlength", "media",
+ "method", "multiple", "name", "nohref", "noshade",
+ "nowrap", "prompt", "readonly", "rel", "rev",
+ "rows", "rowspan", "rules", "scope", "selected",
+ "shape", "size", "span", "src", "start",
+ "summary", "tabindex", "target", "title", "type",
+ "usemap", "valign", "value", "vspace", "width"]
+
+--
+-- HTML utility functions
+--
+
+-- | Returns @True@ if sanitization is specified and the specified tag is
+-- not on the sanitized tag list.
+unsanitaryTag :: [Char]
+ -> GenParser tok ParserState Bool
+unsanitaryTag tag = do
+ st <- getState
+ return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
+
+-- | returns @True@ if sanitization is specified and the specified attribute
+-- is not on the sanitized attribute list.
+unsanitaryAttribute :: ([Char], String, t)
+ -> GenParser tok ParserState Bool
+unsanitaryAttribute (attr, val, _) = do
+ st <- getState
+ return $ stateSanitizeHTML st &&
+ (attr `notElem` sanitaryAttributes ||
+ (attr `elem` ["href","src"] && unsanitaryURI val))
+
+-- | Returns @True@ if the specified URI is potentially a security risk.
+unsanitaryURI :: String -> Bool
+unsanitaryURI u =
+ let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
+ "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
+ "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
+ "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
+ "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
+ "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
+ "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
+ "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
+ "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
+ "snews:", "webcal:", "ymsgr:"]
+ in case parseURIReference u of
+ Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
+ Nothing -> True
+
+-- | Read blocks until end tag.
+blocksTilEnd :: String -> GenParser Char ParserState [Block]
+blocksTilEnd tag = do
+ blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
+ return $ filter (/= Null) blocks
+
+-- | Read inlines until end tag.
+inlinesTilEnd :: String -> GenParser Char ParserState [Inline]
+inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
+
+-- | Parse blocks between open and close tag.
+blocksIn :: String -> GenParser Char ParserState [Block]
+blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
+
+-- | Parse inlines between open and close tag.
+inlinesIn :: String -> GenParser Char ParserState [Inline]
+inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
+
+-- | Extract type from a tag: e.g. @br@ from @\<br\>@
+extractTagType :: String -> String
+extractTagType ('<':rest) =
+ let isSpaceOrSlash c = c `elem` "/ \n\t" in
+ map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
+extractTagType _ = ""
+
+-- | Parse any HTML tag (opening or self-closing) and return text of tag
+anyHtmlTag :: GenParser Char ParserState [Char]
+anyHtmlTag = try $ do
+ char '<'
+ spaces
+ tag <- many1 alphaNum
+ attribs <- many htmlAttribute
+ spaces
+ ender <- option "" (string "/")
+ let ender' = if null ender then "" else " /"
+ spaces
+ char '>'
+ let result = "<" ++ tag ++
+ concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+ unsanitary <- unsanitaryTag tag
+ if unsanitary
+ then return $ "<!-- unsafe HTML removed -->"
+ else return result
+
+anyHtmlEndTag :: GenParser Char ParserState [Char]
+anyHtmlEndTag = try $ do
+ char '<'
+ spaces
+ char '/'
+ spaces
+ tag <- many1 alphaNum
+ spaces
+ char '>'
+ let result = "</" ++ tag ++ ">"
+ unsanitary <- unsanitaryTag tag
+ if unsanitary
+ then return $ "<!-- unsafe HTML removed -->"
+ else return result
+
+htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
+htmlTag tag = try $ do
+ char '<'
+ spaces
+ stringAnyCase tag
+ attribs <- many htmlAttribute
+ spaces
+ optional (string "/")
+ spaces
+ char '>'
+ return (tag, (map (\(name, content, _) -> (name, content)) attribs))
+
+-- parses a quoted html attribute value
+quoted :: Char -> GenParser Char st (String, String)
+quoted quoteChar = do
+ result <- between (char quoteChar) (char quoteChar)
+ (many (noneOf [quoteChar]))
+ return (result, [quoteChar])
+
+nullAttribute :: ([Char], [Char], [Char])
+nullAttribute = ("", "", "")
+
+htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
+htmlAttribute = do
+ attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
+ unsanitary <- unsanitaryAttribute attr
+ if unsanitary
+ then return nullAttribute
+ else return attr
+
+-- minimized boolean attribute
+htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
+htmlMinimizedAttribute = try $ do
+ many1 space
+ name <- many1 (choice [letter, oneOf ".-_:"])
+ return (name, name, name)
+
+htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char])
+htmlRegularAttribute = try $ do
+ many1 space
+ name <- many1 (choice [letter, oneOf ".-_:"])
+ spaces
+ char '='
+ spaces
+ (content, quoteStr) <- choice [ (quoted '\''),
+ (quoted '"'),
+ (do
+ a <- many (alphaNum <|> (oneOf "-._:"))
+ return (a,"")) ]
+ return (name, content,
+ (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
+
+-- | Parse an end tag of type 'tag'
+htmlEndTag :: [Char] -> GenParser Char st [Char]
+htmlEndTag tag = try $ do
+ char '<'
+ spaces
+ char '/'
+ spaces
+ stringAnyCase tag
+ spaces
+ char '>'
+ return $ "</" ++ tag ++ ">"
+
+{-
+-- | Returns @True@ if the tag is (or can be) an inline tag.
+isInline :: String -> Bool
+isInline tag = (extractTagType tag) `elem` inlineHtmlTags
+-}
+
+-- | Returns @True@ if the tag is (or can be) a block tag.
+isBlock :: String -> Bool
+isBlock tag = (extractTagType tag) `elem` blockHtmlTags
+
+anyHtmlBlockTag :: GenParser Char ParserState [Char]
+anyHtmlBlockTag = try $ do
+ tag <- anyHtmlTag <|> anyHtmlEndTag
+ if isBlock tag then return tag else fail "not a block tag"
+
+anyHtmlInlineTag :: GenParser Char ParserState [Char]
+anyHtmlInlineTag = try $ do
+ tag <- anyHtmlTag <|> anyHtmlEndTag
+ if not (isBlock tag) then return tag else fail "not an inline tag"
+
+-- | Parses material between script tags.
+-- Scripts must be treated differently, because they can contain '<>' etc.
+htmlScript :: GenParser Char ParserState [Char]
+htmlScript = try $ do
+ open <- string "<script"
+ rest <- manyTill anyChar (htmlEndTag "script")
+ st <- getState
+ if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
+ then return "<!-- unsafe HTML removed -->"
+ else return $ open ++ rest ++ "</script>"
+
+-- | Parses material between style tags.
+-- Style tags must be treated differently, because they can contain CSS
+htmlStyle :: GenParser Char ParserState [Char]
+htmlStyle = try $ do
+ open <- string "<style"
+ rest <- manyTill anyChar (htmlEndTag "style")
+ st <- getState
+ if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
+ then return "<!-- unsafe HTML removed -->"
+ else return $ open ++ rest ++ "</style>"
+
+htmlBlockElement :: GenParser Char ParserState [Char]
+htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
+
+rawHtmlBlock :: GenParser Char ParserState Block
+rawHtmlBlock = try $ do
+ body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag
+ state <- getState
+ if stateParseRaw state then return (RawHtml body) else return Null
+
+-- This is a block whose contents should be passed through verbatim, not interpreted.
+rawVerbatimBlock :: GenParser Char ParserState [Char]
+rawVerbatimBlock = try $ do
+ start <- anyHtmlBlockTag
+ let tagtype = extractTagType start
+ if tagtype `elem` ["pre"]
+ then do
+ contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar)
+ end <- htmlEndTag tagtype
+ return $ start ++ contents ++ end
+ else fail "Not a verbatim block"
+
+-- We don't want to parse </body> or </html> as raw HTML, since these
+-- are handled in parseHtml.
+rawHtmlBlock' :: GenParser Char ParserState Block
+rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
+ rawHtmlBlock
+
+-- | Parses an HTML comment.
+htmlComment :: GenParser Char st [Char]
+htmlComment = try $ do
+ string "<!--"
+ comment <- manyTill anyChar (try (string "-->"))
+ return $ "<!--" ++ comment ++ "-->"
+
+--
+-- parsing documents
+--
+
+xmlDec :: GenParser Char st [Char]
+xmlDec = try $ do
+ string "<?"
+ rest <- manyTill anyChar (char '>')
+ return $ "<?" ++ rest ++ ">"
+
+definition :: GenParser Char st [Char]
+definition = try $ do
+ string "<!"
+ rest <- manyTill anyChar (char '>')
+ return $ "<!" ++ rest ++ ">"
+
+nonTitleNonHead :: GenParser Char ParserState Char
+nonTitleNonHead = try $ do
+ notFollowedBy $ (htmlTag "title" >> return ' ') <|>
+ (htmlEndTag "head" >> return ' ')
+ (rawHtmlBlock >> return ' ') <|> anyChar
+
+parseTitle :: GenParser Char ParserState [Inline]
+parseTitle = try $ do
+ (tag, _) <- htmlTag "title"
+ contents <- inlinesTilEnd tag
+ spaces
+ return contents
+
+-- parse header and return meta-information (for now, just title)
+parseHead :: GenParser Char ParserState ([Inline], [a], [Char])
+parseHead = try $ do
+ htmlTag "head"
+ spaces
+ skipMany nonTitleNonHead
+ contents <- option [] parseTitle
+ skipMany nonTitleNonHead
+ htmlEndTag "head"
+ return (contents, [], "")
+
+skipHtmlTag :: String -> GenParser Char ParserState ()
+skipHtmlTag tag = optional (htmlTag tag)
+
+-- h1 class="title" representation of title in body
+bodyTitle :: GenParser Char ParserState [Inline]
+bodyTitle = try $ do
+ (_, attribs) <- htmlTag "h1"
+ case (extractAttribute "class" attribs) of
+ Just "title" -> return ""
+ _ -> fail "not title"
+ inlinesTilEnd "h1"
+
+parseHtml :: GenParser Char ParserState Pandoc
+parseHtml = do
+ sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
+ skipHtmlTag "html"
+ spaces
+ (title, authors, date) <- option ([], [], "") parseHead
+ spaces
+ skipHtmlTag "body"
+ spaces
+ optional bodyTitle -- skip title in body, because it's represented in meta
+ blocks <- parseBlocks
+ spaces
+ optional (htmlEndTag "body")
+ spaces
+ optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
+ eof
+ return $ Pandoc (Meta title authors date) blocks
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
+
+block :: GenParser Char ParserState Block
+block = choice [ codeBlock
+ , header
+ , hrule
+ , list
+ , blockQuote
+ , para
+ , plain
+ , rawHtmlBlock'
+ ] <?> "block"
+
+--
+-- header blocks
+--
+
+header :: GenParser Char ParserState Block
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel :: Int -> GenParser Char ParserState Block
+headerLevel n = try $ do
+ let level = "h" ++ show n
+ htmlTag level
+ contents <- inlinesTilEnd level
+ return $ Header n (normalizeSpaces contents)
+
+--
+-- hrule block
+--
+
+hrule :: GenParser Char ParserState Block
+hrule = try $ do
+ (_, attribs) <- htmlTag "hr"
+ state <- getState
+ if not (null attribs) && stateParseRaw state
+ then unexpected "attributes in hr" -- parse as raw in this case
+ else return HorizontalRule
+
+--
+-- code blocks
+--
+
+-- Note: HTML tags in code blocks (e.g. for syntax highlighting) are
+-- skipped, because they are not portable to output formats other than HTML.
+codeBlock :: GenParser Char ParserState Block
+codeBlock = try $ do
+ htmlTag "pre"
+ result <- manyTill
+ (many1 (satisfy (/= '<')) <|>
+ ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
+ (htmlEndTag "pre")
+ let result' = concat result
+ -- drop leading newline if any
+ let result'' = if "\n" `isPrefixOf` result'
+ then drop 1 result'
+ else result'
+ -- drop trailing newline if any
+ let result''' = if "\n" `isSuffixOf` result''
+ then init result''
+ else result''
+ return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result'''
+
+--
+-- block quotes
+--
+
+blockQuote :: GenParser Char ParserState Block
+blockQuote = try $ htmlTag "blockquote" >> spaces >>
+ blocksTilEnd "blockquote" >>= (return . BlockQuote)
+
+--
+-- list blocks
+--
+
+list :: GenParser Char ParserState Block
+list = choice [ bulletList, orderedList, definitionList ] <?> "list"
+
+orderedList :: GenParser Char ParserState Block
+orderedList = try $ do
+ (_, attribs) <- htmlTag "ol"
+ (start, style) <- option (1, DefaultStyle) $
+ do failIfStrict
+ let sta = fromMaybe "1" $
+ lookup "start" attribs
+ let sty = fromMaybe (fromMaybe "" $
+ lookup "style" attribs) $
+ lookup "class" attribs
+ let sty' = case sty of
+ "lower-roman" -> LowerRoman
+ "upper-roman" -> UpperRoman
+ "lower-alpha" -> LowerAlpha
+ "upper-alpha" -> UpperAlpha
+ "decimal" -> Decimal
+ _ -> DefaultStyle
+ return (read sta, sty')
+ spaces
+ items <- sepEndBy1 (blocksIn "li") spaces
+ htmlEndTag "ol"
+ return $ OrderedList (start, style, DefaultDelim) items
+
+bulletList :: GenParser Char ParserState Block
+bulletList = try $ do
+ htmlTag "ul"
+ spaces
+ items <- sepEndBy1 (blocksIn "li") spaces
+ htmlEndTag "ul"
+ return $ BulletList items
+
+definitionList :: GenParser Char ParserState Block
+definitionList = try $ do
+ failIfStrict -- def lists not part of standard markdown
+ htmlTag "dl"
+ spaces
+ items <- sepEndBy1 definitionListItem spaces
+ htmlEndTag "dl"
+ return $ DefinitionList items
+
+definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+definitionListItem = try $ do
+ terms <- sepEndBy1 (inlinesIn "dt") spaces
+ defs <- sepEndBy1 (blocksIn "dd") spaces
+ let term = intercalate [LineBreak] terms
+ return (term, concat defs)
+
+--
+-- paragraph block
+--
+
+para :: GenParser Char ParserState Block
+para = try $ htmlTag "p" >> inlinesTilEnd "p" >>=
+ return . Para . normalizeSpaces
+
+--
+-- plain block
+--
+
+plain :: GenParser Char ParserState Block
+plain = many1 inline >>= return . Plain . normalizeSpaces
+
+--
+-- inline
+--
+
+inline :: GenParser Char ParserState Inline
+inline = choice [ charRef
+ , strong
+ , emph
+ , superscript
+ , subscript
+ , strikeout
+ , spanStrikeout
+ , code
+ , str
+ , linebreak
+ , whitespace
+ , link
+ , image
+ , rawHtmlInline
+ ] <?> "inline"
+
+code :: GenParser Char ParserState Inline
+code = try $ do
+ htmlTag "code"
+ result <- manyTill anyChar (htmlEndTag "code")
+ -- remove internal line breaks, leading and trailing space,
+ -- and decode character references
+ return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
+ intercalate " " $ lines result
+
+rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline = do
+ result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag
+ state <- getState
+ if stateParseRaw state then return (HtmlInline result) else return (Str "")
+
+betweenTags :: [Char] -> GenParser Char ParserState [Inline]
+betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=
+ return . normalizeSpaces
+
+emph :: GenParser Char ParserState Inline
+emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph
+
+strong :: GenParser Char ParserState Inline
+strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
+
+superscript :: GenParser Char ParserState Inline
+superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
+
+subscript :: GenParser Char ParserState Inline
+subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
+
+strikeout :: GenParser Char ParserState Inline
+strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
+ return . Strikeout
+
+spanStrikeout :: GenParser Char ParserState Inline
+spanStrikeout = try $ do
+ failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
+ (_, attributes) <- htmlTag "span"
+ result <- case (extractAttribute "class" attributes) of
+ Just "strikeout" -> inlinesTilEnd "span"
+ _ -> fail "not a strikeout"
+ return $ Strikeout result
+
+whitespace :: GenParser Char st Inline
+whitespace = many1 space >> return Space
+
+-- hard line break
+linebreak :: GenParser Char ParserState Inline
+linebreak = htmlTag "br" >> optional newline >> return LineBreak
+
+str :: GenParser Char st Inline
+str = many1 (noneOf "<& \t\n") >>= return . Str
+
+--
+-- links and images
+--
+
+-- extract contents of attribute (attribute names are case-insensitive)
+extractAttribute :: [Char] -> [([Char], String)] -> Maybe String
+extractAttribute _ [] = Nothing
+extractAttribute name ((attrName, contents):rest) =
+ let name' = map toLower name
+ attrName' = map toLower attrName
+ in if attrName' == name'
+ then Just (decodeCharacterReferences contents)
+ else extractAttribute name rest
+
+link :: GenParser Char ParserState Inline
+link = try $ do
+ (_, attributes) <- htmlTag "a"
+ url <- case (extractAttribute "href" attributes) of
+ Just url -> return url
+ Nothing -> fail "no href"
+ let title = fromMaybe "" $ extractAttribute "title" attributes
+ lab <- inlinesTilEnd "a"
+ return $ Link (normalizeSpaces lab) (url, title)
+
+image :: GenParser Char ParserState Inline
+image = try $ do
+ (_, attributes) <- htmlTag "img"
+ url <- case (extractAttribute "src" attributes) of
+ Just url -> return url
+ Nothing -> fail "no src"
+ let title = fromMaybe "" $ extractAttribute "title" attributes
+ let alt = fromMaybe "" (extractAttribute "alt" attributes)
+ return $ Image [Str alt] (url, title)
+