Moved all haskell source to src subdirectory.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@1528 788f1e2b-df1e-0410-8736-df70ead52e1b
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2009-01-24 20:00:26 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2009-01-24 20:00:26 +0000
commit: 42aca57dee8d88afa5fac512aeb1198102908865 (patch)
tree: 1c6a98bd226f4fffde6768010715bc1d80e5d168 /src/Text/Pandoc/Readers/HTML.hs
parent: 39e8d8486693029abfef84c45e85416f7c775280 (diff)
1 files changed, 675 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
new file mode 100644
index 000000000..65e512b5e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -0,0 +1,675 @@
+{-
+Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.HTML
+   Copyright   : Copyright (C) 2006-8 John MacFarlane
+   License     : GNU GPL, version 2 or above 
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha 
+   Portability : portable
+
+Conversion of HTML to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.HTML ( 
+                                 readHtml, 
+                                 rawHtmlInline, 
+                                 rawHtmlBlock, 
+                                 anyHtmlBlockTag, 
+                                 anyHtmlInlineTag,  
+                                 anyHtmlTag,
+                                 anyHtmlEndTag,
+                                 htmlEndTag,
+                                 extractTagType,
+                                 htmlBlockElement,
+                                 unsanitaryURI
+                                ) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared 
+import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
+import Data.Maybe ( fromMaybe )
+import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf, intercalate )
+import Data.Char ( toLower, isAlphaNum )
+import Network.URI ( parseURIReference, URI (..) )
+
+-- | Convert HTML-formatted string to 'Pandoc' document.
+readHtml :: ParserState   -- ^ Parser state
+         -> String        -- ^ String to parse
+         -> Pandoc
+readHtml = readWith parseHtml
+
+--
+-- Constants
+--
+
+eitherBlockOrInline :: [[Char]]
+eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
+                  "map", "area", "object"]
+
+{-
+inlineHtmlTags :: [[Char]]
+inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
+                  "br", "cite", "code", "dfn", "em", "font", "i", "img",
+                  "input", "kbd", "label", "q", "s", "samp", "select",
+                  "small", "span", "strike", "strong", "sub", "sup",
+                  "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
+-}
+
+blockHtmlTags :: [[Char]]
+blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
+                 "dl", "fieldset", "form", "h1", "h2", "h3", "h4",
+                 "h5", "h6", "hr", "html", "isindex", "menu", "noframes",
+                 "noscript", "ol", "p", "pre", "table", "ul", "dd",
+                 "dt", "frameset", "li", "tbody", "td", "tfoot",
+                 "th", "thead", "tr", "script"] ++ eitherBlockOrInline
+
+sanitaryTags :: [[Char]]
+sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
+                "blockquote", "br", "button", "caption", "center",
+                "cite", "code", "col", "colgroup", "dd", "del", "dfn",
+                "dir", "div", "dl", "dt", "em", "fieldset", "font",
+                "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
+                "i", "img", "input", "ins", "kbd", "label", "legend",
+                "li", "map", "menu", "ol", "optgroup", "option", "p",
+                "pre", "q", "s", "samp", "select", "small", "span",
+                "strike", "strong", "sub", "sup", "table", "tbody",
+                "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
+                "u", "ul", "var"]
+
+sanitaryAttributes :: [[Char]]
+sanitaryAttributes = ["abbr", "accept", "accept-charset",
+                      "accesskey", "action", "align", "alt", "axis",
+                      "border", "cellpadding", "cellspacing", "char",
+                      "charoff", "charset", "checked", "cite", "class",
+                      "clear", "cols", "colspan", "color", "compact",
+                      "coords", "datetime", "dir", "disabled",
+                      "enctype", "for", "frame", "headers", "height",
+                      "href", "hreflang", "hspace", "id", "ismap",
+                      "label", "lang", "longdesc", "maxlength", "media",
+                      "method", "multiple", "name", "nohref", "noshade",
+                      "nowrap", "prompt", "readonly", "rel", "rev",
+                      "rows", "rowspan", "rules", "scope", "selected",
+                      "shape", "size", "span", "src", "start",
+                      "summary", "tabindex", "target", "title", "type",
+                      "usemap", "valign", "value", "vspace", "width"]
+
+--
+-- HTML utility functions
+--
+
+-- | Returns @True@ if sanitization is specified and the specified tag is 
+--  not on the sanitized tag list.
+unsanitaryTag :: [Char]
+              -> GenParser tok ParserState Bool
+unsanitaryTag tag = do
+  st <- getState
+  return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
+
+-- | returns @True@ if sanitization is specified and the specified attribute
+--  is not on the sanitized attribute list.
+unsanitaryAttribute :: ([Char], String, t)
+                    -> GenParser tok ParserState Bool
+unsanitaryAttribute (attr, val, _) = do
+  st <- getState
+  return $ stateSanitizeHTML st &&
+           (attr `notElem` sanitaryAttributes ||
+             (attr `elem` ["href","src"] && unsanitaryURI val))
+
+-- | Returns @True@ if the specified URI is potentially a security risk.
+unsanitaryURI :: String -> Bool
+unsanitaryURI u =
+  let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
+             "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
+             "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
+             "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
+             "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
+             "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
+             "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
+             "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
+             "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
+             "snews:", "webcal:", "ymsgr:"]
+  in  case parseURIReference u of
+           Just p  -> (map toLower $ uriScheme p) `notElem` safeURISchemes
+           Nothing -> True
+
+-- | Read blocks until end tag.
+blocksTilEnd :: String -> GenParser Char ParserState [Block]
+blocksTilEnd tag = do
+  blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
+  return $ filter (/= Null) blocks
+
+-- | Read inlines until end tag.
+inlinesTilEnd :: String -> GenParser Char ParserState [Inline]
+inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
+
+-- | Parse blocks between open and close tag.
+blocksIn :: String -> GenParser Char ParserState [Block]
+blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
+
+-- | Parse inlines between open and close tag.
+inlinesIn :: String -> GenParser Char ParserState [Inline]
+inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
+
+-- | Extract type from a tag:  e.g. @br@ from @\<br\>@
+extractTagType :: String -> String
+extractTagType ('<':rest) = 
+  let isSpaceOrSlash c = c `elem` "/ \n\t" in
+  map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
+extractTagType _ = ""
+
+-- | Parse any HTML tag (opening or self-closing) and return text of tag
+anyHtmlTag :: GenParser Char ParserState [Char]
+anyHtmlTag = try $ do
+  char '<'
+  spaces
+  tag <- many1 alphaNum
+  attribs <- many htmlAttribute
+  spaces
+  ender <- option "" (string "/")
+  let ender' = if null ender then "" else " /"
+  spaces
+  char '>'
+  let result = "<" ++ tag ++ 
+               concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+  unsanitary <- unsanitaryTag tag
+  if unsanitary
+     then return $ "<!-- unsafe HTML removed -->"
+     else return result
+
+anyHtmlEndTag :: GenParser Char ParserState [Char]
+anyHtmlEndTag = try $ do
+  char '<'   
+  spaces
+  char '/'
+  spaces
+  tag <- many1 alphaNum
+  spaces
+  char '>'
+  let result = "</" ++ tag ++ ">"
+  unsanitary <- unsanitaryTag tag
+  if unsanitary
+     then return $ "<!-- unsafe HTML removed -->"
+     else return result 
+
+htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
+htmlTag tag = try $ do
+  char '<'
+  spaces
+  stringAnyCase tag
+  attribs <- many htmlAttribute
+  spaces
+  optional (string "/")
+  spaces
+  char '>'
+  return (tag, (map (\(name, content, _) -> (name, content)) attribs))
+
+-- parses a quoted html attribute value
+quoted :: Char -> GenParser Char st (String, String)
+quoted quoteChar = do
+  result <- between (char quoteChar) (char quoteChar) 
+                    (many (noneOf [quoteChar]))
+  return (result, [quoteChar])
+
+nullAttribute :: ([Char], [Char], [Char])
+nullAttribute = ("", "", "") 
+
+htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
+htmlAttribute = do
+  attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
+  unsanitary <- unsanitaryAttribute attr
+  if unsanitary
+     then return nullAttribute
+     else return attr
+
+-- minimized boolean attribute
+htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
+htmlMinimizedAttribute = try $ do
+  many1 space
+  name <- many1 (choice [letter, oneOf ".-_:"])
+  return (name, name, name)
+
+htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char])
+htmlRegularAttribute = try $ do
+  many1 space
+  name <- many1 (choice [letter, oneOf ".-_:"])
+  spaces
+  char '='
+  spaces
+  (content, quoteStr) <- choice [ (quoted '\''), 
+                                  (quoted '"'), 
+                                  (do
+                                     a <- many (alphaNum <|> (oneOf "-._:"))
+                                     return (a,"")) ]
+  return (name, content,
+          (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
+
+-- | Parse an end tag of type 'tag'
+htmlEndTag :: [Char] -> GenParser Char st [Char]
+htmlEndTag tag = try $ do
+  char '<'   
+  spaces
+  char '/'
+  spaces
+  stringAnyCase tag
+  spaces
+  char '>'
+  return $ "</" ++ tag ++ ">"
+
+{-
+-- | Returns @True@ if the tag is (or can be) an inline tag.
+isInline :: String -> Bool
+isInline tag = (extractTagType tag) `elem` inlineHtmlTags
+-}
+
+-- | Returns @True@ if the tag is (or can be) a block tag.
+isBlock :: String -> Bool
+isBlock tag = (extractTagType tag) `elem` blockHtmlTags 
+
+anyHtmlBlockTag :: GenParser Char ParserState [Char]
+anyHtmlBlockTag = try $ do
+  tag <- anyHtmlTag <|> anyHtmlEndTag
+  if isBlock tag then return tag else fail "not a block tag"
+
+anyHtmlInlineTag :: GenParser Char ParserState [Char]
+anyHtmlInlineTag = try $ do
+  tag <- anyHtmlTag <|> anyHtmlEndTag
+  if not (isBlock tag) then return tag else fail "not an inline tag"
+
+-- | Parses material between script tags.
+-- Scripts must be treated differently, because they can contain '<>' etc.
+htmlScript :: GenParser Char ParserState [Char]
+htmlScript = try $ do
+  open <- string "<script"
+  rest <- manyTill anyChar (htmlEndTag "script")
+  st <- getState
+  if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
+     then return "<!-- unsafe HTML removed -->"
+     else return $ open ++ rest ++ "</script>"
+
+-- | Parses material between style tags.
+-- Style tags must be treated differently, because they can contain CSS
+htmlStyle :: GenParser Char ParserState [Char]
+htmlStyle = try $ do
+  open <- string "<style"
+  rest <- manyTill anyChar (htmlEndTag "style")
+  st <- getState
+  if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
+     then return "<!-- unsafe HTML removed -->"
+     else return $ open ++ rest ++ "</style>"
+
+htmlBlockElement :: GenParser Char ParserState [Char]
+htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
+
+rawHtmlBlock :: GenParser Char ParserState Block
+rawHtmlBlock = try $ do
+  body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag
+  state <- getState
+  if stateParseRaw state then return (RawHtml body) else return Null
+
+-- This is a block whose contents should be passed through verbatim, not interpreted.
+rawVerbatimBlock :: GenParser Char ParserState [Char]
+rawVerbatimBlock = try $ do
+  start <- anyHtmlBlockTag
+  let tagtype = extractTagType start
+  if tagtype `elem` ["pre"]
+     then do
+       contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar)
+       end <- htmlEndTag tagtype
+       return $ start ++ contents ++ end
+     else fail "Not a verbatim block"
+
+-- We don't want to parse </body> or </html> as raw HTML, since these
+-- are handled in parseHtml.
+rawHtmlBlock' :: GenParser Char ParserState Block
+rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
+                   rawHtmlBlock
+
+-- | Parses an HTML comment.
+htmlComment :: GenParser Char st [Char]
+htmlComment = try $ do
+  string "<!--"
+  comment <- manyTill anyChar (try (string "-->"))
+  return $ "<!--" ++ comment ++ "-->"
+
+--
+-- parsing documents
+--
+
+xmlDec :: GenParser Char st [Char]
+xmlDec = try $ do
+  string "<?"
+  rest <- manyTill anyChar (char '>')
+  return $ "<?" ++ rest ++ ">"
+
+definition :: GenParser Char st [Char]
+definition = try $ do
+  string "<!"
+  rest <- manyTill anyChar (char '>')
+  return $ "<!" ++ rest ++ ">"
+
+nonTitleNonHead :: GenParser Char ParserState Char
+nonTitleNonHead = try $ do
+  notFollowedBy $ (htmlTag "title" >> return ' ') <|> 
+                  (htmlEndTag "head" >> return ' ')
+  (rawHtmlBlock >> return ' ') <|> anyChar
+
+parseTitle :: GenParser Char ParserState [Inline]
+parseTitle = try $ do
+  (tag, _) <- htmlTag "title"
+  contents <- inlinesTilEnd tag
+  spaces
+  return contents
+
+-- parse header and return meta-information (for now, just title)
+parseHead :: GenParser Char ParserState ([Inline], [a], [Char])
+parseHead = try $ do
+  htmlTag "head"
+  spaces
+  skipMany nonTitleNonHead
+  contents <- option [] parseTitle
+  skipMany nonTitleNonHead
+  htmlEndTag "head"
+  return (contents, [], "")
+
+skipHtmlTag :: String -> GenParser Char ParserState ()
+skipHtmlTag tag = optional (htmlTag tag)
+
+-- h1 class="title" representation of title in body
+bodyTitle :: GenParser Char ParserState [Inline]
+bodyTitle = try $ do
+  (_, attribs) <- htmlTag "h1"  
+  case (extractAttribute "class" attribs) of
+       Just "title" -> return ""
+       _            -> fail "not title"
+  inlinesTilEnd "h1"
+
+parseHtml :: GenParser Char ParserState Pandoc
+parseHtml = do
+  sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
+  skipHtmlTag "html"
+  spaces
+  (title, authors, date) <- option ([], [], "") parseHead 
+  spaces
+  skipHtmlTag "body"
+  spaces
+  optional bodyTitle  -- skip title in body, because it's represented in meta
+  blocks <- parseBlocks
+  spaces
+  optional (htmlEndTag "body")
+  spaces
+  optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
+  eof
+  return $ Pandoc (Meta title authors date) blocks
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
+
+block :: GenParser Char ParserState Block
+block = choice [ codeBlock
+               , header
+               , hrule
+               , list
+               , blockQuote
+               , para
+               , plain
+               , rawHtmlBlock'
+               ] <?> "block"
+
+--
+-- header blocks
+--
+
+header :: GenParser Char ParserState Block
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel :: Int -> GenParser Char ParserState Block
+headerLevel n = try $ do
+    let level = "h" ++ show n
+    htmlTag level
+    contents <- inlinesTilEnd level
+    return $ Header n (normalizeSpaces contents)
+
+--
+-- hrule block
+--
+
+hrule :: GenParser Char ParserState Block
+hrule = try  $ do
+  (_, attribs) <- htmlTag "hr"
+  state <- getState
+  if not (null attribs) && stateParseRaw state
+     then unexpected "attributes in hr" -- parse as raw in this case
+     else return HorizontalRule
+
+--
+-- code blocks
+--
+
+-- Note:  HTML tags in code blocks (e.g. for syntax highlighting) are 
+-- skipped, because they are not portable to output formats other than HTML.
+codeBlock :: GenParser Char ParserState Block
+codeBlock = try $ do
+    htmlTag "pre" 
+    result <- manyTill 
+              (many1 (satisfy (/= '<')) <|> 
+               ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
+              (htmlEndTag "pre")
+    let result' = concat result
+    -- drop leading newline if any
+    let result'' = if "\n" `isPrefixOf` result'
+                      then drop 1 result'
+                      else result'
+    -- drop trailing newline if any
+    let result''' = if "\n" `isSuffixOf` result''
+                       then init result''
+                       else result''
+    return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result'''
+
+--
+-- block quotes
+--
+
+blockQuote :: GenParser Char ParserState Block
+blockQuote = try $ htmlTag "blockquote" >> spaces >> 
+                   blocksTilEnd "blockquote" >>= (return . BlockQuote)
+
+--
+-- list blocks
+--
+
+list :: GenParser Char ParserState Block
+list = choice [ bulletList, orderedList, definitionList ] <?> "list"
+
+orderedList :: GenParser Char ParserState Block
+orderedList = try $ do
+  (_, attribs) <- htmlTag "ol"
+  (start, style) <- option (1, DefaultStyle) $
+                           do failIfStrict
+                              let sta = fromMaybe "1" $ 
+                                        lookup "start" attribs
+                              let sty = fromMaybe (fromMaybe "" $
+                                        lookup "style" attribs) $
+                                        lookup "class" attribs
+                              let sty' = case sty of
+                                          "lower-roman"  -> LowerRoman
+                                          "upper-roman"  -> UpperRoman
+                                          "lower-alpha"  -> LowerAlpha
+                                          "upper-alpha"  -> UpperAlpha
+                                          "decimal"      -> Decimal
+                                          _              -> DefaultStyle
+                              return (read sta, sty')
+  spaces
+  items <- sepEndBy1 (blocksIn "li") spaces
+  htmlEndTag "ol"
+  return $ OrderedList (start, style, DefaultDelim) items
+
+bulletList :: GenParser Char ParserState Block
+bulletList = try $ do
+  htmlTag "ul"
+  spaces
+  items <- sepEndBy1 (blocksIn "li") spaces
+  htmlEndTag "ul"
+  return $ BulletList items
+
+definitionList :: GenParser Char ParserState Block
+definitionList = try $ do
+  failIfStrict  -- def lists not part of standard markdown
+  htmlTag "dl"
+  spaces
+  items <- sepEndBy1 definitionListItem spaces
+  htmlEndTag "dl"
+  return $ DefinitionList items
+
+definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+definitionListItem = try $ do
+  terms <- sepEndBy1 (inlinesIn "dt") spaces
+  defs <- sepEndBy1 (blocksIn "dd") spaces
+  let term = intercalate [LineBreak] terms
+  return (term, concat defs)
+
+--
+-- paragraph block
+--
+
+para :: GenParser Char ParserState Block
+para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= 
+             return . Para . normalizeSpaces
+
+-- 
+-- plain block
+--
+
+plain :: GenParser Char ParserState Block
+plain = many1 inline >>= return . Plain . normalizeSpaces
+
+-- 
+-- inline
+--
+
+inline :: GenParser Char ParserState Inline
+inline = choice [ charRef
+                , strong
+                , emph
+                , superscript
+                , subscript
+                , strikeout
+                , spanStrikeout
+                , code
+                , str
+                , linebreak
+                , whitespace
+                , link
+                , image
+                , rawHtmlInline
+                ] <?> "inline"
+
+code :: GenParser Char ParserState Inline
+code = try $ do 
+  htmlTag "code"
+  result <- manyTill anyChar (htmlEndTag "code")
+  -- remove internal line breaks, leading and trailing space,
+  -- and decode character references
+  return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ 
+                  intercalate " " $ lines result
+
+rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline = do
+  result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag
+  state <- getState
+  if stateParseRaw state then return (HtmlInline result) else return (Str "")
+
+betweenTags :: [Char] -> GenParser Char ParserState [Inline]
+betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= 
+                        return . normalizeSpaces
+
+emph :: GenParser Char ParserState Inline
+emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph
+
+strong :: GenParser Char ParserState Inline
+strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
+
+superscript :: GenParser Char ParserState Inline
+superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
+
+subscript :: GenParser Char ParserState Inline
+subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
+
+strikeout :: GenParser Char ParserState Inline
+strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
+            return . Strikeout
+
+spanStrikeout :: GenParser Char ParserState Inline
+spanStrikeout = try $ do
+  failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
+  (_, attributes) <- htmlTag "span" 
+  result <- case (extractAttribute "class" attributes) of
+              Just "strikeout" -> inlinesTilEnd "span"
+              _                -> fail "not a strikeout"
+  return $ Strikeout result
+
+whitespace :: GenParser Char st Inline
+whitespace = many1 space >> return Space
+
+-- hard line break
+linebreak :: GenParser Char ParserState Inline
+linebreak = htmlTag "br" >> optional newline >> return LineBreak
+
+str :: GenParser Char st Inline
+str = many1 (noneOf "<& \t\n") >>= return . Str
+
+--
+-- links and images
+--
+
+-- extract contents of attribute (attribute names are case-insensitive)
+extractAttribute :: [Char] -> [([Char], String)] -> Maybe String
+extractAttribute _    [] = Nothing
+extractAttribute name ((attrName, contents):rest) = 
+  let name'     = map toLower name 
+      attrName' = map toLower attrName
+  in  if attrName' == name'
+         then Just (decodeCharacterReferences contents)
+         else extractAttribute name rest
+
+link :: GenParser Char ParserState Inline
+link = try $ do
+  (_, attributes) <- htmlTag "a"  
+  url <- case (extractAttribute "href" attributes) of
+           Just url -> return url
+           Nothing  -> fail "no href"
+  let title = fromMaybe "" $ extractAttribute "title" attributes
+  lab <- inlinesTilEnd "a"
+  return $ Link (normalizeSpaces lab) (url, title)
+
+image :: GenParser Char ParserState Inline
+image = try $ do
+  (_, attributes) <- htmlTag "img" 
+  url <- case (extractAttribute "src" attributes) of
+           Just url -> return url
+           Nothing  -> fail "no src"
+  let title = fromMaybe "" $ extractAttribute "title" attributes
+  let alt = fromMaybe "" (extractAttribute "alt" attributes)
+  return $ Image [Str alt] (url, title)
+
author	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2009-01-24 20:00:26 +0000
committer	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2009-01-24 20:00:26 +0000
commit	42aca57dee8d88afa5fac512aeb1198102908865 (patch)
tree	1c6a98bd226f4fffde6768010715bc1d80e5d168 /src/Text/Pandoc/Readers/HTML.hs
parent	39e8d8486693029abfef84c45e85416f7c775280 (diff)