summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs675
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs774
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs1243
-rw-r--r--src/Text/Pandoc/Readers/RST.hs707
-rw-r--r--src/Text/Pandoc/Readers/TeXMath.hs233
5 files changed, 3632 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
new file mode 100644
index 000000000..65e512b5e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -0,0 +1,675 @@
+{-
+Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.HTML
+ Copyright : Copyright (C) 2006-8 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of HTML to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.HTML (
+ readHtml,
+ rawHtmlInline,
+ rawHtmlBlock,
+ anyHtmlBlockTag,
+ anyHtmlInlineTag,
+ anyHtmlTag,
+ anyHtmlEndTag,
+ htmlEndTag,
+ extractTagType,
+ htmlBlockElement,
+ unsanitaryURI
+ ) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
+import Data.Maybe ( fromMaybe )
+import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf, intercalate )
+import Data.Char ( toLower, isAlphaNum )
+import Network.URI ( parseURIReference, URI (..) )
+
+-- | Convert HTML-formatted string to 'Pandoc' document.
+readHtml :: ParserState -- ^ Parser state
+ -> String -- ^ String to parse
+ -> Pandoc
+readHtml = readWith parseHtml
+
+--
+-- Constants
+--
+
+eitherBlockOrInline :: [[Char]]
+eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
+ "map", "area", "object"]
+
+{-
+inlineHtmlTags :: [[Char]]
+inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
+ "br", "cite", "code", "dfn", "em", "font", "i", "img",
+ "input", "kbd", "label", "q", "s", "samp", "select",
+ "small", "span", "strike", "strong", "sub", "sup",
+ "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
+-}
+
+blockHtmlTags :: [[Char]]
+blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
+ "dl", "fieldset", "form", "h1", "h2", "h3", "h4",
+ "h5", "h6", "hr", "html", "isindex", "menu", "noframes",
+ "noscript", "ol", "p", "pre", "table", "ul", "dd",
+ "dt", "frameset", "li", "tbody", "td", "tfoot",
+ "th", "thead", "tr", "script"] ++ eitherBlockOrInline
+
+sanitaryTags :: [[Char]]
+sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
+ "blockquote", "br", "button", "caption", "center",
+ "cite", "code", "col", "colgroup", "dd", "del", "dfn",
+ "dir", "div", "dl", "dt", "em", "fieldset", "font",
+ "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
+ "i", "img", "input", "ins", "kbd", "label", "legend",
+ "li", "map", "menu", "ol", "optgroup", "option", "p",
+ "pre", "q", "s", "samp", "select", "small", "span",
+ "strike", "strong", "sub", "sup", "table", "tbody",
+ "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
+ "u", "ul", "var"]
+
+sanitaryAttributes :: [[Char]]
+sanitaryAttributes = ["abbr", "accept", "accept-charset",
+ "accesskey", "action", "align", "alt", "axis",
+ "border", "cellpadding", "cellspacing", "char",
+ "charoff", "charset", "checked", "cite", "class",
+ "clear", "cols", "colspan", "color", "compact",
+ "coords", "datetime", "dir", "disabled",
+ "enctype", "for", "frame", "headers", "height",
+ "href", "hreflang", "hspace", "id", "ismap",
+ "label", "lang", "longdesc", "maxlength", "media",
+ "method", "multiple", "name", "nohref", "noshade",
+ "nowrap", "prompt", "readonly", "rel", "rev",
+ "rows", "rowspan", "rules", "scope", "selected",
+ "shape", "size", "span", "src", "start",
+ "summary", "tabindex", "target", "title", "type",
+ "usemap", "valign", "value", "vspace", "width"]
+
+--
+-- HTML utility functions
+--
+
+-- | Returns @True@ if sanitization is specified and the specified tag is
+-- not on the sanitized tag list.
+unsanitaryTag :: [Char]
+ -> GenParser tok ParserState Bool
+unsanitaryTag tag = do
+ st <- getState
+ return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
+
+-- | returns @True@ if sanitization is specified and the specified attribute
+-- is not on the sanitized attribute list.
+unsanitaryAttribute :: ([Char], String, t)
+ -> GenParser tok ParserState Bool
+unsanitaryAttribute (attr, val, _) = do
+ st <- getState
+ return $ stateSanitizeHTML st &&
+ (attr `notElem` sanitaryAttributes ||
+ (attr `elem` ["href","src"] && unsanitaryURI val))
+
+-- | Returns @True@ if the specified URI is potentially a security risk.
+unsanitaryURI :: String -> Bool
+unsanitaryURI u =
+ let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
+ "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
+ "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
+ "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
+ "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
+ "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
+ "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
+ "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
+ "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
+ "snews:", "webcal:", "ymsgr:"]
+ in case parseURIReference u of
+ Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
+ Nothing -> True
+
+-- | Read blocks until end tag.
+blocksTilEnd :: String -> GenParser Char ParserState [Block]
+blocksTilEnd tag = do
+ blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
+ return $ filter (/= Null) blocks
+
+-- | Read inlines until end tag.
+inlinesTilEnd :: String -> GenParser Char ParserState [Inline]
+inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
+
+-- | Parse blocks between open and close tag.
+blocksIn :: String -> GenParser Char ParserState [Block]
+blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
+
+-- | Parse inlines between open and close tag.
+inlinesIn :: String -> GenParser Char ParserState [Inline]
+inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
+
+-- | Extract type from a tag: e.g. @br@ from @\<br\>@
+extractTagType :: String -> String
+extractTagType ('<':rest) =
+ let isSpaceOrSlash c = c `elem` "/ \n\t" in
+ map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
+extractTagType _ = ""
+
+-- | Parse any HTML tag (opening or self-closing) and return text of tag
+anyHtmlTag :: GenParser Char ParserState [Char]
+anyHtmlTag = try $ do
+ char '<'
+ spaces
+ tag <- many1 alphaNum
+ attribs <- many htmlAttribute
+ spaces
+ ender <- option "" (string "/")
+ let ender' = if null ender then "" else " /"
+ spaces
+ char '>'
+ let result = "<" ++ tag ++
+ concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+ unsanitary <- unsanitaryTag tag
+ if unsanitary
+ then return $ "<!-- unsafe HTML removed -->"
+ else return result
+
+anyHtmlEndTag :: GenParser Char ParserState [Char]
+anyHtmlEndTag = try $ do
+ char '<'
+ spaces
+ char '/'
+ spaces
+ tag <- many1 alphaNum
+ spaces
+ char '>'
+ let result = "</" ++ tag ++ ">"
+ unsanitary <- unsanitaryTag tag
+ if unsanitary
+ then return $ "<!-- unsafe HTML removed -->"
+ else return result
+
+htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
+htmlTag tag = try $ do
+ char '<'
+ spaces
+ stringAnyCase tag
+ attribs <- many htmlAttribute
+ spaces
+ optional (string "/")
+ spaces
+ char '>'
+ return (tag, (map (\(name, content, _) -> (name, content)) attribs))
+
+-- parses a quoted html attribute value
+quoted :: Char -> GenParser Char st (String, String)
+quoted quoteChar = do
+ result <- between (char quoteChar) (char quoteChar)
+ (many (noneOf [quoteChar]))
+ return (result, [quoteChar])
+
+nullAttribute :: ([Char], [Char], [Char])
+nullAttribute = ("", "", "")
+
+htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
+htmlAttribute = do
+ attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
+ unsanitary <- unsanitaryAttribute attr
+ if unsanitary
+ then return nullAttribute
+ else return attr
+
+-- minimized boolean attribute
+htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
+htmlMinimizedAttribute = try $ do
+ many1 space
+ name <- many1 (choice [letter, oneOf ".-_:"])
+ return (name, name, name)
+
+htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char])
+htmlRegularAttribute = try $ do
+ many1 space
+ name <- many1 (choice [letter, oneOf ".-_:"])
+ spaces
+ char '='
+ spaces
+ (content, quoteStr) <- choice [ (quoted '\''),
+ (quoted '"'),
+ (do
+ a <- many (alphaNum <|> (oneOf "-._:"))
+ return (a,"")) ]
+ return (name, content,
+ (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
+
+-- | Parse an end tag of type 'tag'
+htmlEndTag :: [Char] -> GenParser Char st [Char]
+htmlEndTag tag = try $ do
+ char '<'
+ spaces
+ char '/'
+ spaces
+ stringAnyCase tag
+ spaces
+ char '>'
+ return $ "</" ++ tag ++ ">"
+
+{-
+-- | Returns @True@ if the tag is (or can be) an inline tag.
+isInline :: String -> Bool
+isInline tag = (extractTagType tag) `elem` inlineHtmlTags
+-}
+
+-- | Returns @True@ if the tag is (or can be) a block tag.
+isBlock :: String -> Bool
+isBlock tag = (extractTagType tag) `elem` blockHtmlTags
+
+anyHtmlBlockTag :: GenParser Char ParserState [Char]
+anyHtmlBlockTag = try $ do
+ tag <- anyHtmlTag <|> anyHtmlEndTag
+ if isBlock tag then return tag else fail "not a block tag"
+
+anyHtmlInlineTag :: GenParser Char ParserState [Char]
+anyHtmlInlineTag = try $ do
+ tag <- anyHtmlTag <|> anyHtmlEndTag
+ if not (isBlock tag) then return tag else fail "not an inline tag"
+
+-- | Parses material between script tags.
+-- Scripts must be treated differently, because they can contain '<>' etc.
+htmlScript :: GenParser Char ParserState [Char]
+htmlScript = try $ do
+ open <- string "<script"
+ rest <- manyTill anyChar (htmlEndTag "script")
+ st <- getState
+ if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
+ then return "<!-- unsafe HTML removed -->"
+ else return $ open ++ rest ++ "</script>"
+
+-- | Parses material between style tags.
+-- Style tags must be treated differently, because they can contain CSS
+htmlStyle :: GenParser Char ParserState [Char]
+htmlStyle = try $ do
+ open <- string "<style"
+ rest <- manyTill anyChar (htmlEndTag "style")
+ st <- getState
+ if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
+ then return "<!-- unsafe HTML removed -->"
+ else return $ open ++ rest ++ "</style>"
+
+htmlBlockElement :: GenParser Char ParserState [Char]
+htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
+
+rawHtmlBlock :: GenParser Char ParserState Block
+rawHtmlBlock = try $ do
+ body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag
+ state <- getState
+ if stateParseRaw state then return (RawHtml body) else return Null
+
+-- This is a block whose contents should be passed through verbatim, not interpreted.
+rawVerbatimBlock :: GenParser Char ParserState [Char]
+rawVerbatimBlock = try $ do
+ start <- anyHtmlBlockTag
+ let tagtype = extractTagType start
+ if tagtype `elem` ["pre"]
+ then do
+ contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar)
+ end <- htmlEndTag tagtype
+ return $ start ++ contents ++ end
+ else fail "Not a verbatim block"
+
+-- We don't want to parse </body> or </html> as raw HTML, since these
+-- are handled in parseHtml.
+rawHtmlBlock' :: GenParser Char ParserState Block
+rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
+ rawHtmlBlock
+
+-- | Parses an HTML comment.
+htmlComment :: GenParser Char st [Char]
+htmlComment = try $ do
+ string "<!--"
+ comment <- manyTill anyChar (try (string "-->"))
+ return $ "<!--" ++ comment ++ "-->"
+
+--
+-- parsing documents
+--
+
+xmlDec :: GenParser Char st [Char]
+xmlDec = try $ do
+ string "<?"
+ rest <- manyTill anyChar (char '>')
+ return $ "<?" ++ rest ++ ">"
+
+definition :: GenParser Char st [Char]
+definition = try $ do
+ string "<!"
+ rest <- manyTill anyChar (char '>')
+ return $ "<!" ++ rest ++ ">"
+
+nonTitleNonHead :: GenParser Char ParserState Char
+nonTitleNonHead = try $ do
+ notFollowedBy $ (htmlTag "title" >> return ' ') <|>
+ (htmlEndTag "head" >> return ' ')
+ (rawHtmlBlock >> return ' ') <|> anyChar
+
+parseTitle :: GenParser Char ParserState [Inline]
+parseTitle = try $ do
+ (tag, _) <- htmlTag "title"
+ contents <- inlinesTilEnd tag
+ spaces
+ return contents
+
+-- parse header and return meta-information (for now, just title)
+parseHead :: GenParser Char ParserState ([Inline], [a], [Char])
+parseHead = try $ do
+ htmlTag "head"
+ spaces
+ skipMany nonTitleNonHead
+ contents <- option [] parseTitle
+ skipMany nonTitleNonHead
+ htmlEndTag "head"
+ return (contents, [], "")
+
+skipHtmlTag :: String -> GenParser Char ParserState ()
+skipHtmlTag tag = optional (htmlTag tag)
+
+-- h1 class="title" representation of title in body
+bodyTitle :: GenParser Char ParserState [Inline]
+bodyTitle = try $ do
+ (_, attribs) <- htmlTag "h1"
+ case (extractAttribute "class" attribs) of
+ Just "title" -> return ""
+ _ -> fail "not title"
+ inlinesTilEnd "h1"
+
+parseHtml :: GenParser Char ParserState Pandoc
+parseHtml = do
+ sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
+ skipHtmlTag "html"
+ spaces
+ (title, authors, date) <- option ([], [], "") parseHead
+ spaces
+ skipHtmlTag "body"
+ spaces
+ optional bodyTitle -- skip title in body, because it's represented in meta
+ blocks <- parseBlocks
+ spaces
+ optional (htmlEndTag "body")
+ spaces
+ optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
+ eof
+ return $ Pandoc (Meta title authors date) blocks
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
+
+block :: GenParser Char ParserState Block
+block = choice [ codeBlock
+ , header
+ , hrule
+ , list
+ , blockQuote
+ , para
+ , plain
+ , rawHtmlBlock'
+ ] <?> "block"
+
+--
+-- header blocks
+--
+
+header :: GenParser Char ParserState Block
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel :: Int -> GenParser Char ParserState Block
+headerLevel n = try $ do
+ let level = "h" ++ show n
+ htmlTag level
+ contents <- inlinesTilEnd level
+ return $ Header n (normalizeSpaces contents)
+
+--
+-- hrule block
+--
+
+hrule :: GenParser Char ParserState Block
+hrule = try $ do
+ (_, attribs) <- htmlTag "hr"
+ state <- getState
+ if not (null attribs) && stateParseRaw state
+ then unexpected "attributes in hr" -- parse as raw in this case
+ else return HorizontalRule
+
+--
+-- code blocks
+--
+
+-- Note: HTML tags in code blocks (e.g. for syntax highlighting) are
+-- skipped, because they are not portable to output formats other than HTML.
+codeBlock :: GenParser Char ParserState Block
+codeBlock = try $ do
+ htmlTag "pre"
+ result <- manyTill
+ (many1 (satisfy (/= '<')) <|>
+ ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
+ (htmlEndTag "pre")
+ let result' = concat result
+ -- drop leading newline if any
+ let result'' = if "\n" `isPrefixOf` result'
+ then drop 1 result'
+ else result'
+ -- drop trailing newline if any
+ let result''' = if "\n" `isSuffixOf` result''
+ then init result''
+ else result''
+ return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result'''
+
+--
+-- block quotes
+--
+
+blockQuote :: GenParser Char ParserState Block
+blockQuote = try $ htmlTag "blockquote" >> spaces >>
+ blocksTilEnd "blockquote" >>= (return . BlockQuote)
+
+--
+-- list blocks
+--
+
+list :: GenParser Char ParserState Block
+list = choice [ bulletList, orderedList, definitionList ] <?> "list"
+
+orderedList :: GenParser Char ParserState Block
+orderedList = try $ do
+ (_, attribs) <- htmlTag "ol"
+ (start, style) <- option (1, DefaultStyle) $
+ do failIfStrict
+ let sta = fromMaybe "1" $
+ lookup "start" attribs
+ let sty = fromMaybe (fromMaybe "" $
+ lookup "style" attribs) $
+ lookup "class" attribs
+ let sty' = case sty of
+ "lower-roman" -> LowerRoman
+ "upper-roman" -> UpperRoman
+ "lower-alpha" -> LowerAlpha
+ "upper-alpha" -> UpperAlpha
+ "decimal" -> Decimal
+ _ -> DefaultStyle
+ return (read sta, sty')
+ spaces
+ items <- sepEndBy1 (blocksIn "li") spaces
+ htmlEndTag "ol"
+ return $ OrderedList (start, style, DefaultDelim) items
+
+bulletList :: GenParser Char ParserState Block
+bulletList = try $ do
+ htmlTag "ul"
+ spaces
+ items <- sepEndBy1 (blocksIn "li") spaces
+ htmlEndTag "ul"
+ return $ BulletList items
+
+definitionList :: GenParser Char ParserState Block
+definitionList = try $ do
+ failIfStrict -- def lists not part of standard markdown
+ htmlTag "dl"
+ spaces
+ items <- sepEndBy1 definitionListItem spaces
+ htmlEndTag "dl"
+ return $ DefinitionList items
+
+definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+definitionListItem = try $ do
+ terms <- sepEndBy1 (inlinesIn "dt") spaces
+ defs <- sepEndBy1 (blocksIn "dd") spaces
+ let term = intercalate [LineBreak] terms
+ return (term, concat defs)
+
+--
+-- paragraph block
+--
+
+para :: GenParser Char ParserState Block
+para = try $ htmlTag "p" >> inlinesTilEnd "p" >>=
+ return . Para . normalizeSpaces
+
+--
+-- plain block
+--
+
+plain :: GenParser Char ParserState Block
+plain = many1 inline >>= return . Plain . normalizeSpaces
+
+--
+-- inline
+--
+
+inline :: GenParser Char ParserState Inline
+inline = choice [ charRef
+ , strong
+ , emph
+ , superscript
+ , subscript
+ , strikeout
+ , spanStrikeout
+ , code
+ , str
+ , linebreak
+ , whitespace
+ , link
+ , image
+ , rawHtmlInline
+ ] <?> "inline"
+
+code :: GenParser Char ParserState Inline
+code = try $ do
+ htmlTag "code"
+ result <- manyTill anyChar (htmlEndTag "code")
+ -- remove internal line breaks, leading and trailing space,
+ -- and decode character references
+ return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
+ intercalate " " $ lines result
+
+rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline = do
+ result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag
+ state <- getState
+ if stateParseRaw state then return (HtmlInline result) else return (Str "")
+
+betweenTags :: [Char] -> GenParser Char ParserState [Inline]
+betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=
+ return . normalizeSpaces
+
+emph :: GenParser Char ParserState Inline
+emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph
+
+strong :: GenParser Char ParserState Inline
+strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
+
+superscript :: GenParser Char ParserState Inline
+superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
+
+subscript :: GenParser Char ParserState Inline
+subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
+
+strikeout :: GenParser Char ParserState Inline
+strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
+ return . Strikeout
+
+spanStrikeout :: GenParser Char ParserState Inline
+spanStrikeout = try $ do
+ failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
+ (_, attributes) <- htmlTag "span"
+ result <- case (extractAttribute "class" attributes) of
+ Just "strikeout" -> inlinesTilEnd "span"
+ _ -> fail "not a strikeout"
+ return $ Strikeout result
+
+whitespace :: GenParser Char st Inline
+whitespace = many1 space >> return Space
+
+-- hard line break
+linebreak :: GenParser Char ParserState Inline
+linebreak = htmlTag "br" >> optional newline >> return LineBreak
+
+str :: GenParser Char st Inline
+str = many1 (noneOf "<& \t\n") >>= return . Str
+
+--
+-- links and images
+--
+
+-- extract contents of attribute (attribute names are case-insensitive)
+extractAttribute :: [Char] -> [([Char], String)] -> Maybe String
+extractAttribute _ [] = Nothing
+extractAttribute name ((attrName, contents):rest) =
+ let name' = map toLower name
+ attrName' = map toLower attrName
+ in if attrName' == name'
+ then Just (decodeCharacterReferences contents)
+ else extractAttribute name rest
+
+link :: GenParser Char ParserState Inline
+link = try $ do
+ (_, attributes) <- htmlTag "a"
+ url <- case (extractAttribute "href" attributes) of
+ Just url -> return url
+ Nothing -> fail "no href"
+ let title = fromMaybe "" $ extractAttribute "title" attributes
+ lab <- inlinesTilEnd "a"
+ return $ Link (normalizeSpaces lab) (url, title)
+
+image :: GenParser Char ParserState Inline
+image = try $ do
+ (_, attributes) <- htmlTag "img"
+ url <- case (extractAttribute "src" attributes) of
+ Just url -> return url
+ Nothing -> fail "no src"
+ let title = fromMaybe "" $ extractAttribute "title" attributes
+ let alt = fromMaybe "" (extractAttribute "alt" attributes)
+ return $ Image [Str alt] (url, title)
+
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
new file mode 100644
index 000000000..f35ab4f29
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -0,0 +1,774 @@
+{-
+Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.LaTeX
+ Copyright : Copyright (C) 2006-8 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of LaTeX to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.LaTeX (
+ readLaTeX,
+ rawLaTeXInline,
+ rawLaTeXEnvironment'
+ ) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Data.Maybe ( fromMaybe )
+import Data.Char ( chr )
+import Data.List ( isPrefixOf, isSuffixOf )
+
+-- | Parse LaTeX from string and return 'Pandoc' document.
+readLaTeX :: ParserState -- ^ Parser state, including options for parser
+ -> String -- ^ String to parse
+ -> Pandoc
+readLaTeX = readWith parseLaTeX
+
+-- characters with special meaning
+specialChars :: [Char]
+specialChars = "\\`$%^&_~#{}\n \t|<>'\"-"
+
+--
+-- utility functions
+--
+
+-- | Returns text between brackets and its matching pair.
+bracketedText :: Char -> Char -> GenParser Char st [Char]
+bracketedText openB closeB = do
+ result <- charsInBalanced' openB closeB
+ return $ [openB] ++ result ++ [closeB]
+
+-- | Returns an option or argument of a LaTeX command.
+optOrArg :: GenParser Char st [Char]
+optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']'
+
+-- | True if the string begins with '{'.
+isArg :: [Char] -> Bool
+isArg ('{':_) = True
+isArg _ = False
+
+-- | Returns list of options and arguments of a LaTeX command.
+commandArgs :: GenParser Char st [[Char]]
+commandArgs = many optOrArg
+
+-- | Parses LaTeX command, returns (name, star, list of options or arguments).
+command :: GenParser Char st ([Char], [Char], [[Char]])
+command = do
+ char '\\'
+ name <- many1 letter
+ star <- option "" (string "*") -- some commands have starred versions
+ args <- commandArgs
+ return (name, star, args)
+
+begin :: [Char] -> GenParser Char st [Char]
+begin name = try $ do
+ string $ "\\begin{" ++ name ++ "}"
+ optional commandArgs
+ spaces
+ return name
+
+end :: [Char] -> GenParser Char st [Char]
+end name = try $ do
+ string $ "\\end{" ++ name ++ "}"
+ return name
+
+-- | Returns a list of block elements containing the contents of an
+-- environment.
+environment :: [Char] -> GenParser Char ParserState [Block]
+environment name = try $ begin name >> spaces >> manyTill block (end name) >>~ spaces
+
+anyEnvironment :: GenParser Char ParserState Block
+anyEnvironment = try $ do
+ string "\\begin{"
+ name <- many letter
+ star <- option "" (string "*") -- some environments have starred variants
+ char '}'
+ optional commandArgs
+ spaces
+ contents <- manyTill block (end (name ++ star))
+ spaces
+ return $ BlockQuote contents
+
+--
+-- parsing documents
+--
+
+-- | Process LaTeX preamble, extracting metadata.
+processLaTeXPreamble :: GenParser Char ParserState ()
+processLaTeXPreamble = try $ manyTill
+ (choice [bibliographic, comment, unknownCommand, nullBlock])
+ (try (string "\\begin{document}")) >>
+ spaces
+
+-- | Parse LaTeX and return 'Pandoc'.
+parseLaTeX :: GenParser Char ParserState Pandoc
+parseLaTeX = do
+ optional processLaTeXPreamble -- preamble might not be present (fragment)
+ spaces
+ blocks <- parseBlocks
+ spaces
+ optional $ try (string "\\end{document}" >> many anyChar)
+ -- might not be present (fragment)
+ spaces
+ eof
+ state <- getState
+ let blocks' = filter (/= Null) blocks
+ let title' = stateTitle state
+ let authors' = stateAuthors state
+ let date' = stateDate state
+ return $ Pandoc (Meta title' authors' date') blocks'
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = spaces >> many block
+
+block :: GenParser Char ParserState Block
+block = choice [ hrule
+ , codeBlock
+ , header
+ , list
+ , blockQuote
+ , comment
+ , bibliographic
+ , para
+ , itemBlock
+ , unknownEnvironment
+ , ignore
+ , unknownCommand ] <?> "block"
+
+--
+-- header blocks
+--
+
+header :: GenParser Char ParserState Block
+header = try $ do
+ char '\\'
+ subs <- many (try (string "sub"))
+ string "section"
+ optional (char '*')
+ char '{'
+ title' <- manyTill inline (char '}')
+ spaces
+ return $ Header (length subs + 1) (normalizeSpaces title')
+
+--
+-- hrule block
+--
+
+hrule :: GenParser Char st Block
+hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n",
+ "\\newpage" ] >> spaces >> return HorizontalRule
+
+--
+-- code blocks
+--
+
+codeBlock :: GenParser Char ParserState Block
+codeBlock = codeBlockWith "verbatim" <|> codeBlockWith "Verbatim" <|> lhsCodeBlock
+-- Note: Verbatim is from fancyvrb.
+
+codeBlockWith :: String -> GenParser Char st Block
+codeBlockWith env = try $ do
+ string ("\\begin{" ++ env ++ "}") -- don't use begin function because it
+ -- gobbles whitespace
+ optional blanklines -- we want to gobble blank lines, but not
+ -- leading space
+ contents <- manyTill anyChar (try (string $ "\\end{" ++ env ++ "}"))
+ spaces
+ let classes = if env == "code" then ["haskell"] else []
+ return $ CodeBlock ("",classes,[]) (stripTrailingNewlines contents)
+
+lhsCodeBlock :: GenParser Char ParserState Block
+lhsCodeBlock = do
+ failUnlessLHS
+ (CodeBlock (_,_,_) cont) <- codeBlockWith "code"
+ return $ CodeBlock ("", ["sourceCode","haskell"], []) cont
+
+--
+-- block quotes
+--
+
+blockQuote :: GenParser Char ParserState Block
+blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>=
+ return . BlockQuote
+
+--
+-- list blocks
+--
+
+list :: GenParser Char ParserState Block
+list = bulletList <|> orderedList <|> definitionList <?> "list"
+
+listItem :: GenParser Char ParserState ([Inline], [Block])
+listItem = try $ do
+ ("item", _, args) <- command
+ spaces
+ state <- getState
+ let oldParserContext = stateParserContext state
+ updateState (\s -> s {stateParserContext = ListItemState})
+ blocks <- many block
+ updateState (\s -> s {stateParserContext = oldParserContext})
+ opt <- case args of
+ ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x ->
+ parseFromString (many inline) $ tail $ init x
+ _ -> return []
+ return (opt, blocks)
+
+orderedList :: GenParser Char ParserState Block
+orderedList = try $ do
+ string "\\begin{enumerate}"
+ (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $
+ try $ do failIfStrict
+ char '['
+ res <- anyOrderedListMarker
+ char ']'
+ return res
+ spaces
+ option "" $ try $ do string "\\setlength{\\itemindent}"
+ char '{'
+ manyTill anyChar (char '}')
+ spaces
+ start <- option 1 $ try $ do failIfStrict
+ string "\\setcounter{enum"
+ many1 (oneOf "iv")
+ string "}{"
+ num <- many1 digit
+ char '}'
+ spaces
+ return $ (read num) + 1
+ items <- many listItem
+ end "enumerate"
+ spaces
+ return $ OrderedList (start, style, delim) $ map snd items
+
+bulletList :: GenParser Char ParserState Block
+bulletList = try $ do
+ begin "itemize"
+ spaces
+ items <- many listItem
+ end "itemize"
+ spaces
+ return (BulletList $ map snd items)
+
+definitionList :: GenParser Char ParserState Block
+definitionList = try $ do
+ begin "description"
+ spaces
+ items <- many listItem
+ end "description"
+ spaces
+ return (DefinitionList items)
+
+--
+-- paragraph block
+--
+
+para :: GenParser Char ParserState Block
+para = do
+ res <- many1 inline
+ spaces
+ return $ if null (filter (`notElem` [Str "", Space]) res)
+ then Null
+ else Para $ normalizeSpaces res
+
+--
+-- title authors date
+--
+
+bibliographic :: GenParser Char ParserState Block
+bibliographic = choice [ maketitle, title, authors, date ]
+
+maketitle :: GenParser Char st Block
+maketitle = try (string "\\maketitle") >> spaces >> return Null
+
+title :: GenParser Char ParserState Block
+title = try $ do
+ string "\\title{"
+ tit <- manyTill inline (char '}')
+ spaces
+ updateState (\state -> state { stateTitle = tit })
+ return Null
+
+authors :: GenParser Char ParserState Block
+authors = try $ do
+ string "\\author{"
+ authors' <- manyTill anyChar (char '}')
+ spaces
+ let authors'' = map removeLeadingTrailingSpace $ lines $
+ substitute "\\\\" "\n" authors'
+ updateState (\s -> s { stateAuthors = authors'' })
+ return Null
+
+date :: GenParser Char ParserState Block
+date = try $ do
+ string "\\date{"
+ date' <- manyTill anyChar (char '}')
+ spaces
+ updateState (\state -> state { stateDate = date' })
+ return Null
+
+--
+-- item block
+-- for use in unknown environments that aren't being parsed as raw latex
+--
+
+-- this forces items to be parsed in different blocks
+itemBlock :: GenParser Char ParserState Block
+itemBlock = try $ do
+ ("item", _, args) <- command
+ state <- getState
+ if stateParserContext state == ListItemState
+ then fail "item should be handled by list block"
+ else if null args
+ then return Null
+ else return $ Plain [Str (stripFirstAndLast (head args))]
+
+--
+-- raw LaTeX
+--
+
+-- | Parse any LaTeX environment and return a Para block containing
+-- the whole literal environment as raw TeX.
+rawLaTeXEnvironment :: GenParser Char st Block
+rawLaTeXEnvironment = do
+ contents <- rawLaTeXEnvironment'
+ spaces
+ return $ Para [TeX contents]
+
+-- | Parse any LaTeX environment and return a string containing
+-- the whole literal environment as raw TeX.
+rawLaTeXEnvironment' :: GenParser Char st String
+rawLaTeXEnvironment' = try $ do
+ string "\\begin{"
+ name <- many1 letter
+ star <- option "" (string "*") -- for starred variants
+ let name' = name ++ star
+ char '}'
+ args <- option [] commandArgs
+ let argStr = concat args
+ contents <- manyTill (choice [ (many1 (noneOf "\\")),
+ rawLaTeXEnvironment',
+ string "\\" ])
+ (end name')
+ return $ "\\begin{" ++ name' ++ "}" ++ argStr ++
+ concat contents ++ "\\end{" ++ name' ++ "}"
+
+unknownEnvironment :: GenParser Char ParserState Block
+unknownEnvironment = try $ do
+ state <- getState
+ result <- if stateParseRaw state -- check whether we should include raw TeX
+ then rawLaTeXEnvironment -- if so, get whole raw environment
+ else anyEnvironment -- otherwise just the contents
+ return result
+
+-- \ignore{} is used conventionally in literate haskell for definitions
+-- that are to be processed by the compiler but not printed.
+ignore :: GenParser Char ParserState Block
+ignore = try $ do
+ ("ignore", _, _) <- command
+ spaces
+ return Null
+
+unknownCommand :: GenParser Char ParserState Block
+unknownCommand = try $ do
+ notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description",
+ "document"]
+ state <- getState
+ if stateParserContext state == ListItemState
+ then notFollowedBy' $ string "\\item"
+ else return ()
+ if stateParseRaw state
+ then do
+ (name, star, args) <- command
+ spaces
+ return $ Plain [TeX ("\\" ++ name ++ star ++ concat args)]
+ else do -- skip unknown command, leaving arguments to be parsed
+ char '\\'
+ letter
+ many (letter <|> digit)
+ optional (try $ string "{}")
+ spaces
+ return Null
+
+-- latex comment
+comment :: GenParser Char st Block
+comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null
+
+--
+-- inline
+--
+
+inline :: GenParser Char ParserState Inline
+inline = choice [ str
+ , endline
+ , whitespace
+ , quoted
+ , apostrophe
+ , spacer
+ , strong
+ , math
+ , ellipses
+ , emDash
+ , enDash
+ , hyphen
+ , emph
+ , strikeout
+ , superscript
+ , subscript
+ , ref
+ , lab
+ , code
+ , url
+ , link
+ , image
+ , footnote
+ , linebreak
+ , accentedChar
+ , specialChar
+ , rawLaTeXInline
+ , escapedChar
+ , unescapedChar
+ ] <?> "inline"
+
+accentedChar :: GenParser Char st Inline
+accentedChar = normalAccentedChar <|> specialAccentedChar
+
+normalAccentedChar :: GenParser Char st Inline
+normalAccentedChar = try $ do
+ char '\\'
+ accent <- oneOf "'`^\"~"
+ character <- (try $ char '{' >> letter >>~ char '}') <|> letter
+ let table = fromMaybe [] $ lookup character accentTable
+ let result = case lookup accent table of
+ Just num -> chr num
+ Nothing -> '?'
+ return $ Str [result]
+
+-- an association list of letters and association list of accents
+-- and decimal character numbers.
+accentTable :: [(Char, [(Char, Int)])]
+accentTable =
+ [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
+ ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
+ ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
+ ('N', [('~', 209)]),
+ ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
+ ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
+ ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
+ ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
+ ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
+ ('n', [('~', 241)]),
+ ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
+ ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]
+
+specialAccentedChar :: GenParser Char st Inline
+specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig,
+ oslash, pound, euro, copyright, sect ]
+
+ccedil :: GenParser Char st Inline
+ccedil = try $ do
+ char '\\'
+ letter' <- oneOfStrings ["cc", "cC"]
+ let num = if letter' == "cc" then 231 else 199
+ return $ Str [chr num]
+
+aring :: GenParser Char st Inline
+aring = try $ do
+ char '\\'
+ letter' <- oneOfStrings ["aa", "AA"]
+ let num = if letter' == "aa" then 229 else 197
+ return $ Str [chr num]
+
+iuml :: GenParser Char st Inline
+iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >>
+ return (Str [chr 239])
+
+szlig :: GenParser Char st Inline
+szlig = try (string "\\ss") >> return (Str [chr 223])
+
+oslash :: GenParser Char st Inline
+oslash = try $ do
+ char '\\'
+ letter' <- choice [char 'o', char 'O']
+ let num = if letter' == 'o' then 248 else 216
+ return $ Str [chr num]
+
+aelig :: GenParser Char st Inline
+aelig = try $ do
+ char '\\'
+ letter' <- oneOfStrings ["ae", "AE"]
+ let num = if letter' == "ae" then 230 else 198
+ return $ Str [chr num]
+
+pound :: GenParser Char st Inline
+pound = try (string "\\pounds") >> return (Str [chr 163])
+
+euro :: GenParser Char st Inline
+euro = try (string "\\euro") >> return (Str [chr 8364])
+
+copyright :: GenParser Char st Inline
+copyright = try (string "\\copyright") >> return (Str [chr 169])
+
+sect :: GenParser Char st Inline
+sect = try (string "\\S") >> return (Str [chr 167])
+
+escapedChar :: GenParser Char st Inline
+escapedChar = do
+ result <- escaped (oneOf " $%&_#{}\n")
+ return $ if result == Str "\n" then Str " " else result
+
+-- nonescaped special characters
+unescapedChar :: GenParser Char st Inline
+unescapedChar = oneOf "`$^&_#{}|<>" >>= return . (\c -> Str [c])
+
+specialChar :: GenParser Char st Inline
+specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ]
+
+backslash :: GenParser Char st Inline
+backslash = try (string "\\textbackslash") >> optional (try $ string "{}") >> return (Str "\\")
+
+tilde :: GenParser Char st Inline
+tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~")
+
+caret :: GenParser Char st Inline
+caret = try (string "\\^{}") >> return (Str "^")
+
+bar :: GenParser Char st Inline
+bar = try (string "\\textbar") >> optional (try $ string "{}") >> return (Str "\\")
+
+lt :: GenParser Char st Inline
+lt = try (string "\\textless") >> optional (try $ string "{}") >> return (Str "<")
+
+gt :: GenParser Char st Inline
+gt = try (string "\\textgreater") >> optional (try $ string "{}") >> return (Str ">")
+
+doubleQuote :: GenParser Char st Inline
+doubleQuote = char '"' >> return (Str "\"")
+
+code :: GenParser Char ParserState Inline
+code = code1 <|> code2 <|> lhsInlineCode
+
+code1 :: GenParser Char st Inline
+code1 = try $ do
+ string "\\verb"
+ marker <- anyChar
+ result <- manyTill anyChar (char marker)
+ return $ Code $ removeLeadingTrailingSpace result
+
+code2 :: GenParser Char st Inline
+code2 = try $ do
+ string "\\texttt{"
+ result <- manyTill (noneOf "\\\n~$%^&{}") (char '}')
+ return $ Code result
+
+lhsInlineCode :: GenParser Char ParserState Inline
+lhsInlineCode = try $ do
+ failUnlessLHS
+ char '|'
+ result <- manyTill (noneOf "|\n") (char '|')
+ return $ Code result
+
+emph :: GenParser Char ParserState Inline
+emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >>
+ manyTill inline (char '}') >>= return . Emph
+
+strikeout :: GenParser Char ParserState Inline
+strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>=
+ return . Strikeout
+
+superscript :: GenParser Char ParserState Inline
+superscript = try $ string "\\textsuperscript{" >>
+ manyTill inline (char '}') >>= return . Superscript
+
+-- note: \textsubscript isn't a standard latex command, but we use
+-- a defined version in pandoc.
+subscript :: GenParser Char ParserState Inline
+subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>=
+ return . Subscript
+
+apostrophe :: GenParser Char ParserState Inline
+apostrophe = char '\'' >> return Apostrophe
+
+quoted :: GenParser Char ParserState Inline
+quoted = doubleQuoted <|> singleQuoted
+
+singleQuoted :: GenParser Char ParserState Inline
+singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>=
+ return . Quoted SingleQuote . normalizeSpaces
+
+doubleQuoted :: GenParser Char ParserState Inline
+doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>=
+ return . Quoted DoubleQuote . normalizeSpaces
+
+singleQuoteStart :: GenParser Char st Char
+singleQuoteStart = char '`'
+
+singleQuoteEnd :: GenParser Char st ()
+singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum
+
+doubleQuoteStart :: CharParser st String
+doubleQuoteStart = string "``"
+
+doubleQuoteEnd :: CharParser st String
+doubleQuoteEnd = try $ string "''"
+
+ellipses :: GenParser Char st Inline
+ellipses = try $ string "\\ldots" >> optional (try $ string "{}") >>
+ return Ellipses
+
+enDash :: GenParser Char st Inline
+enDash = try (string "--") >> return EnDash
+
+emDash :: GenParser Char st Inline
+emDash = try (string "---") >> return EmDash
+
+hyphen :: GenParser Char st Inline
+hyphen = char '-' >> return (Str "-")
+
+lab :: GenParser Char st Inline
+lab = try $ do
+ string "\\label{"
+ result <- manyTill anyChar (char '}')
+ return $ Str $ "(" ++ result ++ ")"
+
+ref :: GenParser Char st Inline
+ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str
+
+strong :: GenParser Char ParserState Inline
+strong = try (string "\\textbf{") >> manyTill inline (char '}') >>=
+ return . Strong
+
+whitespace :: GenParser Char st Inline
+whitespace = many1 (oneOf "~ \t") >> return Space
+
+-- hard line break
+linebreak :: GenParser Char st Inline
+linebreak = try (string "\\\\") >> return LineBreak
+
+spacer :: GenParser Char st Inline
+spacer = try (string "\\,") >> return (Str "")
+
+str :: GenParser Char st Inline
+str = many1 (noneOf specialChars) >>= return . Str
+
+-- endline internal to paragraph
+endline :: GenParser Char st Inline
+endline = try $ newline >> notFollowedBy blankline >> return Space
+
+-- math
+math :: GenParser Char st Inline
+math = (math3 >>= return . Math DisplayMath)
+ <|> (math1 >>= return . Math InlineMath)
+ <|> (math2 >>= return . Math InlineMath)
+ <|> (math4 >>= return . Math DisplayMath)
+ <|> (math5 >>= return . Math DisplayMath)
+ <|> (math6 >>= return . Math DisplayMath)
+ <?> "math"
+
+math1 :: GenParser Char st String
+math1 = try $ char '$' >> manyTill anyChar (char '$')
+
+math2 :: GenParser Char st String
+math2 = try $ string "\\(" >> manyTill anyChar (try $ string "\\)")
+
+math3 :: GenParser Char st String
+math3 = try $ char '$' >> math1 >>~ char '$'
+
+math4 :: GenParser Char st String
+math4 = try $ do
+ name <- begin "equation" <|> begin "equation*" <|> begin "displaymath" <|> begin "displaymath*"
+ spaces
+ manyTill anyChar (end name)
+
+math5 :: GenParser Char st String
+math5 = try $ (string "\\[") >> spaces >> manyTill anyChar (try $ string "\\]")
+
+math6 :: GenParser Char st String
+math6 = try $ do
+ name <- begin "eqnarray" <|> begin "eqnarray*"
+ spaces
+ res <- manyTill anyChar (end name)
+ return $ filter (/= '&') res -- remove eqnarray alignment codes
+
+--
+-- links and images
+--
+
+url :: GenParser Char ParserState Inline
+url = try $ do
+ string "\\url"
+ url' <- charsInBalanced '{' '}'
+ return $ Link [Code url'] (url', "")
+
+link :: GenParser Char ParserState Inline
+link = try $ do
+ string "\\href{"
+ url' <- manyTill anyChar (char '}')
+ char '{'
+ label' <- manyTill inline (char '}')
+ return $ Link (normalizeSpaces label') (url', "")
+
+image :: GenParser Char ParserState Inline
+image = try $ do
+ ("includegraphics", _, args) <- command
+ let args' = filter isArg args -- filter out options
+ let src = if null args' then
+ ("", "")
+ else
+ (stripFirstAndLast (head args'), "")
+ return $ Image [Str "image"] src
+
+footnote :: GenParser Char ParserState Inline
+footnote = try $ do
+ (name, _, (contents:[])) <- command
+ if ((name == "footnote") || (name == "thanks"))
+ then string ""
+ else fail "not a footnote or thanks command"
+ let contents' = stripFirstAndLast contents
+ -- parse the extracted block, which may contain various block elements:
+ rest <- getInput
+ setInput $ contents'
+ blocks <- parseBlocks
+ setInput rest
+ return $ Note blocks
+
+-- | Parse any LaTeX command and return it in a raw TeX inline element.
+rawLaTeXInline :: GenParser Char ParserState Inline
+rawLaTeXInline = try $ do
+ notFollowedBy' $ oneOfStrings ["\\begin", "\\end", "\\item", "\\ignore"]
+ state <- getState
+ if stateParseRaw state
+ then do
+ (name, star, args) <- command
+ return $ TeX ("\\" ++ name ++ star ++ concat args)
+ else do -- skip unknown command, leaving arguments to be parsed
+ char '\\'
+ letter
+ many (letter <|> digit)
+ optional (try $ string "{}")
+ return $ Str ""
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
new file mode 100644
index 000000000..896f5832e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -0,0 +1,1243 @@
+{-# LANGUAGE CPP #-}
+{-
+Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Markdown
+ Copyright : Copyright (C) 2006-8 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of markdown-formatted plain text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Markdown (
+ readMarkdown
+ ) where
+
+import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate )
+import Data.Ord ( comparing )
+import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper )
+import Data.Maybe
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
+import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
+ anyHtmlInlineTag, anyHtmlTag,
+ anyHtmlEndTag, htmlEndTag, extractTagType,
+ htmlBlockElement, unsanitaryURI )
+import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
+import Text.ParserCombinators.Parsec
+import Control.Monad (when)
+
+-- | Read markdown from an input string and return a Pandoc document.
+readMarkdown :: ParserState -> String -> Pandoc
+readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n")
+
+--
+-- Constants and data structure definitions
+--
+
+spaceChars :: [Char]
+spaceChars = " \t"
+
+bulletListMarkers :: [Char]
+bulletListMarkers = "*+-"
+
+hruleChars :: [Char]
+hruleChars = "*-_"
+
+setextHChars :: [Char]
+setextHChars = "=-"
+
+-- treat these as potentially non-text when parsing inline:
+specialChars :: [Char]
+specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221"
+
+--
+-- auxiliary functions
+--
+
+indentSpaces :: GenParser Char ParserState [Char]
+indentSpaces = try $ do
+ state <- getState
+ let tabStop = stateTabStop state
+ try (count tabStop (char ' ')) <|>
+ (many (char ' ') >> string "\t") <?> "indentation"
+
+nonindentSpaces :: GenParser Char ParserState [Char]
+nonindentSpaces = do
+ state <- getState
+ let tabStop = stateTabStop state
+ sps <- many (char ' ')
+ if length sps < tabStop
+ then return sps
+ else unexpected "indented line"
+
+-- | Fail unless we're at beginning of a line.
+failUnlessBeginningOfLine :: GenParser tok st ()
+failUnlessBeginningOfLine = do
+ pos <- getPosition
+ if sourceColumn pos == 1 then return () else fail "not beginning of line"
+
+-- | Fail unless we're in "smart typography" mode.
+failUnlessSmart :: GenParser tok ParserState ()
+failUnlessSmart = do
+ state <- getState
+ if stateSmart state then return () else fail "Smart typography feature"
+
+-- | Parse a sequence of inline elements between square brackets,
+-- including inlines between balanced pairs of square brackets.
+inlinesInBalancedBrackets :: GenParser Char ParserState Inline
+ -> GenParser Char ParserState [Inline]
+inlinesInBalancedBrackets parser = try $ do
+ char '['
+ result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser
+ if res == "["
+ then return ()
+ else pzero
+ bal <- inlinesInBalancedBrackets parser
+ return $ [Str "["] ++ bal ++ [Str "]"])
+ <|> (count 1 parser))
+ (char ']')
+ return $ concat result
+
+--
+-- document structure
+--
+
+titleLine :: GenParser Char ParserState [Inline]
+titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline
+
+authorsLine :: GenParser Char st [String]
+authorsLine = try $ do
+ char '%'
+ skipSpaces
+ authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
+ newline
+ return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors
+
+dateLine :: GenParser Char st String
+dateLine = try $ do
+ char '%'
+ skipSpaces
+ date <- many (noneOf "\n")
+ newline
+ return $ decodeCharacterReferences $ removeTrailingSpace date
+
+titleBlock :: GenParser Char ParserState ([Inline], [String], [Char])
+titleBlock = try $ do
+ failIfStrict
+ title <- option [] titleLine
+ author <- option [] authorsLine
+ date <- option "" dateLine
+ optional blanklines
+ return (title, author, date)
+
+parseMarkdown :: GenParser Char ParserState Pandoc
+parseMarkdown = do
+ -- markdown allows raw HTML
+ updateState (\state -> state { stateParseRaw = True })
+ startPos <- getPosition
+ -- go through once just to get list of reference keys
+ -- docMinusKeys is the raw document with blanks where the keys were...
+ docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>=
+ return . concat
+ setInput docMinusKeys
+ setPosition startPos
+ st <- getState
+ -- go through again for notes unless strict...
+ if stateStrict st
+ then return ()
+ else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=
+ return . concat
+ st' <- getState
+ let reversedNotes = stateNotes st'
+ updateState $ \s -> s { stateNotes = reverse reversedNotes }
+ setInput docMinusNotes
+ setPosition startPos
+ -- now parse it for real...
+ (title, author, date) <- option ([],[],"") titleBlock
+ blocks <- parseBlocks
+ return $ Pandoc (Meta title author date) $ filter (/= Null) blocks
+
+--
+-- initial pass for references and notes
+--
+
+referenceKey :: GenParser Char ParserState [Char]
+referenceKey = try $ do
+ startPos <- getPosition
+ nonindentSpaces
+ lab <- reference
+ char ':'
+ skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[')
+ let sourceURL excludes = many $
+ optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' '))
+ src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+ tit <- option "" referenceTitle
+ blanklines
+ endPos <- getPosition
+ let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src, tit))
+ st <- getState
+ let oldkeys = stateKeys st
+ updateState $ \s -> s { stateKeys = newkey : oldkeys }
+ -- return blanks so line count isn't affected
+ return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
+
+referenceTitle :: GenParser Char st String
+referenceTitle = try $ do
+ skipSpaces >> optional newline >> skipSpaces
+ tit <- (charsInBalanced '(' ')' >>= return . unwords . words)
+ <|> do delim <- char '\'' <|> char '"'
+ manyTill anyChar (try (char delim >> skipSpaces >>
+ notFollowedBy (noneOf ")\n")))
+ return $ decodeCharacterReferences tit
+
+noteMarker :: GenParser Char st [Char]
+noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']')
+
+rawLine :: GenParser Char ParserState [Char]
+rawLine = do
+ notFollowedBy blankline
+ notFollowedBy' noteMarker
+ contents <- many1 nonEndline
+ end <- option "" (newline >> optional indentSpaces >> return "\n")
+ return $ contents ++ end
+
+rawLines :: GenParser Char ParserState [Char]
+rawLines = many1 rawLine >>= return . concat
+
+noteBlock :: GenParser Char ParserState [Char]
+noteBlock = try $ do
+ startPos <- getPosition
+ ref <- noteMarker
+ char ':'
+ optional blankline
+ optional indentSpaces
+ raw <- sepBy rawLines (try (blankline >> indentSpaces))
+ optional blanklines
+ endPos <- getPosition
+ -- parse the extracted text, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n"
+ let newnote = (ref, contents)
+ st <- getState
+ let oldnotes = stateNotes st
+ updateState $ \s -> s { stateNotes = newnote : oldnotes }
+ -- return blanks so line count isn't affected
+ return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = manyTill block eof
+
+block :: GenParser Char ParserState Block
+block = do
+ st <- getState
+ choice (if stateStrict st
+ then [ header
+ , codeBlockIndented
+ , blockQuote
+ , hrule
+ , bulletList
+ , orderedList
+ , htmlBlock
+ , para
+ , plain
+ , nullBlock ]
+ else [ codeBlockDelimited
+ , header
+ , table
+ , codeBlockIndented
+ , lhsCodeBlock
+ , blockQuote
+ , hrule
+ , bulletList
+ , orderedList
+ , definitionList
+ , para
+ , rawHtmlBlocks
+ , plain
+ , nullBlock ]) <?> "block"
+
+--
+-- header blocks
+--
+
+header :: GenParser Char ParserState Block
+header = setextHeader <|> atxHeader <?> "header"
+
+atxHeader :: GenParser Char ParserState Block
+atxHeader = try $ do
+ level <- many1 (char '#') >>= return . length
+ notFollowedBy (char '.' <|> char ')') -- this would be a list
+ skipSpaces
+ text <- manyTill inline atxClosing >>= return . normalizeSpaces
+ return $ Header level text
+
+atxClosing :: GenParser Char st [Char]
+atxClosing = try $ skipMany (char '#') >> blanklines
+
+setextHeader :: GenParser Char ParserState Block
+setextHeader = try $ do
+ text <- many1Till inline newline
+ underlineChar <- oneOf setextHChars
+ many (char underlineChar)
+ blanklines
+ let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1
+ return $ Header level (normalizeSpaces text)
+
+--
+-- hrule block
+--
+
+hrule :: GenParser Char st Block
+hrule = try $ do
+ skipSpaces
+ start <- oneOf hruleChars
+ count 2 (skipSpaces >> char start)
+ skipMany (oneOf spaceChars <|> char start)
+ newline
+ optional blanklines
+ return HorizontalRule
+
+--
+-- code blocks
+--
+
+indentedLine :: GenParser Char ParserState [Char]
+indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
+
+codeBlockDelimiter :: Maybe Int
+ -> GenParser Char st (Int, ([Char], [[Char]], [([Char], [Char])]))
+codeBlockDelimiter len = try $ do
+ size <- case len of
+ Just l -> count l (char '~') >> many (char '~') >> return l
+ Nothing -> count 3 (char '~') >> many (char '~') >>=
+ return . (+ 3) . length
+ many spaceChar
+ attr <- option ([],[],[]) attributes
+ blankline
+ return (size, attr)
+
+attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
+attributes = try $ do
+ char '{'
+ many spaceChar
+ attrs <- many (attribute >>~ many spaceChar)
+ char '}'
+ let (ids, classes, keyvals) = unzip3 attrs
+ let id' = if null ids then "" else head ids
+ return (id', concat classes, concat keyvals)
+
+attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
+attribute = identifierAttr <|> classAttr <|> keyValAttr
+
+identifier :: GenParser Char st [Char]
+identifier = do
+ first <- letter
+ rest <- many alphaNum
+ return (first:rest)
+
+identifierAttr :: GenParser Char st ([Char], [a], [a1])
+identifierAttr = try $ do
+ char '#'
+ result <- identifier
+ return (result,[],[])
+
+classAttr :: GenParser Char st ([Char], [[Char]], [a])
+classAttr = try $ do
+ char '.'
+ result <- identifier
+ return ("",[result],[])
+
+keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])])
+keyValAttr = try $ do
+ key <- identifier
+ char '='
+ char '"'
+ val <- manyTill (noneOf "\n") (char '"')
+ return ("",[],[(key,val)])
+
+codeBlockDelimited :: GenParser Char st Block
+codeBlockDelimited = try $ do
+ (size, attr) <- codeBlockDelimiter Nothing
+ contents <- manyTill anyLine (codeBlockDelimiter (Just size))
+ blanklines
+ return $ CodeBlock attr $ intercalate "\n" contents
+
+codeBlockIndented :: GenParser Char ParserState Block
+codeBlockIndented = do
+ contents <- many1 (indentedLine <|>
+ try (do b <- blanklines
+ l <- indentedLine
+ return $ b ++ l))
+ optional blanklines
+ return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents
+
+lhsCodeBlock :: GenParser Char ParserState Block
+lhsCodeBlock = do
+ failUnlessLHS
+ contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX
+ return $ CodeBlock ("",["sourceCode","haskell"],[]) contents
+
+lhsCodeBlockLaTeX :: GenParser Char ParserState String
+lhsCodeBlockLaTeX = try $ do
+ string "\\begin{code}"
+ manyTill spaceChar newline
+ contents <- many1Till anyChar (try $ string "\\end{code}")
+ blanklines
+ return $ stripTrailingNewlines contents
+
+lhsCodeBlockBird :: GenParser Char ParserState String
+lhsCodeBlockBird = try $ do
+ pos <- getPosition
+ when (sourceColumn pos /= 1) $ fail "Not in first column"
+ lns <- many1 birdTrackLine
+ -- if (as is normal) there is always a space after >, drop it
+ let lns' = if all (\ln -> null ln || take 1 ln == " ") lns
+ then map (drop 1) lns
+ else lns
+ blanklines
+ return $ intercalate "\n" lns'
+
+birdTrackLine :: GenParser Char st [Char]
+birdTrackLine = do
+ char '>'
+ manyTill anyChar newline
+
+
+--
+-- block quotes
+--
+
+emailBlockQuoteStart :: GenParser Char ParserState Char
+emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ')
+
+emailBlockQuote :: GenParser Char ParserState [[Char]]
+emailBlockQuote = try $ do
+ emailBlockQuoteStart
+ raw <- sepBy (many (nonEndline <|>
+ (try (endline >> notFollowedBy emailBlockQuoteStart >>
+ return '\n'))))
+ (try (newline >> emailBlockQuoteStart))
+ newline <|> (eof >> return '\n')
+ optional blanklines
+ return raw
+
+blockQuote :: GenParser Char ParserState Block
+blockQuote = do
+ raw <- emailBlockQuote
+ -- parse the extracted block, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n"
+ return $ BlockQuote contents
+
+--
+-- list blocks
+--
+
+bulletListStart :: GenParser Char ParserState ()
+bulletListStart = try $ do
+ optional newline -- if preceded by a Plain block in a list context
+ nonindentSpaces
+ notFollowedBy' hrule -- because hrules start out just like lists
+ oneOf bulletListMarkers
+ spaceChar
+ skipSpaces
+
+anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim)
+anyOrderedListStart = try $ do
+ optional newline -- if preceded by a Plain block in a list context
+ nonindentSpaces
+ notFollowedBy $ string "p." >> spaceChar >> digit -- page number
+ state <- getState
+ if stateStrict state
+ then do many1 digit
+ char '.'
+ spaceChar
+ return (1, DefaultStyle, DefaultDelim)
+ else do (num, style, delim) <- anyOrderedListMarker
+ -- if it could be an abbreviated first name, insist on more than one space
+ if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
+ num `elem` [1, 5, 10, 50, 100, 500, 1000]))
+ then char '\t' <|> (char ' ' >>~ notFollowedBy (satisfy isUpper))
+ else spaceChar
+ skipSpaces
+ return (num, style, delim)
+
+listStart :: GenParser Char ParserState ()
+listStart = bulletListStart <|> (anyOrderedListStart >> return ())
+
+-- parse a line of a list item (start = parser for beginning of list item)
+listLine :: GenParser Char ParserState [Char]
+listLine = try $ do
+ notFollowedBy' listStart
+ notFollowedBy blankline
+ notFollowedBy' (do indentSpaces
+ many (spaceChar)
+ listStart)
+ line <- manyTill anyChar newline
+ return $ line ++ "\n"
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem :: GenParser Char ParserState [Char]
+rawListItem = try $ do
+ listStart
+ result <- many1 listLine
+ blanks <- many blankline
+ return $ concat result ++ blanks
+
+-- continuation of a list item - indented and separated by blankline
+-- or (in compact lists) endline.
+-- note: nested lists are parsed as continuations
+listContinuation :: GenParser Char ParserState [Char]
+listContinuation = try $ do
+ lookAhead indentSpaces
+ result <- many1 listContinuationLine
+ blanks <- many blankline
+ return $ concat result ++ blanks
+
+listContinuationLine :: GenParser Char ParserState [Char]
+listContinuationLine = try $ do
+ notFollowedBy blankline
+ notFollowedBy' listStart
+ optional indentSpaces
+ result <- manyTill anyChar newline
+ return $ result ++ "\n"
+
+listItem :: GenParser Char ParserState [Block]
+listItem = try $ do
+ first <- rawListItem
+ continuations <- many listContinuation
+ -- parsing with ListItemState forces markers at beginning of lines to
+ -- count as list item markers, even if not separated by blank space.
+ -- see definition of "endline"
+ state <- getState
+ let oldContext = stateParserContext state
+ setState $ state {stateParserContext = ListItemState}
+ -- parse the extracted block, which may contain various block elements:
+ let raw = concat (first:continuations)
+ contents <- parseFromString parseBlocks raw
+ updateState (\st -> st {stateParserContext = oldContext})
+ return contents
+
+orderedList :: GenParser Char ParserState Block
+orderedList = try $ do
+ (start, style, delim) <- lookAhead anyOrderedListStart
+ items <- many1 listItem
+ return $ OrderedList (start, style, delim) $ compactify items
+
+bulletList :: GenParser Char ParserState Block
+bulletList = try $ do
+ lookAhead bulletListStart
+ many1 listItem >>= return . BulletList . compactify
+
+-- definition lists
+
+definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+definitionListItem = try $ do
+ notFollowedBy blankline
+ notFollowedBy' indentSpaces
+ -- first, see if this has any chance of being a definition list:
+ lookAhead (anyLine >> char ':')
+ term <- manyTill inline newline
+ raw <- many1 defRawBlock
+ state <- getState
+ let oldContext = stateParserContext state
+ -- parse the extracted block, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ concat raw
+ updateState (\st -> st {stateParserContext = oldContext})
+ return ((normalizeSpaces term), contents)
+
+defRawBlock :: GenParser Char ParserState [Char]
+defRawBlock = try $ do
+ char ':'
+ state <- getState
+ let tabStop = stateTabStop state
+ try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t")
+ firstline <- anyLine
+ rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
+ trailing <- option "" blanklines
+ return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
+
+definitionList :: GenParser Char ParserState Block
+definitionList = do
+ items <- many1 definitionListItem
+ let (terms, defs) = unzip items
+ let defs' = compactify defs
+ let items' = zip terms defs'
+ return $ DefinitionList items'
+
+--
+-- paragraph block
+--
+
+isHtmlOrBlank :: Inline -> Bool
+isHtmlOrBlank (HtmlInline _) = True
+isHtmlOrBlank (Space) = True
+isHtmlOrBlank (LineBreak) = True
+isHtmlOrBlank _ = False
+
+para :: GenParser Char ParserState Block
+para = try $ do
+ result <- many1 inline
+ if all isHtmlOrBlank result
+ then fail "treat as raw HTML"
+ else return ()
+ newline
+ blanklines <|> do st <- getState
+ if stateStrict st
+ then lookAhead (blockQuote <|> header) >> return ""
+ else pzero
+ return $ Para $ normalizeSpaces result
+
+plain :: GenParser Char ParserState Block
+plain = many1 inline >>= return . Plain . normalizeSpaces
+
+--
+-- raw html
+--
+
+htmlElement :: GenParser Char ParserState [Char]
+htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
+
+htmlBlock :: GenParser Char ParserState Block
+htmlBlock = try $ do
+ failUnlessBeginningOfLine
+ first <- htmlElement
+ finalSpace <- many (oneOf spaceChars)
+ finalNewlines <- many newline
+ return $ RawHtml $ first ++ finalSpace ++ finalNewlines
+
+-- True if tag is self-closing
+isSelfClosing :: [Char] -> Bool
+isSelfClosing tag =
+ isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
+
+strictHtmlBlock :: GenParser Char ParserState [Char]
+strictHtmlBlock = try $ do
+ tag <- anyHtmlBlockTag
+ let tag' = extractTagType tag
+ if isSelfClosing tag || tag' == "hr"
+ then return tag
+ else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
+ (htmlElement <|> (count 1 anyChar)))
+ end <- htmlEndTag tag'
+ return $ tag ++ concat contents ++ end
+
+rawHtmlBlocks :: GenParser Char ParserState Block
+rawHtmlBlocks = do
+ htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock
+ sps <- do sp1 <- many spaceChar
+ sp2 <- option "" (blankline >> return "\n")
+ sp3 <- many spaceChar
+ sp4 <- option "" blanklines
+ return $ sp1 ++ sp2 ++ sp3 ++ sp4
+ -- note: we want raw html to be able to
+ -- precede a code block, when separated
+ -- by a blank line
+ return $ blk ++ sps
+ let combined = concat htmlBlocks
+ let combined' = if last combined == '\n' then init combined else combined
+ return $ RawHtml combined'
+
+--
+-- Tables
+--
+
+-- Parse a dashed line with optional trailing spaces; return its length
+-- and the length including trailing space.
+dashedLine :: Char
+ -> GenParser Char st (Int, Int)
+dashedLine ch = do
+ dashes <- many1 (char ch)
+ sp <- many spaceChar
+ return $ (length dashes, length $ dashes ++ sp)
+
+-- Parse a table header with dashed lines of '-' preceded by
+-- one line of text.
+simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int])
+simpleTableHeader = try $ do
+ rawContent <- anyLine
+ initSp <- nonindentSpaces
+ dashes <- many1 (dashedLine '-')
+ newline
+ let (lengths, lines') = unzip dashes
+ let indices = scanl (+) (length initSp) lines'
+ let rawHeads = tail $ splitByIndices (init indices) rawContent
+ let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
+ return (rawHeads, aligns, indices)
+
+-- Parse a table footer - dashed lines followed by blank line.
+tableFooter :: GenParser Char ParserState [Char]
+tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines
+
+-- Parse a table separator - dashed line.
+tableSep :: GenParser Char ParserState String
+tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n"
+
+-- Parse a raw line and split it into chunks by indices.
+rawTableLine :: [Int]
+ -> GenParser Char ParserState [String]
+rawTableLine indices = do
+ notFollowedBy' (blanklines <|> tableFooter)
+ line <- many1Till anyChar newline
+ return $ map removeLeadingTrailingSpace $ tail $
+ splitByIndices (init indices) line
+
+-- Parse a table line and return a list of lists of blocks (columns).
+tableLine :: [Int]
+ -> GenParser Char ParserState [[Block]]
+tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
+
+-- Parse a multiline table row and return a list of blocks (columns).
+multilineRow :: [Int]
+ -> GenParser Char ParserState [[Block]]
+multilineRow indices = do
+ colLines <- many1 (rawTableLine indices)
+ optional blanklines
+ let cols = map unlines $ transpose colLines
+ mapM (parseFromString (many plain)) cols
+
+-- Calculate relative widths of table columns, based on indices
+widthsFromIndices :: Int -- Number of columns on terminal
+ -> [Int] -- Indices
+ -> [Double] -- Fractional relative sizes of columns
+widthsFromIndices _ [] = []
+widthsFromIndices numColumns indices =
+ let lengths = zipWith (-) indices (0:indices)
+ totLength = sum lengths
+ quotient = if totLength > numColumns
+ then fromIntegral totLength
+ else fromIntegral numColumns
+ fracs = map (\l -> (fromIntegral l) / quotient) lengths in
+ tail fracs
+
+-- Parses a table caption: inlines beginning with 'Table:'
+-- and followed by blank lines.
+tableCaption :: GenParser Char ParserState [Inline]
+tableCaption = try $ do
+ nonindentSpaces
+ string "Table:"
+ result <- many1 inline
+ blanklines
+ return $ normalizeSpaces result
+
+-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'.
+tableWith :: GenParser Char ParserState ([[Char]], [Alignment], [Int])
+ -> ([Int] -> GenParser Char ParserState [[Block]])
+ -> GenParser Char ParserState end
+ -> GenParser Char ParserState Block
+tableWith headerParser lineParser footerParser = try $ do
+ (rawHeads, aligns, indices) <- headerParser
+ lines' <- many1Till (lineParser indices) footerParser
+ caption <- option [] tableCaption
+ heads <- mapM (parseFromString (many plain)) rawHeads
+ state <- getState
+ let numColumns = stateColumns state
+ let widths = widthsFromIndices numColumns indices
+ return $ Table caption aligns widths heads lines'
+
+-- Parse a simple table with '---' header and one line per row.
+simpleTable :: GenParser Char ParserState Block
+simpleTable = tableWith simpleTableHeader tableLine blanklines
+
+-- Parse a multiline table: starts with row of '-' on top, then header
+-- (which may be multiline), then the rows,
+-- which may be multiline, separated by blank lines, and
+-- ending with a footer (dashed line followed by blank line).
+multilineTable :: GenParser Char ParserState Block
+multilineTable = tableWith multilineTableHeader multilineRow tableFooter
+
+multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int])
+multilineTableHeader = try $ do
+ tableSep
+ rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
+ initSp <- nonindentSpaces
+ dashes <- many1 (dashedLine '-')
+ newline
+ let (lengths, lines') = unzip dashes
+ let indices = scanl (+) (length initSp) lines'
+ let rawHeadsList = transpose $ map
+ (\ln -> tail $ splitByIndices (init indices) ln)
+ rawContent
+ let rawHeads = map (intercalate " ") rawHeadsList
+ let aligns = zipWith alignType rawHeadsList lengths
+ return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
+
+-- Returns an alignment type for a table, based on a list of strings
+-- (the rows of the column header) and a number (the length of the
+-- dashed line under the rows.
+alignType :: [String]
+ -> Int
+ -> Alignment
+alignType [] _ = AlignDefault
+alignType strLst len =
+ let s = head $ sortBy (comparing length) $
+ map removeTrailingSpace strLst
+ leftSpace = if null s then False else (s !! 0) `elem` " \t"
+ rightSpace = length s < len || (s !! (len - 1)) `elem` " \t"
+ in case (leftSpace, rightSpace) of
+ (True, False) -> AlignRight
+ (False, True) -> AlignLeft
+ (True, True) -> AlignCenter
+ (False, False) -> AlignDefault
+
+table :: GenParser Char ParserState Block
+table = simpleTable <|> multilineTable <?> "table"
+
+--
+-- inline
+--
+
+inline :: GenParser Char ParserState Inline
+inline = choice inlineParsers <?> "inline"
+
+inlineParsers :: [GenParser Char ParserState Inline]
+inlineParsers = [ abbrev
+ , str
+ , smartPunctuation
+ , whitespace
+ , endline
+ , code
+ , charRef
+ , strong
+ , emph
+ , note
+ , inlineNote
+ , link
+#ifdef _CITEPROC
+ , inlineCitation
+#endif
+ , image
+ , math
+ , strikeout
+ , superscript
+ , subscript
+ , autoLink
+ , rawHtmlInline'
+ , rawLaTeXInline'
+ , escapedChar
+ , symbol
+ , ltSign ]
+
+inlineNonLink :: GenParser Char ParserState Inline
+inlineNonLink = (choice $
+ map (\parser -> try (parser >>= failIfLink)) inlineParsers)
+ <?> "inline (non-link)"
+
+failIfLink :: Inline -> GenParser tok st Inline
+failIfLink (Link _ _) = pzero
+failIfLink elt = return elt
+
+escapedChar :: GenParser Char ParserState Inline
+escapedChar = do
+ char '\\'
+ state <- getState
+ result <- option '\\' $ if stateStrict state
+ then oneOf "\\`*_{}[]()>#+-.!~"
+ else satisfy (not . isAlphaNum)
+ let result' = if result == ' '
+ then '\160' -- '\ ' is a nonbreaking space
+ else result
+ return $ Str [result']
+
+ltSign :: GenParser Char ParserState Inline
+ltSign = do
+ st <- getState
+ if stateStrict st
+ then char '<'
+ else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html
+ return $ Str ['<']
+
+specialCharsMinusLt :: [Char]
+specialCharsMinusLt = filter (/= '<') specialChars
+
+symbol :: GenParser Char ParserState Inline
+symbol = do
+ result <- oneOf specialCharsMinusLt
+ return $ Str [result]
+
+-- parses inline code, between n `s and n `s
+code :: GenParser Char ParserState Inline
+code = try $ do
+ starts <- many1 (char '`')
+ skipSpaces
+ result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|>
+ (char '\n' >> return " "))
+ (try (skipSpaces >> count (length starts) (char '`') >>
+ notFollowedBy (char '`')))
+ return $ Code $ removeLeadingTrailingSpace $ concat result
+
+mathWord :: GenParser Char st [Char]
+mathWord = many1 ((noneOf " \t\n\\$") <|>
+ (try (char '\\') >>~ notFollowedBy (char '$')))
+
+math :: GenParser Char ParserState Inline
+math = (mathDisplay >>= return . Math DisplayMath)
+ <|> (mathInline >>= return . Math InlineMath)
+
+mathDisplay :: GenParser Char ParserState String
+mathDisplay = try $ do
+ failIfStrict
+ string "$$"
+ many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$")
+
+mathInline :: GenParser Char ParserState String
+mathInline = try $ do
+ failIfStrict
+ char '$'
+ notFollowedBy space
+ words' <- sepBy1 mathWord (many1 (spaceChar <|> (newline >>~ notFollowedBy' blankline)))
+ char '$'
+ notFollowedBy digit
+ return $ intercalate " " words'
+
+emph :: GenParser Char ParserState Inline
+emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|>
+ (enclosed (char '_') (notFollowedBy' strong >> char '_' >>
+ notFollowedBy alphaNum) inline)) >>=
+ return . Emph . normalizeSpaces
+
+strong :: GenParser Char ParserState Inline
+strong = ((enclosed (string "**") (try $ string "**") inline) <|>
+ (enclosed (string "__") (try $ string "__") inline)) >>=
+ return . Strong . normalizeSpaces
+
+strikeout :: GenParser Char ParserState Inline
+strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
+ return . Strikeout . normalizeSpaces
+
+superscript :: GenParser Char ParserState Inline
+superscript = failIfStrict >> enclosed (char '^') (char '^')
+ (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
+ return . Superscript
+
+subscript :: GenParser Char ParserState Inline
+subscript = failIfStrict >> enclosed (char '~') (char '~')
+ (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
+ return . Subscript
+
+abbrev :: GenParser Char ParserState Inline
+abbrev = failUnlessSmart >>
+ (assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160")
+
+-- an string of letters followed by a period that does not end a sentence
+-- is assumed to be an abbreviation. It is assumed that sentences don't
+-- start with lowercase letters or numerals.
+assumedAbbrev :: GenParser Char ParserState [Char]
+assumedAbbrev = try $ do
+ result <- many1 $ satisfy isAlpha
+ string ". "
+ lookAhead $ satisfy (\x -> isLower x || isDigit x)
+ return result
+
+-- these strings are treated as abbreviations even if they are followed
+-- by a capital letter (such as a name).
+knownAbbrev :: GenParser Char ParserState [Char]
+knownAbbrev = try $ do
+ result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen",
+ "Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs",
+ "Sen", "Rep", "Pres", "Hon", "Rev" ]
+ string ". "
+ return result
+
+smartPunctuation :: GenParser Char ParserState Inline
+smartPunctuation = failUnlessSmart >>
+ choice [ quoted, apostrophe, dash, ellipses ]
+
+apostrophe :: GenParser Char ParserState Inline
+apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe
+
+quoted :: GenParser Char ParserState Inline
+quoted = doubleQuoted <|> singleQuoted
+
+withQuoteContext :: QuoteContext
+ -> (GenParser Char ParserState Inline)
+ -> GenParser Char ParserState Inline
+withQuoteContext context parser = do
+ oldState <- getState
+ let oldQuoteContext = stateQuoteContext oldState
+ setState oldState { stateQuoteContext = context }
+ result <- parser
+ newState <- getState
+ setState newState { stateQuoteContext = oldQuoteContext }
+ return result
+
+singleQuoted :: GenParser Char ParserState Inline
+singleQuoted = try $ do
+ singleQuoteStart
+ withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>=
+ return . Quoted SingleQuote . normalizeSpaces
+
+doubleQuoted :: GenParser Char ParserState Inline
+doubleQuoted = try $ do
+ doubleQuoteStart
+ withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>=
+ return . Quoted DoubleQuote . normalizeSpaces
+
+failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState ()
+failIfInQuoteContext context = do
+ st <- getState
+ if stateQuoteContext st == context
+ then fail "already inside quotes"
+ else return ()
+
+singleQuoteStart :: GenParser Char ParserState Char
+singleQuoteStart = do
+ failIfInQuoteContext InSingleQuote
+ char '\8216' <|>
+ (try $ do char '\''
+ notFollowedBy (oneOf ")!],.;:-? \t\n")
+ notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
+ satisfy (not . isAlphaNum)))
+ -- possess/contraction
+ return '\'')
+
+singleQuoteEnd :: GenParser Char st Char
+singleQuoteEnd = try $ do
+ char '\8217' <|> char '\''
+ notFollowedBy alphaNum
+ return '\''
+
+doubleQuoteStart :: GenParser Char ParserState Char
+doubleQuoteStart = do
+ failIfInQuoteContext InDoubleQuote
+ char '\8220' <|>
+ (try $ do char '"'
+ notFollowedBy (oneOf " \t\n")
+ return '"')
+
+doubleQuoteEnd :: GenParser Char st Char
+doubleQuoteEnd = char '\8221' <|> char '"'
+
+ellipses :: GenParser Char st Inline
+ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses
+
+dash :: GenParser Char st Inline
+dash = enDash <|> emDash
+
+enDash :: GenParser Char st Inline
+enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash
+
+emDash :: GenParser Char st Inline
+emDash = oneOfStrings ["---", "--"] >> return EmDash
+
+whitespace :: GenParser Char ParserState Inline
+whitespace = do
+ sps <- many1 (oneOf spaceChars)
+ if length sps >= 2
+ then option Space (endline >> return LineBreak)
+ else return Space <?> "whitespace"
+
+nonEndline :: GenParser Char st Char
+nonEndline = satisfy (/='\n')
+
+strChar :: GenParser Char st Char
+strChar = noneOf (specialChars ++ spaceChars ++ "\n")
+
+str :: GenParser Char st Inline
+str = many1 strChar >>= return . Str
+
+-- an endline character that can be treated as a space, not a structural break
+endline :: GenParser Char ParserState Inline
+endline = try $ do
+ newline
+ notFollowedBy blankline
+ st <- getState
+ if stateStrict st
+ then do notFollowedBy emailBlockQuoteStart
+ notFollowedBy (char '#') -- atx header
+ else return ()
+ -- parse potential list-starts differently if in a list:
+ if stateParserContext st == ListItemState
+ then notFollowedBy' (bulletListStart <|>
+ (anyOrderedListStart >> return ()))
+ else return ()
+ return Space
+
+--
+-- links
+--
+
+-- a reference label for a link
+reference :: GenParser Char ParserState [Inline]
+reference = do notFollowedBy' (string "[^") -- footnote reference
+ result <- inlinesInBalancedBrackets inlineNonLink
+ return $ normalizeSpaces result
+
+-- source for a link, with optional title
+source :: GenParser Char st (String, [Char])
+source =
+ (try $ charsInBalanced '(' ')' >>= parseFromString source') <|>
+ -- the following is needed for cases like: [ref](/url(a).
+ (enclosed (char '(') (char ')') anyChar >>=
+ parseFromString source')
+
+-- auxiliary function for source
+source' :: GenParser Char st (String, [Char])
+source' = do
+ skipSpaces
+ let sourceURL excludes = many $
+ optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' '))
+ src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+ tit <- option "" linkTitle
+ skipSpaces
+ eof
+ return (intercalate "%20" $ words $ removeTrailingSpace src, tit)
+
+linkTitle :: GenParser Char st String
+linkTitle = try $ do
+ (many1 spaceChar >> option '\n' newline) <|> newline
+ skipSpaces
+ delim <- oneOf "'\""
+ tit <- manyTill (optional (char '\\') >> anyChar)
+ (try (char delim >> skipSpaces >> eof))
+ return $ decodeCharacterReferences tit
+
+link :: GenParser Char ParserState Inline
+link = try $ do
+ lab <- reference
+ src <- source <|> referenceLink lab
+ sanitize <- getState >>= return . stateSanitizeHTML
+ if sanitize && unsanitaryURI (fst src)
+ then fail "Unsanitary URI"
+ else return $ Link lab src
+
+-- a link like [this][ref] or [this][] or [this]
+referenceLink :: [Inline]
+ -> GenParser Char ParserState (String, [Char])
+referenceLink lab = do
+ ref <- option [] (try (optional (char ' ') >>
+ optional (newline >> skipSpaces) >> reference))
+ let ref' = if null ref then lab else ref
+ state <- getState
+ case lookupKeySrc (stateKeys state) ref' of
+ Nothing -> fail "no corresponding key"
+ Just target -> return target
+
+autoLink :: GenParser Char ParserState Inline
+autoLink = try $ do
+ char '<'
+ src <- uri <|> (emailAddress >>= (return . ("mailto:" ++)))
+ char '>'
+ let src' = if "mailto:" `isPrefixOf` src
+ then drop 7 src
+ else src
+ st <- getState
+ let sanitize = stateSanitizeHTML st
+ if sanitize && unsanitaryURI src
+ then fail "Unsanitary URI"
+ else return $ if stateStrict st
+ then Link [Str src'] (src, "")
+ else Link [Code src'] (src, "")
+
+image :: GenParser Char ParserState Inline
+image = try $ do
+ char '!'
+ (Link lab src) <- link
+ return $ Image lab src
+
+note :: GenParser Char ParserState Inline
+note = try $ do
+ failIfStrict
+ ref <- noteMarker
+ state <- getState
+ let notes = stateNotes state
+ case lookup ref notes of
+ Nothing -> fail "note not found"
+ Just contents -> return $ Note contents
+
+inlineNote :: GenParser Char ParserState Inline
+inlineNote = try $ do
+ failIfStrict
+ char '^'
+ contents <- inlinesInBalancedBrackets inline
+ return $ Note [Para contents]
+
+rawLaTeXInline' :: GenParser Char ParserState Inline
+rawLaTeXInline' = do
+ failIfStrict
+ (rawConTeXtEnvironment' >>= return . TeX)
+ <|> (rawLaTeXEnvironment' >>= return . TeX)
+ <|> rawLaTeXInline
+
+rawConTeXtEnvironment' :: GenParser Char st String
+rawConTeXtEnvironment' = try $ do
+ string "\\start"
+ completion <- inBrackets (letter <|> digit <|> spaceChar)
+ <|> (many1 letter)
+ contents <- manyTill (rawConTeXtEnvironment' <|> (count 1 anyChar))
+ (try $ string "\\stop" >> string completion)
+ return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion
+
+inBrackets :: (GenParser Char st Char) -> GenParser Char st String
+inBrackets parser = do
+ char '['
+ contents <- many parser
+ char ']'
+ return $ "[" ++ contents ++ "]"
+
+rawHtmlInline' :: GenParser Char ParserState Inline
+rawHtmlInline' = do
+ st <- getState
+ result <- if stateStrict st
+ then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
+ else anyHtmlInlineTag
+ return $ HtmlInline result
+
+#ifdef _CITEPROC
+inlineCitation :: GenParser Char ParserState Inline
+inlineCitation = try $ do
+ failIfStrict
+ cit <- citeMarker
+ let citations = readWith parseCitation defaultParserState cit
+ mr <- mapM chkCit citations
+ if catMaybes mr /= []
+ then return $ Cite citations []
+ else fail "no citation found"
+
+chkCit :: Target -> GenParser Char ParserState (Maybe Target)
+chkCit t = do
+ st <- getState
+ case lookupKeySrc (stateKeys st) [Str $ fst t] of
+ Just _ -> fail "This is a link"
+ Nothing -> if elem (fst t) $ stateCitations st
+ then return $ Just t
+ else return $ Nothing
+
+citeMarker :: GenParser Char ParserState String
+citeMarker = char '[' >> manyTill ( noneOf "\n" <|> (newline >>~ notFollowedBy blankline) ) (char ']')
+
+parseCitation :: GenParser Char ParserState [(String,String)]
+parseCitation = try $ sepBy (parseLabel) (oneOf ";")
+
+parseLabel :: GenParser Char ParserState (String,String)
+parseLabel = try $ do
+ res <- sepBy (skipSpaces >> optional newline >> skipSpaces >> many1 (noneOf "@;")) (oneOf "@")
+ case res of
+ [lab,loc] -> return (lab, loc)
+ [lab] -> return (lab, "" )
+ _ -> return ("" , "" )
+
+#endif
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
new file mode 100644
index 000000000..255054c10
--- /dev/null
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -0,0 +1,707 @@
+{-
+Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.RST
+ Copyright : Copyright (C) 2006-8 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion from reStructuredText to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.RST (
+ readRST
+ ) where
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Text.ParserCombinators.Parsec
+import Control.Monad ( when )
+import Data.List ( findIndex, delete, intercalate )
+
+-- | Parse reStructuredText string and return Pandoc document.
+readRST :: ParserState -> String -> Pandoc
+readRST state s = (readWith parseRST) state (s ++ "\n\n")
+
+--
+-- Constants and data structure definitions
+---
+
+bulletListMarkers :: [Char]
+bulletListMarkers = "*+-"
+
+underlineChars :: [Char]
+underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~"
+
+-- treat these as potentially non-text when parsing inline:
+specialChars :: [Char]
+specialChars = "\\`|*_<>$:[-"
+
+--
+-- parsing documents
+--
+
+isHeader :: Int -> Block -> Bool
+isHeader n (Header x _) = x == n
+isHeader _ _ = False
+
+-- | Promote all headers in a list of blocks. (Part of
+-- title transformation for RST.)
+promoteHeaders :: Int -> [Block] -> [Block]
+promoteHeaders num ((Header level text):rest) =
+ (Header (level - num) text):(promoteHeaders num rest)
+promoteHeaders num (other:rest) = other:(promoteHeaders num rest)
+promoteHeaders _ [] = []
+
+-- | If list of blocks starts with a header (or a header and subheader)
+-- of level that are not found elsewhere, return it as a title and
+-- promote all the other headers.
+titleTransform :: [Block] -- ^ list of blocks
+ -> ([Block], [Inline]) -- ^ modified list of blocks, title
+titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title subtitle
+ if (any (isHeader 1) rest) || (any (isHeader 2) rest)
+ then ((Header 1 head1):(Header 2 head2):rest, [])
+ else ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2)
+titleTransform ((Header 1 head1):rest) = -- title, no subtitle
+ if (any (isHeader 1) rest)
+ then ((Header 1 head1):rest, [])
+ else ((promoteHeaders 1 rest), head1)
+titleTransform blocks = (blocks, [])
+
+parseRST :: GenParser Char ParserState Pandoc
+parseRST = do
+ startPos <- getPosition
+ -- go through once just to get list of reference keys
+ -- docMinusKeys is the raw document with blanks where the keys were...
+ docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat
+ setInput docMinusKeys
+ setPosition startPos
+ st <- getState
+ let reversedKeys = stateKeys st
+ updateState $ \s -> s { stateKeys = reverse reversedKeys }
+ -- now parse it for real...
+ blocks <- parseBlocks
+ let blocks' = filter (/= Null) blocks
+ state <- getState
+ let (blocks'', title) = if stateStandalone state
+ then titleTransform blocks'
+ else (blocks', [])
+ let authors = stateAuthors state
+ let date = stateDate state
+ let title' = if (null title) then (stateTitle state) else title
+ return $ Pandoc (Meta title' authors date) blocks''
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks = manyTill block eof
+
+block :: GenParser Char ParserState Block
+block = choice [ codeBlock
+ , rawHtmlBlock
+ , rawLaTeXBlock
+ , fieldList
+ , blockQuote
+ , imageBlock
+ , unknownDirective
+ , header
+ , hrule
+ , list
+ , lineBlock
+ , lhsCodeBlock
+ , para
+ , plain
+ , nullBlock ] <?> "block"
+
+--
+-- field list
+--
+
+fieldListItem :: String -> GenParser Char st ([Char], [Char])
+fieldListItem indent = try $ do
+ string indent
+ char ':'
+ name <- many1 alphaNum
+ string ": "
+ skipSpaces
+ first <- manyTill anyChar newline
+ rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >>
+ indentedBlock
+ return (name, intercalate " " (first:(lines rest)))
+
+fieldList :: GenParser Char ParserState Block
+fieldList = try $ do
+ indent <- lookAhead $ many (oneOf " \t")
+ items <- many1 $ fieldListItem indent
+ blanklines
+ let authors = case lookup "Authors" items of
+ Just auth -> [auth]
+ Nothing -> map snd (filter (\(x,_) -> x == "Author") items)
+ if null authors
+ then return ()
+ else updateState $ \st -> st {stateAuthors = authors}
+ case (lookup "Date" items) of
+ Just dat -> updateState $ \st -> st {stateDate = dat}
+ Nothing -> return ()
+ case (lookup "Title" items) of
+ Just tit -> parseFromString (many inline) tit >>=
+ \t -> updateState $ \st -> st {stateTitle = t}
+ Nothing -> return ()
+ let remaining = filter (\(x,_) -> (x /= "Authors") && (x /= "Author") &&
+ (x /= "Date") && (x /= "Title")) items
+ if null remaining
+ then return Null
+ else do terms <- mapM (return . (:[]) . Str . fst) remaining
+ defs <- mapM (parseFromString (many block) . snd)
+ remaining
+ return $ DefinitionList $ zip terms defs
+
+--
+-- line block
+--
+
+lineBlockLine :: GenParser Char ParserState [Inline]
+lineBlockLine = try $ do
+ string "| "
+ white <- many (oneOf " \t")
+ line <- manyTill inline newline
+ return $ (if null white then [] else [Str white]) ++ line ++ [LineBreak]
+
+lineBlock :: GenParser Char ParserState Block
+lineBlock = try $ do
+ lines' <- many1 lineBlockLine
+ blanklines
+ return $ Para (concat lines')
+
+--
+-- paragraph block
+--
+
+para :: GenParser Char ParserState Block
+para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph"
+
+codeBlockStart :: GenParser Char st Char
+codeBlockStart = string "::" >> blankline >> blankline
+
+-- paragraph that ends in a :: starting a code block
+paraBeforeCodeBlock :: GenParser Char ParserState Block
+paraBeforeCodeBlock = try $ do
+ result <- many1 (notFollowedBy' codeBlockStart >> inline)
+ lookAhead (string "::")
+ return $ Para $ if last result == Space
+ then normalizeSpaces result
+ else (normalizeSpaces result) ++ [Str ":"]
+
+-- regular paragraph
+paraNormal :: GenParser Char ParserState Block
+paraNormal = try $ do
+ result <- many1 inline
+ newline
+ blanklines
+ return $ Para $ normalizeSpaces result
+
+plain :: GenParser Char ParserState Block
+plain = many1 inline >>= return . Plain . normalizeSpaces
+
+--
+-- image block
+--
+
+imageBlock :: GenParser Char st Block
+imageBlock = try $ do
+ string ".. image:: "
+ src <- manyTill anyChar newline
+ fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t")
+ many1 $ fieldListItem indent
+ optional blanklines
+ case lookup "alt" fields of
+ Just alt -> return $ Plain [Image [Str alt] (src, alt)]
+ Nothing -> return $ Plain [Image [Str "image"] (src, "")]
+--
+-- header blocks
+--
+
+header :: GenParser Char ParserState Block
+header = doubleHeader <|> singleHeader <?> "header"
+
+-- a header with lines on top and bottom
+doubleHeader :: GenParser Char ParserState Block
+doubleHeader = try $ do
+ c <- oneOf underlineChars
+ rest <- many (char c) -- the top line
+ let lenTop = length (c:rest)
+ skipSpaces
+ newline
+ txt <- many1 (notFollowedBy blankline >> inline)
+ pos <- getPosition
+ let len = (sourceColumn pos) - 1
+ if (len > lenTop) then fail "title longer than border" else return ()
+ blankline -- spaces and newline
+ count lenTop (char c) -- the bottom line
+ blanklines
+ -- check to see if we've had this kind of header before.
+ -- if so, get appropriate level. if not, add to list.
+ state <- getState
+ let headerTable = stateHeaderTable state
+ let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of
+ Just ind -> (headerTable, ind + 1)
+ Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1)
+ setState (state { stateHeaderTable = headerTable' })
+ return $ Header level (normalizeSpaces txt)
+
+-- a header with line on the bottom only
+singleHeader :: GenParser Char ParserState Block
+singleHeader = try $ do
+ notFollowedBy' whitespace
+ txt <- many1 (do {notFollowedBy blankline; inline})
+ pos <- getPosition
+ let len = (sourceColumn pos) - 1
+ blankline
+ c <- oneOf underlineChars
+ count (len - 1) (char c)
+ many (char c)
+ blanklines
+ state <- getState
+ let headerTable = stateHeaderTable state
+ let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of
+ Just ind -> (headerTable, ind + 1)
+ Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1)
+ setState (state { stateHeaderTable = headerTable' })
+ return $ Header level (normalizeSpaces txt)
+
+--
+-- hrule block
+--
+
+hrule :: GenParser Char st Block
+hrule = try $ do
+ chr <- oneOf underlineChars
+ count 3 (char chr)
+ skipMany (char chr)
+ blankline
+ blanklines
+ return HorizontalRule
+
+--
+-- code blocks
+--
+
+-- read a line indented by a given string
+indentedLine :: String -> GenParser Char st [Char]
+indentedLine indents = try $ do
+ string indents
+ result <- manyTill anyChar newline
+ return $ result ++ "\n"
+
+-- two or more indented lines, possibly separated by blank lines.
+-- any amount of indentation will work.
+indentedBlock :: GenParser Char st [Char]
+indentedBlock = do
+ indents <- lookAhead $ many1 (oneOf " \t")
+ lns <- many $ choice $ [ indentedLine indents,
+ try $ do b <- blanklines
+ l <- indentedLine indents
+ return (b ++ l) ]
+ optional blanklines
+ return $ concat lns
+
+codeBlock :: GenParser Char st Block
+codeBlock = try $ do
+ codeBlockStart
+ result <- indentedBlock
+ return $ CodeBlock ("",[],[]) $ stripTrailingNewlines result
+
+lhsCodeBlock :: GenParser Char ParserState Block
+lhsCodeBlock = try $ do
+ failUnlessLHS
+ pos <- getPosition
+ when (sourceColumn pos /= 1) $ fail "Not in first column"
+ lns <- many1 birdTrackLine
+ -- if (as is normal) there is always a space after >, drop it
+ let lns' = if all (\ln -> null ln || take 1 ln == " ") lns
+ then map (drop 1) lns
+ else lns
+ blanklines
+ return $ CodeBlock ("", ["sourceCode", "haskell"], []) $ intercalate "\n" lns'
+
+birdTrackLine :: GenParser Char st [Char]
+birdTrackLine = do
+ char '>'
+ manyTill anyChar newline
+
+--
+-- raw html
+--
+
+rawHtmlBlock :: GenParser Char st Block
+rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >>
+ indentedBlock >>= return . RawHtml
+
+--
+-- raw latex
+--
+
+rawLaTeXBlock :: GenParser Char st Block
+rawLaTeXBlock = try $ do
+ string ".. raw:: latex"
+ blanklines
+ result <- indentedBlock
+ return $ Para [(TeX result)]
+
+--
+-- block quotes
+--
+
+blockQuote :: GenParser Char ParserState Block
+blockQuote = do
+ raw <- indentedBlock
+ -- parse the extracted block, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ raw ++ "\n\n"
+ return $ BlockQuote contents
+
+--
+-- list blocks
+--
+
+list :: GenParser Char ParserState Block
+list = choice [ bulletList, orderedList, definitionList ] <?> "list"
+
+definitionListItem :: GenParser Char ParserState ([Inline], [Block])
+definitionListItem = try $ do
+ -- avoid capturing a directive or comment
+ notFollowedBy (try $ char '.' >> char '.')
+ term <- many1Till inline endline
+ raw <- indentedBlock
+ -- parse the extracted block, which may contain various block elements:
+ contents <- parseFromString parseBlocks $ raw ++ "\n\n"
+ return (normalizeSpaces term, contents)
+
+definitionList :: GenParser Char ParserState Block
+definitionList = many1 definitionListItem >>= return . DefinitionList
+
+-- parses bullet list start and returns its length (inc. following whitespace)
+bulletListStart :: GenParser Char st Int
+bulletListStart = try $ do
+ notFollowedBy' hrule -- because hrules start out just like lists
+ marker <- oneOf bulletListMarkers
+ white <- many1 spaceChar
+ return $ length (marker:white)
+
+-- parses ordered list start and returns its length (inc following whitespace)
+orderedListStart :: ListNumberStyle
+ -> ListNumberDelim
+ -> GenParser Char st Int
+orderedListStart style delim = try $ do
+ (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim)
+ white <- many1 spaceChar
+ return $ markerLen + length white
+
+-- parse a line of a list item
+listLine :: Int -> GenParser Char ParserState [Char]
+listLine markerLength = try $ do
+ notFollowedBy blankline
+ indentWith markerLength
+ line <- manyTill anyChar newline
+ return $ line ++ "\n"
+
+-- indent by specified number of spaces (or equiv. tabs)
+indentWith :: Int -> GenParser Char ParserState [Char]
+indentWith num = do
+ state <- getState
+ let tabStop = stateTabStop state
+ if (num < tabStop)
+ then count num (char ' ')
+ else choice [ try (count num (char ' ')),
+ (try (char '\t' >> count (num - tabStop) (char ' '))) ]
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem :: GenParser Char ParserState Int
+ -> GenParser Char ParserState (Int, [Char])
+rawListItem start = try $ do
+ markerLength <- start
+ firstLine <- manyTill anyChar newline
+ restLines <- many (listLine markerLength)
+ return (markerLength, (firstLine ++ "\n" ++ (concat restLines)))
+
+-- continuation of a list item - indented and separated by blankline or
+-- (in compact lists) endline.
+-- Note: nested lists are parsed as continuations.
+listContinuation :: Int -> GenParser Char ParserState [Char]
+listContinuation markerLength = try $ do
+ blanks <- many1 blankline
+ result <- many1 (listLine markerLength)
+ return $ blanks ++ concat result
+
+listItem :: GenParser Char ParserState Int
+ -> GenParser Char ParserState [Block]
+listItem start = try $ do
+ (markerLength, first) <- rawListItem start
+ rest <- many (listContinuation markerLength)
+ blanks <- choice [ try (many blankline >>~ lookAhead start),
+ many1 blankline ] -- whole list must end with blank.
+ -- parsing with ListItemState forces markers at beginning of lines to
+ -- count as list item markers, even if not separated by blank space.
+ -- see definition of "endline"
+ state <- getState
+ let oldContext = stateParserContext state
+ setState $ state {stateParserContext = ListItemState}
+ -- parse the extracted block, which may itself contain block elements
+ parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks
+ updateState (\st -> st {stateParserContext = oldContext})
+ return parsed
+
+orderedList :: GenParser Char ParserState Block
+orderedList = try $ do
+ (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar)
+ items <- many1 (listItem (orderedListStart style delim))
+ let items' = compactify items
+ return $ OrderedList (start, style, delim) items'
+
+bulletList :: GenParser Char ParserState Block
+bulletList = many1 (listItem bulletListStart) >>=
+ return . BulletList . compactify
+
+--
+-- unknown directive (e.g. comment)
+--
+
+unknownDirective :: GenParser Char st Block
+unknownDirective = try $ do
+ string ".."
+ notFollowedBy (noneOf " \t\n")
+ manyTill anyChar newline
+ many $ blanklines <|> (oneOf " \t" >> manyTill anyChar newline)
+ return Null
+
+--
+-- reference key
+--
+
+quotedReferenceName :: GenParser Char ParserState [Inline]
+quotedReferenceName = try $ do
+ char '`' >> notFollowedBy (char '`') -- `` means inline code!
+ label' <- many1Till inline (char '`')
+ return label'
+
+unquotedReferenceName :: GenParser Char ParserState [Inline]
+unquotedReferenceName = try $ do
+ label' <- many1Till inline (lookAhead $ char ':')
+ return label'
+
+isolated :: Char -> GenParser Char st Char
+isolated ch = try $ char ch >>~ notFollowedBy (char ch)
+
+simpleReferenceName :: GenParser Char st [Inline]
+simpleReferenceName = do
+ raw <- many1 (alphaNum <|> isolated '-' <|> isolated '.' <|>
+ (try $ char '_' >>~ lookAhead alphaNum))
+ return [Str raw]
+
+referenceName :: GenParser Char ParserState [Inline]
+referenceName = quotedReferenceName <|>
+ (try $ simpleReferenceName >>~ lookAhead (char ':')) <|>
+ unquotedReferenceName
+
+referenceKey :: GenParser Char ParserState [Char]
+referenceKey = do
+ startPos <- getPosition
+ key <- choice [imageKey, anonymousKey, regularKey]
+ st <- getState
+ let oldkeys = stateKeys st
+ updateState $ \s -> s { stateKeys = key : oldkeys }
+ optional blanklines
+ endPos <- getPosition
+ -- return enough blanks to replace key
+ return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
+
+targetURI :: GenParser Char st [Char]
+targetURI = do
+ skipSpaces
+ optional newline
+ contents <- many1 (try (many spaceChar >> newline >>
+ many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n")
+ blanklines
+ return contents
+
+imageKey :: GenParser Char ParserState ([Inline], (String, [Char]))
+imageKey = try $ do
+ string ".. |"
+ ref <- manyTill inline (char '|')
+ skipSpaces
+ string "image::"
+ src <- targetURI
+ return (normalizeSpaces ref, (removeLeadingTrailingSpace src, ""))
+
+anonymousKey :: GenParser Char st ([Inline], (String, [Char]))
+anonymousKey = try $ do
+ oneOfStrings [".. __:", "__"]
+ src <- targetURI
+ return ([Str "_"], (removeLeadingTrailingSpace src, ""))
+
+regularKey :: GenParser Char ParserState ([Inline], (String, [Char]))
+regularKey = try $ do
+ string ".. _"
+ ref <- referenceName
+ char ':'
+ src <- targetURI
+ return (normalizeSpaces ref, (removeLeadingTrailingSpace src, ""))
+
+ --
+ -- inline
+ --
+
+inline :: GenParser Char ParserState Inline
+inline = choice [ link
+ , str
+ , whitespace
+ , endline
+ , strong
+ , emph
+ , code
+ , image
+ , hyphens
+ , superscript
+ , subscript
+ , escapedChar
+ , symbol ] <?> "inline"
+
+hyphens :: GenParser Char ParserState Inline
+hyphens = do
+ result <- many1 (char '-')
+ option Space endline
+ -- don't want to treat endline after hyphen or dash as a space
+ return $ Str result
+
+escapedChar :: GenParser Char st Inline
+escapedChar = escaped anyChar
+
+symbol :: GenParser Char ParserState Inline
+symbol = do
+ result <- oneOf specialChars
+ return $ Str [result]
+
+-- parses inline code, between codeStart and codeEnd
+code :: GenParser Char ParserState Inline
+code = try $ do
+ string "``"
+ result <- manyTill anyChar (try (string "``"))
+ return $ Code $ removeLeadingTrailingSpace $ intercalate " " $ lines result
+
+emph :: GenParser Char ParserState Inline
+emph = enclosed (char '*') (char '*') inline >>=
+ return . Emph . normalizeSpaces
+
+strong :: GenParser Char ParserState Inline
+strong = enclosed (string "**") (try $ string "**") inline >>=
+ return . Strong . normalizeSpaces
+
+interpreted :: [Char] -> GenParser Char st [Inline]
+interpreted role = try $ do
+ optional $ try $ string "\\ "
+ result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar
+ try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "")
+ return [Str result]
+
+superscript :: GenParser Char ParserState Inline
+superscript = interpreted "sup" >>= (return . Superscript)
+
+subscript :: GenParser Char ParserState Inline
+subscript = interpreted "sub" >>= (return . Subscript)
+
+whitespace :: GenParser Char ParserState Inline
+whitespace = many1 spaceChar >> return Space <?> "whitespace"
+
+str :: GenParser Char ParserState Inline
+str = many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str
+
+-- an endline character that can be treated as a space, not a structural break
+endline :: GenParser Char ParserState Inline
+endline = try $ do
+ newline
+ notFollowedBy blankline
+ -- parse potential list-starts at beginning of line differently in a list:
+ st <- getState
+ if (stateParserContext st) == ListItemState
+ then notFollowedBy (anyOrderedListMarker >> spaceChar) >>
+ notFollowedBy' bulletListStart
+ else return ()
+ return Space
+
+--
+-- links
+--
+
+link :: GenParser Char ParserState Inline
+link = choice [explicitLink, referenceLink, autoLink] <?> "link"
+
+explicitLink :: GenParser Char ParserState Inline
+explicitLink = try $ do
+ char '`'
+ notFollowedBy (char '`') -- `` marks start of inline code
+ label' <- manyTill (notFollowedBy (char '`') >> inline)
+ (try (spaces >> char '<'))
+ src <- manyTill (noneOf ">\n ") (char '>')
+ skipSpaces
+ string "`_"
+ return $ Link (normalizeSpaces label') (removeLeadingTrailingSpace src, "")
+
+referenceLink :: GenParser Char ParserState Inline
+referenceLink = try $ do
+ label' <- (quotedReferenceName <|> simpleReferenceName) >>~ char '_'
+ key <- option label' (do{char '_'; return [Str "_"]}) -- anonymous link
+ state <- getState
+ let keyTable = stateKeys state
+ src <- case lookupKeySrc keyTable key of
+ Nothing -> fail "no corresponding key"
+ Just target -> return target
+ -- if anonymous link, remove first anon key so it won't be used again
+ let keyTable' = if (key == [Str "_"]) -- anonymous link?
+ then delete ([Str "_"], src) keyTable -- remove first anon key
+ else keyTable
+ setState $ state { stateKeys = keyTable' }
+ return $ Link (normalizeSpaces label') src
+
+autoURI :: GenParser Char ParserState Inline
+autoURI = do
+ src <- uri
+ return $ Link [Str src] (src, "")
+
+autoEmail :: GenParser Char ParserState Inline
+autoEmail = do
+ src <- emailAddress
+ return $ Link [Str src] ("mailto:" ++ src, "")
+
+autoLink :: GenParser Char ParserState Inline
+autoLink = autoURI <|> autoEmail
+
+-- For now, we assume that all substitution references are for images.
+image :: GenParser Char ParserState Inline
+image = try $ do
+ char '|'
+ ref <- manyTill inline (char '|')
+ state <- getState
+ let keyTable = stateKeys state
+ src <- case lookupKeySrc keyTable ref of
+ Nothing -> fail "no corresponding key"
+ Just target -> return target
+ return $ Image (normalizeSpaces ref) src
+
diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs
new file mode 100644
index 000000000..04b0f3b8f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/TeXMath.hs
@@ -0,0 +1,233 @@
+{-
+Copyright (C) 2007 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.TeXMath
+ Copyright : Copyright (C) 2007 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of TeX math to a list of 'Pandoc' inline elements.
+-}
+module Text.Pandoc.Readers.TeXMath (
+ readTeXMath
+ ) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Definition
+
+-- | Converts a string of raw TeX math to a list of 'Pandoc' inlines.
+readTeXMath :: String -> [Inline]
+readTeXMath inp = case parse teXMath ("formula: " ++ inp) inp of
+ Left _ -> [Str inp] -- if unparseable, just include original
+ Right res -> res
+
+teXMath :: GenParser Char st [Inline]
+teXMath = manyTill mathPart eof >>= return . concat
+
+mathPart :: GenParser Char st [Inline]
+mathPart = whitespace <|> superscript <|> subscript <|> symbol <|>
+ argument <|> digits <|> letters <|> misc
+
+whitespace :: GenParser Char st [Inline]
+whitespace = many1 space >> return []
+
+symbol :: GenParser Char st [Inline]
+symbol = try $ do
+ char '\\'
+ res <- many1 letter
+ case lookup res teXsymbols of
+ Just m -> return [Str m]
+ Nothing -> return [Str $ "\\" ++ res]
+
+argument :: GenParser Char st [Inline]
+argument = try $ do
+ char '{'
+ res <- many mathPart
+ char '}'
+ return $ if null res
+ then [Str " "]
+ else [Str "{"] ++ concat res ++ [Str "}"]
+
+digits :: GenParser Char st [Inline]
+digits = do
+ res <- many1 digit
+ return [Str res]
+
+letters :: GenParser Char st [Inline]
+letters = do
+ res <- many1 letter
+ return [Emph [Str res]]
+
+misc :: GenParser Char st [Inline]
+misc = do
+ res <- noneOf "}"
+ return [Str [res]]
+
+scriptArg :: GenParser Char st [Inline]
+scriptArg = try $ do
+ (try (do{char '{'; r <- many mathPart; char '}'; return $ concat r}))
+ <|> symbol
+ <|> (do{c <- (letter <|> digit); return [Str [c]]})
+
+superscript :: GenParser Char st [Inline]
+superscript = try $ do
+ char '^'
+ arg <- scriptArg
+ return [Superscript arg]
+
+subscript :: GenParser Char st [Inline]
+subscript = try $ do
+ char '_'
+ arg <- scriptArg
+ return [Subscript arg]
+
+withThinSpace :: String -> String
+withThinSpace str = "\x2009" ++ str ++ "\x2009"
+
+teXsymbols :: [(String, String)]
+teXsymbols =
+ [("alpha","\x3B1")
+ ,("beta", "\x3B2")
+ ,("chi", "\x3C7")
+ ,("delta", "\x3B4")
+ ,("Delta", "\x394")
+ ,("epsilon", "\x3B5")
+ ,("varepsilon", "\x25B")
+ ,("eta", "\x3B7")
+ ,("gamma", "\x3B3")
+ ,("Gamma", "\x393")
+ ,("iota", "\x3B9")
+ ,("kappa", "\x3BA")
+ ,("lambda", "\x3BB")
+ ,("Lambda", "\x39B")
+ ,("mu", "\x3BC")
+ ,("nu", "\x3BD")
+ ,("omega", "\x3C9")
+ ,("Omega", "\x3A9")
+ ,("phi", "\x3C6")
+ ,("varphi", "\x3D5")
+ ,("Phi", "\x3A6")
+ ,("pi", "\x3C0")
+ ,("Pi", "\x3A0")
+ ,("psi", "\x3C8")
+ ,("Psi", "\x3A8")
+ ,("rho", "\x3C1")
+ ,("sigma", "\x3C3")
+ ,("Sigma", "\x3A3")
+ ,("tau", "\x3C4")
+ ,("theta", "\x3B8")
+ ,("vartheta", "\x3D1")
+ ,("Theta", "\x398")
+ ,("upsilon", "\x3C5")
+ ,("xi", "\x3BE")
+ ,("Xi", "\x39E")
+ ,("zeta", "\x3B6")
+ ,("ne", "\x2260")
+ ,("lt", withThinSpace "<")
+ ,("le", withThinSpace "\x2264")
+ ,("leq", withThinSpace "\x2264")
+ ,("ge", withThinSpace "\x2265")
+ ,("geq", withThinSpace "\x2265")
+ ,("prec", withThinSpace "\x227A")
+ ,("succ", withThinSpace "\x227B")
+ ,("preceq", withThinSpace "\x2AAF")
+ ,("succeq", withThinSpace "\x2AB0")
+ ,("in", withThinSpace "\x2208")
+ ,("notin", withThinSpace "\x2209")
+ ,("subset", withThinSpace "\x2282")
+ ,("supset", withThinSpace "\x2283")
+ ,("subseteq", withThinSpace "\x2286")
+ ,("supseteq", withThinSpace "\x2287")
+ ,("equiv", withThinSpace "\x2261")
+ ,("cong", withThinSpace "\x2245")
+ ,("approx", withThinSpace "\x2248")
+ ,("propto", withThinSpace "\x221D")
+ ,("cdot", withThinSpace "\x22C5")
+ ,("star", withThinSpace "\x22C6")
+ ,("backslash", "\\")
+ ,("times", withThinSpace "\x00D7")
+ ,("divide", withThinSpace "\x00F7")
+ ,("circ", withThinSpace "\x2218")
+ ,("oplus", withThinSpace "\x2295")
+ ,("otimes", withThinSpace "\x2297")
+ ,("odot", withThinSpace "\x2299")
+ ,("sum", "\x2211")
+ ,("prod", "\x220F")
+ ,("wedge", withThinSpace "\x2227")
+ ,("bigwedge", withThinSpace "\x22C0")
+ ,("vee", withThinSpace "\x2228")
+ ,("bigvee", withThinSpace "\x22C1")
+ ,("cap", withThinSpace "\x2229")
+ ,("bigcap", withThinSpace "\x22C2")
+ ,("cup", withThinSpace "\x222A")
+ ,("bigcup", withThinSpace "\x22C3")
+ ,("neg", "\x00AC")
+ ,("implies", withThinSpace "\x21D2")
+ ,("iff", withThinSpace "\x21D4")
+ ,("forall", "\x2200")
+ ,("exists", "\x2203")
+ ,("bot", "\x22A5")
+ ,("top", "\x22A4")
+ ,("vdash", "\x22A2")
+ ,("models", withThinSpace "\x22A8")
+ ,("uparrow", "\x2191")
+ ,("downarrow", "\x2193")
+ ,("rightarrow", withThinSpace "\x2192")
+ ,("to", withThinSpace "\x2192")
+ ,("rightarrowtail", "\x21A3")
+ ,("twoheadrightarrow", withThinSpace "\x21A0")
+ ,("twoheadrightarrowtail", withThinSpace "\x2916")
+ ,("mapsto", withThinSpace "\x21A6")
+ ,("leftarrow", withThinSpace "\x2190")
+ ,("leftrightarrow", withThinSpace "\x2194")
+ ,("Rightarrow", withThinSpace "\x21D2")
+ ,("Leftarrow", withThinSpace "\x21D0")
+ ,("Leftrightarrow", withThinSpace "\x21D4")
+ ,("partial", "\x2202")
+ ,("nabla", "\x2207")
+ ,("pm", "\x00B1")
+ ,("emptyset", "\x2205")
+ ,("infty", "\x221E")
+ ,("aleph", "\x2135")
+ ,("ldots", "...")
+ ,("therefore", "\x2234")
+ ,("angle", "\x2220")
+ ,("quad", "\x00A0\x00A0")
+ ,("cdots", "\x22EF")
+ ,("vdots", "\x22EE")
+ ,("ddots", "\x22F1")
+ ,("diamond", "\x22C4")
+ ,("Box", "\x25A1")
+ ,("lfloor", "\x230A")
+ ,("rfloor", "\x230B")
+ ,("lceiling", "\x2308")
+ ,("rceiling", "\x2309")
+ ,("langle", "\x2329")
+ ,("rangle", "\x232A")
+ ,("{", "{")
+ ,("}", "}")
+ ,("[", "[")
+ ,("]", "]")
+ ,("|", "|")
+ ,("||", "||")
+ ]
+