From 42aca57dee8d88afa5fac512aeb1198102908865 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 24 Jan 2009 20:00:26 +0000 Subject: Moved all haskell source to src subdirectory. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1528 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Readers/HTML.hs | 675 +++++++++++++++++++ src/Text/Pandoc/Readers/LaTeX.hs | 774 ++++++++++++++++++++++ src/Text/Pandoc/Readers/Markdown.hs | 1243 +++++++++++++++++++++++++++++++++++ src/Text/Pandoc/Readers/RST.hs | 707 ++++++++++++++++++++ src/Text/Pandoc/Readers/TeXMath.hs | 233 +++++++ 5 files changed, 3632 insertions(+) create mode 100644 src/Text/Pandoc/Readers/HTML.hs create mode 100644 src/Text/Pandoc/Readers/LaTeX.hs create mode 100644 src/Text/Pandoc/Readers/Markdown.hs create mode 100644 src/Text/Pandoc/Readers/RST.hs create mode 100644 src/Text/Pandoc/Readers/TeXMath.hs (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs new file mode 100644 index 000000000..65e512b5e --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -0,0 +1,675 @@ +{- +Copyright (C) 2006-8 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.HTML + Copyright : Copyright (C) 2006-8 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of HTML to 'Pandoc' document. +-} +module Text.Pandoc.Readers.HTML ( + readHtml, + rawHtmlInline, + rawHtmlBlock, + anyHtmlBlockTag, + anyHtmlInlineTag, + anyHtmlTag, + anyHtmlEndTag, + htmlEndTag, + extractTagType, + htmlBlockElement, + unsanitaryURI + ) where + +import Text.ParserCombinators.Parsec +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) +import Data.Maybe ( fromMaybe ) +import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf, intercalate ) +import Data.Char ( toLower, isAlphaNum ) +import Network.URI ( parseURIReference, URI (..) ) + +-- | Convert HTML-formatted string to 'Pandoc' document. +readHtml :: ParserState -- ^ Parser state + -> String -- ^ String to parse + -> Pandoc +readHtml = readWith parseHtml + +-- +-- Constants +-- + +eitherBlockOrInline :: [[Char]] +eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", + "map", "area", "object"] + +{- +inlineHtmlTags :: [[Char]] +inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", + "br", "cite", "code", "dfn", "em", "font", "i", "img", + "input", "kbd", "label", "q", "s", "samp", "select", + "small", "span", "strike", "strong", "sub", "sup", + "textarea", "tt", "u", "var"] ++ eitherBlockOrInline +-} + +blockHtmlTags :: [[Char]] +blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", + "dl", "fieldset", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "hr", "html", "isindex", "menu", "noframes", + "noscript", "ol", "p", "pre", "table", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr", "script"] ++ eitherBlockOrInline + +sanitaryTags :: [[Char]] +sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", + "blockquote", "br", "button", "caption", "center", + "cite", "code", "col", "colgroup", "dd", "del", "dfn", + "dir", "div", "dl", "dt", "em", "fieldset", "font", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", + "i", "img", "input", "ins", "kbd", "label", "legend", + "li", "map", "menu", "ol", "optgroup", "option", "p", + "pre", "q", "s", "samp", "select", "small", "span", + "strike", "strong", "sub", "sup", "table", "tbody", + "td", "textarea", "tfoot", "th", "thead", "tr", "tt", + "u", "ul", "var"] + +sanitaryAttributes :: [[Char]] +sanitaryAttributes = ["abbr", "accept", "accept-charset", + "accesskey", "action", "align", "alt", "axis", + "border", "cellpadding", "cellspacing", "char", + "charoff", "charset", "checked", "cite", "class", + "clear", "cols", "colspan", "color", "compact", + "coords", "datetime", "dir", "disabled", + "enctype", "for", "frame", "headers", "height", + "href", "hreflang", "hspace", "id", "ismap", + "label", "lang", "longdesc", "maxlength", "media", + "method", "multiple", "name", "nohref", "noshade", + "nowrap", "prompt", "readonly", "rel", "rev", + "rows", "rowspan", "rules", "scope", "selected", + "shape", "size", "span", "src", "start", + "summary", "tabindex", "target", "title", "type", + "usemap", "valign", "value", "vspace", "width"] + +-- +-- HTML utility functions +-- + +-- | Returns @True@ if sanitization is specified and the specified tag is +-- not on the sanitized tag list. +unsanitaryTag :: [Char] + -> GenParser tok ParserState Bool +unsanitaryTag tag = do + st <- getState + return $ stateSanitizeHTML st && tag `notElem` sanitaryTags + +-- | returns @True@ if sanitization is specified and the specified attribute +-- is not on the sanitized attribute list. +unsanitaryAttribute :: ([Char], String, t) + -> GenParser tok ParserState Bool +unsanitaryAttribute (attr, val, _) = do + st <- getState + return $ stateSanitizeHTML st && + (attr `notElem` sanitaryAttributes || + (attr `elem` ["href","src"] && unsanitaryURI val)) + +-- | Returns @True@ if the specified URI is potentially a security risk. +unsanitaryURI :: String -> Bool +unsanitaryURI u = + let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:", + "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:", + "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:", + "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:", + "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:", + "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:", + "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:", + "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", + "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", + "snews:", "webcal:", "ymsgr:"] + in case parseURIReference u of + Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes + Nothing -> True + +-- | Read blocks until end tag. +blocksTilEnd :: String -> GenParser Char ParserState [Block] +blocksTilEnd tag = do + blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) + return $ filter (/= Null) blocks + +-- | Read inlines until end tag. +inlinesTilEnd :: String -> GenParser Char ParserState [Inline] +inlinesTilEnd tag = manyTill inline (htmlEndTag tag) + +-- | Parse blocks between open and close tag. +blocksIn :: String -> GenParser Char ParserState [Block] +blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag + +-- | Parse inlines between open and close tag. +inlinesIn :: String -> GenParser Char ParserState [Inline] +inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag + +-- | Extract type from a tag: e.g. @br@ from @\@ +extractTagType :: String -> String +extractTagType ('<':rest) = + let isSpaceOrSlash c = c `elem` "/ \n\t" in + map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest +extractTagType _ = "" + +-- | Parse any HTML tag (opening or self-closing) and return text of tag +anyHtmlTag :: GenParser Char ParserState [Char] +anyHtmlTag = try $ do + char '<' + spaces + tag <- many1 alphaNum + attribs <- many htmlAttribute + spaces + ender <- option "" (string "/") + let ender' = if null ender then "" else " /" + spaces + char '>' + let result = "<" ++ tag ++ + concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "" + else return result + +anyHtmlEndTag :: GenParser Char ParserState [Char] +anyHtmlEndTag = try $ do + char '<' + spaces + char '/' + spaces + tag <- many1 alphaNum + spaces + char '>' + let result = "" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "" + else return result + +htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) +htmlTag tag = try $ do + char '<' + spaces + stringAnyCase tag + attribs <- many htmlAttribute + spaces + optional (string "/") + spaces + char '>' + return (tag, (map (\(name, content, _) -> (name, content)) attribs)) + +-- parses a quoted html attribute value +quoted :: Char -> GenParser Char st (String, String) +quoted quoteChar = do + result <- between (char quoteChar) (char quoteChar) + (many (noneOf [quoteChar])) + return (result, [quoteChar]) + +nullAttribute :: ([Char], [Char], [Char]) +nullAttribute = ("", "", "") + +htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char]) +htmlAttribute = do + attr <- htmlRegularAttribute <|> htmlMinimizedAttribute + unsanitary <- unsanitaryAttribute attr + if unsanitary + then return nullAttribute + else return attr + +-- minimized boolean attribute +htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char]) +htmlMinimizedAttribute = try $ do + many1 space + name <- many1 (choice [letter, oneOf ".-_:"]) + return (name, name, name) + +htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char]) +htmlRegularAttribute = try $ do + many1 space + name <- many1 (choice [letter, oneOf ".-_:"]) + spaces + char '=' + spaces + (content, quoteStr) <- choice [ (quoted '\''), + (quoted '"'), + (do + a <- many (alphaNum <|> (oneOf "-._:")) + return (a,"")) ] + return (name, content, + (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) + +-- | Parse an end tag of type 'tag' +htmlEndTag :: [Char] -> GenParser Char st [Char] +htmlEndTag tag = try $ do + char '<' + spaces + char '/' + spaces + stringAnyCase tag + spaces + char '>' + return $ "" + +{- +-- | Returns @True@ if the tag is (or can be) an inline tag. +isInline :: String -> Bool +isInline tag = (extractTagType tag) `elem` inlineHtmlTags +-} + +-- | Returns @True@ if the tag is (or can be) a block tag. +isBlock :: String -> Bool +isBlock tag = (extractTagType tag) `elem` blockHtmlTags + +anyHtmlBlockTag :: GenParser Char ParserState [Char] +anyHtmlBlockTag = try $ do + tag <- anyHtmlTag <|> anyHtmlEndTag + if isBlock tag then return tag else fail "not a block tag" + +anyHtmlInlineTag :: GenParser Char ParserState [Char] +anyHtmlInlineTag = try $ do + tag <- anyHtmlTag <|> anyHtmlEndTag + if not (isBlock tag) then return tag else fail "not an inline tag" + +-- | Parses material between script tags. +-- Scripts must be treated differently, because they can contain '<>' etc. +htmlScript :: GenParser Char ParserState [Char] +htmlScript = try $ do + open <- string "" + else return $ open ++ rest ++ "" + +-- | Parses material between style tags. +-- Style tags must be treated differently, because they can contain CSS +htmlStyle :: GenParser Char ParserState [Char] +htmlStyle = try $ do + open <- string "" + else return $ open ++ rest ++ "" + +htmlBlockElement :: GenParser Char ParserState [Char] +htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] + +rawHtmlBlock :: GenParser Char ParserState Block +rawHtmlBlock = try $ do + body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag + state <- getState + if stateParseRaw state then return (RawHtml body) else return Null + +-- This is a block whose contents should be passed through verbatim, not interpreted. +rawVerbatimBlock :: GenParser Char ParserState [Char] +rawVerbatimBlock = try $ do + start <- anyHtmlBlockTag + let tagtype = extractTagType start + if tagtype `elem` ["pre"] + then do + contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar) + end <- htmlEndTag tagtype + return $ start ++ contents ++ end + else fail "Not a verbatim block" + +-- We don't want to parse or as raw HTML, since these +-- are handled in parseHtml. +rawHtmlBlock' :: GenParser Char ParserState Block +rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") + rawHtmlBlock + +-- | Parses an HTML comment. +htmlComment :: GenParser Char st [Char] +htmlComment = try $ do + string "")) + return $ "" + +-- +-- parsing documents +-- + +xmlDec :: GenParser Char st [Char] +xmlDec = try $ do + string "') + return $ "" + +definition :: GenParser Char st [Char] +definition = try $ do + string "') + return $ "" + +nonTitleNonHead :: GenParser Char ParserState Char +nonTitleNonHead = try $ do + notFollowedBy $ (htmlTag "title" >> return ' ') <|> + (htmlEndTag "head" >> return ' ') + (rawHtmlBlock >> return ' ') <|> anyChar + +parseTitle :: GenParser Char ParserState [Inline] +parseTitle = try $ do + (tag, _) <- htmlTag "title" + contents <- inlinesTilEnd tag + spaces + return contents + +-- parse header and return meta-information (for now, just title) +parseHead :: GenParser Char ParserState ([Inline], [a], [Char]) +parseHead = try $ do + htmlTag "head" + spaces + skipMany nonTitleNonHead + contents <- option [] parseTitle + skipMany nonTitleNonHead + htmlEndTag "head" + return (contents, [], "") + +skipHtmlTag :: String -> GenParser Char ParserState () +skipHtmlTag tag = optional (htmlTag tag) + +-- h1 class="title" representation of title in body +bodyTitle :: GenParser Char ParserState [Inline] +bodyTitle = try $ do + (_, attribs) <- htmlTag "h1" + case (extractAttribute "class" attribs) of + Just "title" -> return "" + _ -> fail "not title" + inlinesTilEnd "h1" + +parseHtml :: GenParser Char ParserState Pandoc +parseHtml = do + sepEndBy (choice [xmlDec, definition, htmlComment]) spaces + skipHtmlTag "html" + spaces + (title, authors, date) <- option ([], [], "") parseHead + spaces + skipHtmlTag "body" + spaces + optional bodyTitle -- skip title in body, because it's represented in meta + blocks <- parseBlocks + spaces + optional (htmlEndTag "body") + spaces + optional (htmlEndTag "html" >> many anyChar) -- ignore anything after + eof + return $ Pandoc (Meta title authors date) blocks + +-- +-- parsing blocks +-- + +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) + +block :: GenParser Char ParserState Block +block = choice [ codeBlock + , header + , hrule + , list + , blockQuote + , para + , plain + , rawHtmlBlock' + ] "block" + +-- +-- header blocks +-- + +header :: GenParser Char ParserState Block +header = choice (map headerLevel (enumFromTo 1 5)) "header" + +headerLevel :: Int -> GenParser Char ParserState Block +headerLevel n = try $ do + let level = "h" ++ show n + htmlTag level + contents <- inlinesTilEnd level + return $ Header n (normalizeSpaces contents) + +-- +-- hrule block +-- + +hrule :: GenParser Char ParserState Block +hrule = try $ do + (_, attribs) <- htmlTag "hr" + state <- getState + if not (null attribs) && stateParseRaw state + then unexpected "attributes in hr" -- parse as raw in this case + else return HorizontalRule + +-- +-- code blocks +-- + +-- Note: HTML tags in code blocks (e.g. for syntax highlighting) are +-- skipped, because they are not portable to output formats other than HTML. +codeBlock :: GenParser Char ParserState Block +codeBlock = try $ do + htmlTag "pre" + result <- manyTill + (many1 (satisfy (/= '<')) <|> + ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) + (htmlEndTag "pre") + let result' = concat result + -- drop leading newline if any + let result'' = if "\n" `isPrefixOf` result' + then drop 1 result' + else result' + -- drop trailing newline if any + let result''' = if "\n" `isSuffixOf` result'' + then init result'' + else result'' + return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result''' + +-- +-- block quotes +-- + +blockQuote :: GenParser Char ParserState Block +blockQuote = try $ htmlTag "blockquote" >> spaces >> + blocksTilEnd "blockquote" >>= (return . BlockQuote) + +-- +-- list blocks +-- + +list :: GenParser Char ParserState Block +list = choice [ bulletList, orderedList, definitionList ] "list" + +orderedList :: GenParser Char ParserState Block +orderedList = try $ do + (_, attribs) <- htmlTag "ol" + (start, style) <- option (1, DefaultStyle) $ + do failIfStrict + let sta = fromMaybe "1" $ + lookup "start" attribs + let sty = fromMaybe (fromMaybe "" $ + lookup "style" attribs) $ + lookup "class" attribs + let sty' = case sty of + "lower-roman" -> LowerRoman + "upper-roman" -> UpperRoman + "lower-alpha" -> LowerAlpha + "upper-alpha" -> UpperAlpha + "decimal" -> Decimal + _ -> DefaultStyle + return (read sta, sty') + spaces + items <- sepEndBy1 (blocksIn "li") spaces + htmlEndTag "ol" + return $ OrderedList (start, style, DefaultDelim) items + +bulletList :: GenParser Char ParserState Block +bulletList = try $ do + htmlTag "ul" + spaces + items <- sepEndBy1 (blocksIn "li") spaces + htmlEndTag "ul" + return $ BulletList items + +definitionList :: GenParser Char ParserState Block +definitionList = try $ do + failIfStrict -- def lists not part of standard markdown + htmlTag "dl" + spaces + items <- sepEndBy1 definitionListItem spaces + htmlEndTag "dl" + return $ DefinitionList items + +definitionListItem :: GenParser Char ParserState ([Inline], [Block]) +definitionListItem = try $ do + terms <- sepEndBy1 (inlinesIn "dt") spaces + defs <- sepEndBy1 (blocksIn "dd") spaces + let term = intercalate [LineBreak] terms + return (term, concat defs) + +-- +-- paragraph block +-- + +para :: GenParser Char ParserState Block +para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= + return . Para . normalizeSpaces + +-- +-- plain block +-- + +plain :: GenParser Char ParserState Block +plain = many1 inline >>= return . Plain . normalizeSpaces + +-- +-- inline +-- + +inline :: GenParser Char ParserState Inline +inline = choice [ charRef + , strong + , emph + , superscript + , subscript + , strikeout + , spanStrikeout + , code + , str + , linebreak + , whitespace + , link + , image + , rawHtmlInline + ] "inline" + +code :: GenParser Char ParserState Inline +code = try $ do + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + -- remove internal line breaks, leading and trailing space, + -- and decode character references + return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ + intercalate " " $ lines result + +rawHtmlInline :: GenParser Char ParserState Inline +rawHtmlInline = do + result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag + state <- getState + if stateParseRaw state then return (HtmlInline result) else return (Str "") + +betweenTags :: [Char] -> GenParser Char ParserState [Inline] +betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= + return . normalizeSpaces + +emph :: GenParser Char ParserState Inline +emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph + +strong :: GenParser Char ParserState Inline +strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong + +superscript :: GenParser Char ParserState Inline +superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript + +subscript :: GenParser Char ParserState Inline +subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript + +strikeout :: GenParser Char ParserState Inline +strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= + return . Strikeout + +spanStrikeout :: GenParser Char ParserState Inline +spanStrikeout = try $ do + failIfStrict -- strict markdown has no strikeout, so treat as raw HTML + (_, attributes) <- htmlTag "span" + result <- case (extractAttribute "class" attributes) of + Just "strikeout" -> inlinesTilEnd "span" + _ -> fail "not a strikeout" + return $ Strikeout result + +whitespace :: GenParser Char st Inline +whitespace = many1 space >> return Space + +-- hard line break +linebreak :: GenParser Char ParserState Inline +linebreak = htmlTag "br" >> optional newline >> return LineBreak + +str :: GenParser Char st Inline +str = many1 (noneOf "<& \t\n") >>= return . Str + +-- +-- links and images +-- + +-- extract contents of attribute (attribute names are case-insensitive) +extractAttribute :: [Char] -> [([Char], String)] -> Maybe String +extractAttribute _ [] = Nothing +extractAttribute name ((attrName, contents):rest) = + let name' = map toLower name + attrName' = map toLower attrName + in if attrName' == name' + then Just (decodeCharacterReferences contents) + else extractAttribute name rest + +link :: GenParser Char ParserState Inline +link = try $ do + (_, attributes) <- htmlTag "a" + url <- case (extractAttribute "href" attributes) of + Just url -> return url + Nothing -> fail "no href" + let title = fromMaybe "" $ extractAttribute "title" attributes + lab <- inlinesTilEnd "a" + return $ Link (normalizeSpaces lab) (url, title) + +image :: GenParser Char ParserState Inline +image = try $ do + (_, attributes) <- htmlTag "img" + url <- case (extractAttribute "src" attributes) of + Just url -> return url + Nothing -> fail "no src" + let title = fromMaybe "" $ extractAttribute "title" attributes + let alt = fromMaybe "" (extractAttribute "alt" attributes) + return $ Image [Str alt] (url, title) + diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs new file mode 100644 index 000000000..f35ab4f29 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -0,0 +1,774 @@ +{- +Copyright (C) 2006-8 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.LaTeX + Copyright : Copyright (C) 2006-8 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of LaTeX to 'Pandoc' document. +-} +module Text.Pandoc.Readers.LaTeX ( + readLaTeX, + rawLaTeXInline, + rawLaTeXEnvironment' + ) where + +import Text.ParserCombinators.Parsec +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Data.Maybe ( fromMaybe ) +import Data.Char ( chr ) +import Data.List ( isPrefixOf, isSuffixOf ) + +-- | Parse LaTeX from string and return 'Pandoc' document. +readLaTeX :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse + -> Pandoc +readLaTeX = readWith parseLaTeX + +-- characters with special meaning +specialChars :: [Char] +specialChars = "\\`$%^&_~#{}\n \t|<>'\"-" + +-- +-- utility functions +-- + +-- | Returns text between brackets and its matching pair. +bracketedText :: Char -> Char -> GenParser Char st [Char] +bracketedText openB closeB = do + result <- charsInBalanced' openB closeB + return $ [openB] ++ result ++ [closeB] + +-- | Returns an option or argument of a LaTeX command. +optOrArg :: GenParser Char st [Char] +optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']' + +-- | True if the string begins with '{'. +isArg :: [Char] -> Bool +isArg ('{':_) = True +isArg _ = False + +-- | Returns list of options and arguments of a LaTeX command. +commandArgs :: GenParser Char st [[Char]] +commandArgs = many optOrArg + +-- | Parses LaTeX command, returns (name, star, list of options or arguments). +command :: GenParser Char st ([Char], [Char], [[Char]]) +command = do + char '\\' + name <- many1 letter + star <- option "" (string "*") -- some commands have starred versions + args <- commandArgs + return (name, star, args) + +begin :: [Char] -> GenParser Char st [Char] +begin name = try $ do + string $ "\\begin{" ++ name ++ "}" + optional commandArgs + spaces + return name + +end :: [Char] -> GenParser Char st [Char] +end name = try $ do + string $ "\\end{" ++ name ++ "}" + return name + +-- | Returns a list of block elements containing the contents of an +-- environment. +environment :: [Char] -> GenParser Char ParserState [Block] +environment name = try $ begin name >> spaces >> manyTill block (end name) >>~ spaces + +anyEnvironment :: GenParser Char ParserState Block +anyEnvironment = try $ do + string "\\begin{" + name <- many letter + star <- option "" (string "*") -- some environments have starred variants + char '}' + optional commandArgs + spaces + contents <- manyTill block (end (name ++ star)) + spaces + return $ BlockQuote contents + +-- +-- parsing documents +-- + +-- | Process LaTeX preamble, extracting metadata. +processLaTeXPreamble :: GenParser Char ParserState () +processLaTeXPreamble = try $ manyTill + (choice [bibliographic, comment, unknownCommand, nullBlock]) + (try (string "\\begin{document}")) >> + spaces + +-- | Parse LaTeX and return 'Pandoc'. +parseLaTeX :: GenParser Char ParserState Pandoc +parseLaTeX = do + optional processLaTeXPreamble -- preamble might not be present (fragment) + spaces + blocks <- parseBlocks + spaces + optional $ try (string "\\end{document}" >> many anyChar) + -- might not be present (fragment) + spaces + eof + state <- getState + let blocks' = filter (/= Null) blocks + let title' = stateTitle state + let authors' = stateAuthors state + let date' = stateDate state + return $ Pandoc (Meta title' authors' date') blocks' + +-- +-- parsing blocks +-- + +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = spaces >> many block + +block :: GenParser Char ParserState Block +block = choice [ hrule + , codeBlock + , header + , list + , blockQuote + , comment + , bibliographic + , para + , itemBlock + , unknownEnvironment + , ignore + , unknownCommand ] "block" + +-- +-- header blocks +-- + +header :: GenParser Char ParserState Block +header = try $ do + char '\\' + subs <- many (try (string "sub")) + string "section" + optional (char '*') + char '{' + title' <- manyTill inline (char '}') + spaces + return $ Header (length subs + 1) (normalizeSpaces title') + +-- +-- hrule block +-- + +hrule :: GenParser Char st Block +hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", + "\\newpage" ] >> spaces >> return HorizontalRule + +-- +-- code blocks +-- + +codeBlock :: GenParser Char ParserState Block +codeBlock = codeBlockWith "verbatim" <|> codeBlockWith "Verbatim" <|> lhsCodeBlock +-- Note: Verbatim is from fancyvrb. + +codeBlockWith :: String -> GenParser Char st Block +codeBlockWith env = try $ do + string ("\\begin{" ++ env ++ "}") -- don't use begin function because it + -- gobbles whitespace + optional blanklines -- we want to gobble blank lines, but not + -- leading space + contents <- manyTill anyChar (try (string $ "\\end{" ++ env ++ "}")) + spaces + let classes = if env == "code" then ["haskell"] else [] + return $ CodeBlock ("",classes,[]) (stripTrailingNewlines contents) + +lhsCodeBlock :: GenParser Char ParserState Block +lhsCodeBlock = do + failUnlessLHS + (CodeBlock (_,_,_) cont) <- codeBlockWith "code" + return $ CodeBlock ("", ["sourceCode","haskell"], []) cont + +-- +-- block quotes +-- + +blockQuote :: GenParser Char ParserState Block +blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>= + return . BlockQuote + +-- +-- list blocks +-- + +list :: GenParser Char ParserState Block +list = bulletList <|> orderedList <|> definitionList "list" + +listItem :: GenParser Char ParserState ([Inline], [Block]) +listItem = try $ do + ("item", _, args) <- command + spaces + state <- getState + let oldParserContext = stateParserContext state + updateState (\s -> s {stateParserContext = ListItemState}) + blocks <- many block + updateState (\s -> s {stateParserContext = oldParserContext}) + opt <- case args of + ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x -> + parseFromString (many inline) $ tail $ init x + _ -> return [] + return (opt, blocks) + +orderedList :: GenParser Char ParserState Block +orderedList = try $ do + string "\\begin{enumerate}" + (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ + try $ do failIfStrict + char '[' + res <- anyOrderedListMarker + char ']' + return res + spaces + option "" $ try $ do string "\\setlength{\\itemindent}" + char '{' + manyTill anyChar (char '}') + spaces + start <- option 1 $ try $ do failIfStrict + string "\\setcounter{enum" + many1 (oneOf "iv") + string "}{" + num <- many1 digit + char '}' + spaces + return $ (read num) + 1 + items <- many listItem + end "enumerate" + spaces + return $ OrderedList (start, style, delim) $ map snd items + +bulletList :: GenParser Char ParserState Block +bulletList = try $ do + begin "itemize" + spaces + items <- many listItem + end "itemize" + spaces + return (BulletList $ map snd items) + +definitionList :: GenParser Char ParserState Block +definitionList = try $ do + begin "description" + spaces + items <- many listItem + end "description" + spaces + return (DefinitionList items) + +-- +-- paragraph block +-- + +para :: GenParser Char ParserState Block +para = do + res <- many1 inline + spaces + return $ if null (filter (`notElem` [Str "", Space]) res) + then Null + else Para $ normalizeSpaces res + +-- +-- title authors date +-- + +bibliographic :: GenParser Char ParserState Block +bibliographic = choice [ maketitle, title, authors, date ] + +maketitle :: GenParser Char st Block +maketitle = try (string "\\maketitle") >> spaces >> return Null + +title :: GenParser Char ParserState Block +title = try $ do + string "\\title{" + tit <- manyTill inline (char '}') + spaces + updateState (\state -> state { stateTitle = tit }) + return Null + +authors :: GenParser Char ParserState Block +authors = try $ do + string "\\author{" + authors' <- manyTill anyChar (char '}') + spaces + let authors'' = map removeLeadingTrailingSpace $ lines $ + substitute "\\\\" "\n" authors' + updateState (\s -> s { stateAuthors = authors'' }) + return Null + +date :: GenParser Char ParserState Block +date = try $ do + string "\\date{" + date' <- manyTill anyChar (char '}') + spaces + updateState (\state -> state { stateDate = date' }) + return Null + +-- +-- item block +-- for use in unknown environments that aren't being parsed as raw latex +-- + +-- this forces items to be parsed in different blocks +itemBlock :: GenParser Char ParserState Block +itemBlock = try $ do + ("item", _, args) <- command + state <- getState + if stateParserContext state == ListItemState + then fail "item should be handled by list block" + else if null args + then return Null + else return $ Plain [Str (stripFirstAndLast (head args))] + +-- +-- raw LaTeX +-- + +-- | Parse any LaTeX environment and return a Para block containing +-- the whole literal environment as raw TeX. +rawLaTeXEnvironment :: GenParser Char st Block +rawLaTeXEnvironment = do + contents <- rawLaTeXEnvironment' + spaces + return $ Para [TeX contents] + +-- | Parse any LaTeX environment and return a string containing +-- the whole literal environment as raw TeX. +rawLaTeXEnvironment' :: GenParser Char st String +rawLaTeXEnvironment' = try $ do + string "\\begin{" + name <- many1 letter + star <- option "" (string "*") -- for starred variants + let name' = name ++ star + char '}' + args <- option [] commandArgs + let argStr = concat args + contents <- manyTill (choice [ (many1 (noneOf "\\")), + rawLaTeXEnvironment', + string "\\" ]) + (end name') + return $ "\\begin{" ++ name' ++ "}" ++ argStr ++ + concat contents ++ "\\end{" ++ name' ++ "}" + +unknownEnvironment :: GenParser Char ParserState Block +unknownEnvironment = try $ do + state <- getState + result <- if stateParseRaw state -- check whether we should include raw TeX + then rawLaTeXEnvironment -- if so, get whole raw environment + else anyEnvironment -- otherwise just the contents + return result + +-- \ignore{} is used conventionally in literate haskell for definitions +-- that are to be processed by the compiler but not printed. +ignore :: GenParser Char ParserState Block +ignore = try $ do + ("ignore", _, _) <- command + spaces + return Null + +unknownCommand :: GenParser Char ParserState Block +unknownCommand = try $ do + notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description", + "document"] + state <- getState + if stateParserContext state == ListItemState + then notFollowedBy' $ string "\\item" + else return () + if stateParseRaw state + then do + (name, star, args) <- command + spaces + return $ Plain [TeX ("\\" ++ name ++ star ++ concat args)] + else do -- skip unknown command, leaving arguments to be parsed + char '\\' + letter + many (letter <|> digit) + optional (try $ string "{}") + spaces + return Null + +-- latex comment +comment :: GenParser Char st Block +comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null + +-- +-- inline +-- + +inline :: GenParser Char ParserState Inline +inline = choice [ str + , endline + , whitespace + , quoted + , apostrophe + , spacer + , strong + , math + , ellipses + , emDash + , enDash + , hyphen + , emph + , strikeout + , superscript + , subscript + , ref + , lab + , code + , url + , link + , image + , footnote + , linebreak + , accentedChar + , specialChar + , rawLaTeXInline + , escapedChar + , unescapedChar + ] "inline" + +accentedChar :: GenParser Char st Inline +accentedChar = normalAccentedChar <|> specialAccentedChar + +normalAccentedChar :: GenParser Char st Inline +normalAccentedChar = try $ do + char '\\' + accent <- oneOf "'`^\"~" + character <- (try $ char '{' >> letter >>~ char '}') <|> letter + let table = fromMaybe [] $ lookup character accentTable + let result = case lookup accent table of + Just num -> chr num + Nothing -> '?' + return $ Str [result] + +-- an association list of letters and association list of accents +-- and decimal character numbers. +accentTable :: [(Char, [(Char, Int)])] +accentTable = + [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), + ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), + ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), + ('N', [('~', 209)]), + ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), + ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), + ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), + ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), + ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), + ('n', [('~', 241)]), + ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), + ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] + +specialAccentedChar :: GenParser Char st Inline +specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, + oslash, pound, euro, copyright, sect ] + +ccedil :: GenParser Char st Inline +ccedil = try $ do + char '\\' + letter' <- oneOfStrings ["cc", "cC"] + let num = if letter' == "cc" then 231 else 199 + return $ Str [chr num] + +aring :: GenParser Char st Inline +aring = try $ do + char '\\' + letter' <- oneOfStrings ["aa", "AA"] + let num = if letter' == "aa" then 229 else 197 + return $ Str [chr num] + +iuml :: GenParser Char st Inline +iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >> + return (Str [chr 239]) + +szlig :: GenParser Char st Inline +szlig = try (string "\\ss") >> return (Str [chr 223]) + +oslash :: GenParser Char st Inline +oslash = try $ do + char '\\' + letter' <- choice [char 'o', char 'O'] + let num = if letter' == 'o' then 248 else 216 + return $ Str [chr num] + +aelig :: GenParser Char st Inline +aelig = try $ do + char '\\' + letter' <- oneOfStrings ["ae", "AE"] + let num = if letter' == "ae" then 230 else 198 + return $ Str [chr num] + +pound :: GenParser Char st Inline +pound = try (string "\\pounds") >> return (Str [chr 163]) + +euro :: GenParser Char st Inline +euro = try (string "\\euro") >> return (Str [chr 8364]) + +copyright :: GenParser Char st Inline +copyright = try (string "\\copyright") >> return (Str [chr 169]) + +sect :: GenParser Char st Inline +sect = try (string "\\S") >> return (Str [chr 167]) + +escapedChar :: GenParser Char st Inline +escapedChar = do + result <- escaped (oneOf " $%&_#{}\n") + return $ if result == Str "\n" then Str " " else result + +-- nonescaped special characters +unescapedChar :: GenParser Char st Inline +unescapedChar = oneOf "`$^&_#{}|<>" >>= return . (\c -> Str [c]) + +specialChar :: GenParser Char st Inline +specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ] + +backslash :: GenParser Char st Inline +backslash = try (string "\\textbackslash") >> optional (try $ string "{}") >> return (Str "\\") + +tilde :: GenParser Char st Inline +tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~") + +caret :: GenParser Char st Inline +caret = try (string "\\^{}") >> return (Str "^") + +bar :: GenParser Char st Inline +bar = try (string "\\textbar") >> optional (try $ string "{}") >> return (Str "\\") + +lt :: GenParser Char st Inline +lt = try (string "\\textless") >> optional (try $ string "{}") >> return (Str "<") + +gt :: GenParser Char st Inline +gt = try (string "\\textgreater") >> optional (try $ string "{}") >> return (Str ">") + +doubleQuote :: GenParser Char st Inline +doubleQuote = char '"' >> return (Str "\"") + +code :: GenParser Char ParserState Inline +code = code1 <|> code2 <|> lhsInlineCode + +code1 :: GenParser Char st Inline +code1 = try $ do + string "\\verb" + marker <- anyChar + result <- manyTill anyChar (char marker) + return $ Code $ removeLeadingTrailingSpace result + +code2 :: GenParser Char st Inline +code2 = try $ do + string "\\texttt{" + result <- manyTill (noneOf "\\\n~$%^&{}") (char '}') + return $ Code result + +lhsInlineCode :: GenParser Char ParserState Inline +lhsInlineCode = try $ do + failUnlessLHS + char '|' + result <- manyTill (noneOf "|\n") (char '|') + return $ Code result + +emph :: GenParser Char ParserState Inline +emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >> + manyTill inline (char '}') >>= return . Emph + +strikeout :: GenParser Char ParserState Inline +strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>= + return . Strikeout + +superscript :: GenParser Char ParserState Inline +superscript = try $ string "\\textsuperscript{" >> + manyTill inline (char '}') >>= return . Superscript + +-- note: \textsubscript isn't a standard latex command, but we use +-- a defined version in pandoc. +subscript :: GenParser Char ParserState Inline +subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>= + return . Subscript + +apostrophe :: GenParser Char ParserState Inline +apostrophe = char '\'' >> return Apostrophe + +quoted :: GenParser Char ParserState Inline +quoted = doubleQuoted <|> singleQuoted + +singleQuoted :: GenParser Char ParserState Inline +singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>= + return . Quoted SingleQuote . normalizeSpaces + +doubleQuoted :: GenParser Char ParserState Inline +doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>= + return . Quoted DoubleQuote . normalizeSpaces + +singleQuoteStart :: GenParser Char st Char +singleQuoteStart = char '`' + +singleQuoteEnd :: GenParser Char st () +singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum + +doubleQuoteStart :: CharParser st String +doubleQuoteStart = string "``" + +doubleQuoteEnd :: CharParser st String +doubleQuoteEnd = try $ string "''" + +ellipses :: GenParser Char st Inline +ellipses = try $ string "\\ldots" >> optional (try $ string "{}") >> + return Ellipses + +enDash :: GenParser Char st Inline +enDash = try (string "--") >> return EnDash + +emDash :: GenParser Char st Inline +emDash = try (string "---") >> return EmDash + +hyphen :: GenParser Char st Inline +hyphen = char '-' >> return (Str "-") + +lab :: GenParser Char st Inline +lab = try $ do + string "\\label{" + result <- manyTill anyChar (char '}') + return $ Str $ "(" ++ result ++ ")" + +ref :: GenParser Char st Inline +ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str + +strong :: GenParser Char ParserState Inline +strong = try (string "\\textbf{") >> manyTill inline (char '}') >>= + return . Strong + +whitespace :: GenParser Char st Inline +whitespace = many1 (oneOf "~ \t") >> return Space + +-- hard line break +linebreak :: GenParser Char st Inline +linebreak = try (string "\\\\") >> return LineBreak + +spacer :: GenParser Char st Inline +spacer = try (string "\\,") >> return (Str "") + +str :: GenParser Char st Inline +str = many1 (noneOf specialChars) >>= return . Str + +-- endline internal to paragraph +endline :: GenParser Char st Inline +endline = try $ newline >> notFollowedBy blankline >> return Space + +-- math +math :: GenParser Char st Inline +math = (math3 >>= return . Math DisplayMath) + <|> (math1 >>= return . Math InlineMath) + <|> (math2 >>= return . Math InlineMath) + <|> (math4 >>= return . Math DisplayMath) + <|> (math5 >>= return . Math DisplayMath) + <|> (math6 >>= return . Math DisplayMath) + "math" + +math1 :: GenParser Char st String +math1 = try $ char '$' >> manyTill anyChar (char '$') + +math2 :: GenParser Char st String +math2 = try $ string "\\(" >> manyTill anyChar (try $ string "\\)") + +math3 :: GenParser Char st String +math3 = try $ char '$' >> math1 >>~ char '$' + +math4 :: GenParser Char st String +math4 = try $ do + name <- begin "equation" <|> begin "equation*" <|> begin "displaymath" <|> begin "displaymath*" + spaces + manyTill anyChar (end name) + +math5 :: GenParser Char st String +math5 = try $ (string "\\[") >> spaces >> manyTill anyChar (try $ string "\\]") + +math6 :: GenParser Char st String +math6 = try $ do + name <- begin "eqnarray" <|> begin "eqnarray*" + spaces + res <- manyTill anyChar (end name) + return $ filter (/= '&') res -- remove eqnarray alignment codes + +-- +-- links and images +-- + +url :: GenParser Char ParserState Inline +url = try $ do + string "\\url" + url' <- charsInBalanced '{' '}' + return $ Link [Code url'] (url', "") + +link :: GenParser Char ParserState Inline +link = try $ do + string "\\href{" + url' <- manyTill anyChar (char '}') + char '{' + label' <- manyTill inline (char '}') + return $ Link (normalizeSpaces label') (url', "") + +image :: GenParser Char ParserState Inline +image = try $ do + ("includegraphics", _, args) <- command + let args' = filter isArg args -- filter out options + let src = if null args' then + ("", "") + else + (stripFirstAndLast (head args'), "") + return $ Image [Str "image"] src + +footnote :: GenParser Char ParserState Inline +footnote = try $ do + (name, _, (contents:[])) <- command + if ((name == "footnote") || (name == "thanks")) + then string "" + else fail "not a footnote or thanks command" + let contents' = stripFirstAndLast contents + -- parse the extracted block, which may contain various block elements: + rest <- getInput + setInput $ contents' + blocks <- parseBlocks + setInput rest + return $ Note blocks + +-- | Parse any LaTeX command and return it in a raw TeX inline element. +rawLaTeXInline :: GenParser Char ParserState Inline +rawLaTeXInline = try $ do + notFollowedBy' $ oneOfStrings ["\\begin", "\\end", "\\item", "\\ignore"] + state <- getState + if stateParseRaw state + then do + (name, star, args) <- command + return $ TeX ("\\" ++ name ++ star ++ concat args) + else do -- skip unknown command, leaving arguments to be parsed + char '\\' + letter + many (letter <|> digit) + optional (try $ string "{}") + return $ Str "" diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs new file mode 100644 index 000000000..896f5832e --- /dev/null +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -0,0 +1,1243 @@ +{-# LANGUAGE CPP #-} +{- +Copyright (C) 2006-8 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Markdown + Copyright : Copyright (C) 2006-8 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of markdown-formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Markdown ( + readMarkdown + ) where + +import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate ) +import Data.Ord ( comparing ) +import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper ) +import Data.Maybe +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) +import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, + anyHtmlInlineTag, anyHtmlTag, + anyHtmlEndTag, htmlEndTag, extractTagType, + htmlBlockElement, unsanitaryURI ) +import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) +import Text.ParserCombinators.Parsec +import Control.Monad (when) + +-- | Read markdown from an input string and return a Pandoc document. +readMarkdown :: ParserState -> String -> Pandoc +readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n") + +-- +-- Constants and data structure definitions +-- + +spaceChars :: [Char] +spaceChars = " \t" + +bulletListMarkers :: [Char] +bulletListMarkers = "*+-" + +hruleChars :: [Char] +hruleChars = "*-_" + +setextHChars :: [Char] +setextHChars = "=-" + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221" + +-- +-- auxiliary functions +-- + +indentSpaces :: GenParser Char ParserState [Char] +indentSpaces = try $ do + state <- getState + let tabStop = stateTabStop state + try (count tabStop (char ' ')) <|> + (many (char ' ') >> string "\t") "indentation" + +nonindentSpaces :: GenParser Char ParserState [Char] +nonindentSpaces = do + state <- getState + let tabStop = stateTabStop state + sps <- many (char ' ') + if length sps < tabStop + then return sps + else unexpected "indented line" + +-- | Fail unless we're at beginning of a line. +failUnlessBeginningOfLine :: GenParser tok st () +failUnlessBeginningOfLine = do + pos <- getPosition + if sourceColumn pos == 1 then return () else fail "not beginning of line" + +-- | Fail unless we're in "smart typography" mode. +failUnlessSmart :: GenParser tok ParserState () +failUnlessSmart = do + state <- getState + if stateSmart state then return () else fail "Smart typography feature" + +-- | Parse a sequence of inline elements between square brackets, +-- including inlines between balanced pairs of square brackets. +inlinesInBalancedBrackets :: GenParser Char ParserState Inline + -> GenParser Char ParserState [Inline] +inlinesInBalancedBrackets parser = try $ do + char '[' + result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser + if res == "[" + then return () + else pzero + bal <- inlinesInBalancedBrackets parser + return $ [Str "["] ++ bal ++ [Str "]"]) + <|> (count 1 parser)) + (char ']') + return $ concat result + +-- +-- document structure +-- + +titleLine :: GenParser Char ParserState [Inline] +titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline + +authorsLine :: GenParser Char st [String] +authorsLine = try $ do + char '%' + skipSpaces + authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") + newline + return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors + +dateLine :: GenParser Char st String +dateLine = try $ do + char '%' + skipSpaces + date <- many (noneOf "\n") + newline + return $ decodeCharacterReferences $ removeTrailingSpace date + +titleBlock :: GenParser Char ParserState ([Inline], [String], [Char]) +titleBlock = try $ do + failIfStrict + title <- option [] titleLine + author <- option [] authorsLine + date <- option "" dateLine + optional blanklines + return (title, author, date) + +parseMarkdown :: GenParser Char ParserState Pandoc +parseMarkdown = do + -- markdown allows raw HTML + updateState (\state -> state { stateParseRaw = True }) + startPos <- getPosition + -- go through once just to get list of reference keys + -- docMinusKeys is the raw document with blanks where the keys were... + docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= + return . concat + setInput docMinusKeys + setPosition startPos + st <- getState + -- go through again for notes unless strict... + if stateStrict st + then return () + else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= + return . concat + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } + setInput docMinusNotes + setPosition startPos + -- now parse it for real... + (title, author, date) <- option ([],[],"") titleBlock + blocks <- parseBlocks + return $ Pandoc (Meta title author date) $ filter (/= Null) blocks + +-- +-- initial pass for references and notes +-- + +referenceKey :: GenParser Char ParserState [Char] +referenceKey = try $ do + startPos <- getPosition + nonindentSpaces + lab <- reference + char ':' + skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') + let sourceURL excludes = many $ + optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' ')) + src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" + tit <- option "" referenceTitle + blanklines + endPos <- getPosition + let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src, tit)) + st <- getState + let oldkeys = stateKeys st + updateState $ \s -> s { stateKeys = newkey : oldkeys } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +referenceTitle :: GenParser Char st String +referenceTitle = try $ do + skipSpaces >> optional newline >> skipSpaces + tit <- (charsInBalanced '(' ')' >>= return . unwords . words) + <|> do delim <- char '\'' <|> char '"' + manyTill anyChar (try (char delim >> skipSpaces >> + notFollowedBy (noneOf ")\n"))) + return $ decodeCharacterReferences tit + +noteMarker :: GenParser Char st [Char] +noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']') + +rawLine :: GenParser Char ParserState [Char] +rawLine = do + notFollowedBy blankline + notFollowedBy' noteMarker + contents <- many1 nonEndline + end <- option "" (newline >> optional indentSpaces >> return "\n") + return $ contents ++ end + +rawLines :: GenParser Char ParserState [Char] +rawLines = many1 rawLine >>= return . concat + +noteBlock :: GenParser Char ParserState [Char] +noteBlock = try $ do + startPos <- getPosition + ref <- noteMarker + char ':' + optional blankline + optional indentSpaces + raw <- sepBy rawLines (try (blankline >> indentSpaces)) + optional blanklines + endPos <- getPosition + -- parse the extracted text, which may contain various block elements: + contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" + let newnote = (ref, contents) + st <- getState + let oldnotes = stateNotes st + updateState $ \s -> s { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +-- +-- parsing blocks +-- + +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = manyTill block eof + +block :: GenParser Char ParserState Block +block = do + st <- getState + choice (if stateStrict st + then [ header + , codeBlockIndented + , blockQuote + , hrule + , bulletList + , orderedList + , htmlBlock + , para + , plain + , nullBlock ] + else [ codeBlockDelimited + , header + , table + , codeBlockIndented + , lhsCodeBlock + , blockQuote + , hrule + , bulletList + , orderedList + , definitionList + , para + , rawHtmlBlocks + , plain + , nullBlock ]) "block" + +-- +-- header blocks +-- + +header :: GenParser Char ParserState Block +header = setextHeader <|> atxHeader "header" + +atxHeader :: GenParser Char ParserState Block +atxHeader = try $ do + level <- many1 (char '#') >>= return . length + notFollowedBy (char '.' <|> char ')') -- this would be a list + skipSpaces + text <- manyTill inline atxClosing >>= return . normalizeSpaces + return $ Header level text + +atxClosing :: GenParser Char st [Char] +atxClosing = try $ skipMany (char '#') >> blanklines + +setextHeader :: GenParser Char ParserState Block +setextHeader = try $ do + text <- many1Till inline newline + underlineChar <- oneOf setextHChars + many (char underlineChar) + blanklines + let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 + return $ Header level (normalizeSpaces text) + +-- +-- hrule block +-- + +hrule :: GenParser Char st Block +hrule = try $ do + skipSpaces + start <- oneOf hruleChars + count 2 (skipSpaces >> char start) + skipMany (oneOf spaceChars <|> char start) + newline + optional blanklines + return HorizontalRule + +-- +-- code blocks +-- + +indentedLine :: GenParser Char ParserState [Char] +indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") + +codeBlockDelimiter :: Maybe Int + -> GenParser Char st (Int, ([Char], [[Char]], [([Char], [Char])])) +codeBlockDelimiter len = try $ do + size <- case len of + Just l -> count l (char '~') >> many (char '~') >> return l + Nothing -> count 3 (char '~') >> many (char '~') >>= + return . (+ 3) . length + many spaceChar + attr <- option ([],[],[]) attributes + blankline + return (size, attr) + +attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) +attributes = try $ do + char '{' + many spaceChar + attrs <- many (attribute >>~ many spaceChar) + char '}' + let (ids, classes, keyvals) = unzip3 attrs + let id' = if null ids then "" else head ids + return (id', concat classes, concat keyvals) + +attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) +attribute = identifierAttr <|> classAttr <|> keyValAttr + +identifier :: GenParser Char st [Char] +identifier = do + first <- letter + rest <- many alphaNum + return (first:rest) + +identifierAttr :: GenParser Char st ([Char], [a], [a1]) +identifierAttr = try $ do + char '#' + result <- identifier + return (result,[],[]) + +classAttr :: GenParser Char st ([Char], [[Char]], [a]) +classAttr = try $ do + char '.' + result <- identifier + return ("",[result],[]) + +keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])]) +keyValAttr = try $ do + key <- identifier + char '=' + char '"' + val <- manyTill (noneOf "\n") (char '"') + return ("",[],[(key,val)]) + +codeBlockDelimited :: GenParser Char st Block +codeBlockDelimited = try $ do + (size, attr) <- codeBlockDelimiter Nothing + contents <- manyTill anyLine (codeBlockDelimiter (Just size)) + blanklines + return $ CodeBlock attr $ intercalate "\n" contents + +codeBlockIndented :: GenParser Char ParserState Block +codeBlockIndented = do + contents <- many1 (indentedLine <|> + try (do b <- blanklines + l <- indentedLine + return $ b ++ l)) + optional blanklines + return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents + +lhsCodeBlock :: GenParser Char ParserState Block +lhsCodeBlock = do + failUnlessLHS + contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX + return $ CodeBlock ("",["sourceCode","haskell"],[]) contents + +lhsCodeBlockLaTeX :: GenParser Char ParserState String +lhsCodeBlockLaTeX = try $ do + string "\\begin{code}" + manyTill spaceChar newline + contents <- many1Till anyChar (try $ string "\\end{code}") + blanklines + return $ stripTrailingNewlines contents + +lhsCodeBlockBird :: GenParser Char ParserState String +lhsCodeBlockBird = try $ do + pos <- getPosition + when (sourceColumn pos /= 1) $ fail "Not in first column" + lns <- many1 birdTrackLine + -- if (as is normal) there is always a space after >, drop it + let lns' = if all (\ln -> null ln || take 1 ln == " ") lns + then map (drop 1) lns + else lns + blanklines + return $ intercalate "\n" lns' + +birdTrackLine :: GenParser Char st [Char] +birdTrackLine = do + char '>' + manyTill anyChar newline + + +-- +-- block quotes +-- + +emailBlockQuoteStart :: GenParser Char ParserState Char +emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ') + +emailBlockQuote :: GenParser Char ParserState [[Char]] +emailBlockQuote = try $ do + emailBlockQuoteStart + raw <- sepBy (many (nonEndline <|> + (try (endline >> notFollowedBy emailBlockQuoteStart >> + return '\n')))) + (try (newline >> emailBlockQuoteStart)) + newline <|> (eof >> return '\n') + optional blanklines + return raw + +blockQuote :: GenParser Char ParserState Block +blockQuote = do + raw <- emailBlockQuote + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" + return $ BlockQuote contents + +-- +-- list blocks +-- + +bulletListStart :: GenParser Char ParserState () +bulletListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + nonindentSpaces + notFollowedBy' hrule -- because hrules start out just like lists + oneOf bulletListMarkers + spaceChar + skipSpaces + +anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim) +anyOrderedListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + nonindentSpaces + notFollowedBy $ string "p." >> spaceChar >> digit -- page number + state <- getState + if stateStrict state + then do many1 digit + char '.' + spaceChar + return (1, DefaultStyle, DefaultDelim) + else do (num, style, delim) <- anyOrderedListMarker + -- if it could be an abbreviated first name, insist on more than one space + if delim == Period && (style == UpperAlpha || (style == UpperRoman && + num `elem` [1, 5, 10, 50, 100, 500, 1000])) + then char '\t' <|> (char ' ' >>~ notFollowedBy (satisfy isUpper)) + else spaceChar + skipSpaces + return (num, style, delim) + +listStart :: GenParser Char ParserState () +listStart = bulletListStart <|> (anyOrderedListStart >> return ()) + +-- parse a line of a list item (start = parser for beginning of list item) +listLine :: GenParser Char ParserState [Char] +listLine = try $ do + notFollowedBy' listStart + notFollowedBy blankline + notFollowedBy' (do indentSpaces + many (spaceChar) + listStart) + line <- manyTill anyChar newline + return $ line ++ "\n" + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem :: GenParser Char ParserState [Char] +rawListItem = try $ do + listStart + result <- many1 listLine + blanks <- many blankline + return $ concat result ++ blanks + +-- continuation of a list item - indented and separated by blankline +-- or (in compact lists) endline. +-- note: nested lists are parsed as continuations +listContinuation :: GenParser Char ParserState [Char] +listContinuation = try $ do + lookAhead indentSpaces + result <- many1 listContinuationLine + blanks <- many blankline + return $ concat result ++ blanks + +listContinuationLine :: GenParser Char ParserState [Char] +listContinuationLine = try $ do + notFollowedBy blankline + notFollowedBy' listStart + optional indentSpaces + result <- manyTill anyChar newline + return $ result ++ "\n" + +listItem :: GenParser Char ParserState [Block] +listItem = try $ do + first <- rawListItem + continuations <- many listContinuation + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let oldContext = stateParserContext state + setState $ state {stateParserContext = ListItemState} + -- parse the extracted block, which may contain various block elements: + let raw = concat (first:continuations) + contents <- parseFromString parseBlocks raw + updateState (\st -> st {stateParserContext = oldContext}) + return contents + +orderedList :: GenParser Char ParserState Block +orderedList = try $ do + (start, style, delim) <- lookAhead anyOrderedListStart + items <- many1 listItem + return $ OrderedList (start, style, delim) $ compactify items + +bulletList :: GenParser Char ParserState Block +bulletList = try $ do + lookAhead bulletListStart + many1 listItem >>= return . BulletList . compactify + +-- definition lists + +definitionListItem :: GenParser Char ParserState ([Inline], [Block]) +definitionListItem = try $ do + notFollowedBy blankline + notFollowedBy' indentSpaces + -- first, see if this has any chance of being a definition list: + lookAhead (anyLine >> char ':') + term <- manyTill inline newline + raw <- many1 defRawBlock + state <- getState + let oldContext = stateParserContext state + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ concat raw + updateState (\st -> st {stateParserContext = oldContext}) + return ((normalizeSpaces term), contents) + +defRawBlock :: GenParser Char ParserState [Char] +defRawBlock = try $ do + char ':' + state <- getState + let tabStop = stateTabStop state + try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t") + firstline <- anyLine + rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) + trailing <- option "" blanklines + return $ firstline ++ "\n" ++ unlines rawlines ++ trailing + +definitionList :: GenParser Char ParserState Block +definitionList = do + items <- many1 definitionListItem + let (terms, defs) = unzip items + let defs' = compactify defs + let items' = zip terms defs' + return $ DefinitionList items' + +-- +-- paragraph block +-- + +isHtmlOrBlank :: Inline -> Bool +isHtmlOrBlank (HtmlInline _) = True +isHtmlOrBlank (Space) = True +isHtmlOrBlank (LineBreak) = True +isHtmlOrBlank _ = False + +para :: GenParser Char ParserState Block +para = try $ do + result <- many1 inline + if all isHtmlOrBlank result + then fail "treat as raw HTML" + else return () + newline + blanklines <|> do st <- getState + if stateStrict st + then lookAhead (blockQuote <|> header) >> return "" + else pzero + return $ Para $ normalizeSpaces result + +plain :: GenParser Char ParserState Block +plain = many1 inline >>= return . Plain . normalizeSpaces + +-- +-- raw html +-- + +htmlElement :: GenParser Char ParserState [Char] +htmlElement = strictHtmlBlock <|> htmlBlockElement "html element" + +htmlBlock :: GenParser Char ParserState Block +htmlBlock = try $ do + failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return $ RawHtml $ first ++ finalSpace ++ finalNewlines + +-- True if tag is self-closing +isSelfClosing :: [Char] -> Bool +isSelfClosing tag = + isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag + +strictHtmlBlock :: GenParser Char ParserState [Char] +strictHtmlBlock = try $ do + tag <- anyHtmlBlockTag + let tag' = extractTagType tag + if isSelfClosing tag || tag' == "hr" + then return tag + else do contents <- many (notFollowedBy' (htmlEndTag tag') >> + (htmlElement <|> (count 1 anyChar))) + end <- htmlEndTag tag' + return $ tag ++ concat contents ++ end + +rawHtmlBlocks :: GenParser Char ParserState Block +rawHtmlBlocks = do + htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock + sps <- do sp1 <- many spaceChar + sp2 <- option "" (blankline >> return "\n") + sp3 <- many spaceChar + sp4 <- option "" blanklines + return $ sp1 ++ sp2 ++ sp3 ++ sp4 + -- note: we want raw html to be able to + -- precede a code block, when separated + -- by a blank line + return $ blk ++ sps + let combined = concat htmlBlocks + let combined' = if last combined == '\n' then init combined else combined + return $ RawHtml combined' + +-- +-- Tables +-- + +-- Parse a dashed line with optional trailing spaces; return its length +-- and the length including trailing space. +dashedLine :: Char + -> GenParser Char st (Int, Int) +dashedLine ch = do + dashes <- many1 (char ch) + sp <- many spaceChar + return $ (length dashes, length $ dashes ++ sp) + +-- Parse a table header with dashed lines of '-' preceded by +-- one line of text. +simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) +simpleTableHeader = try $ do + rawContent <- anyLine + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines') = unzip dashes + let indices = scanl (+) (length initSp) lines' + let rawHeads = tail $ splitByIndices (init indices) rawContent + let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths + return (rawHeads, aligns, indices) + +-- Parse a table footer - dashed lines followed by blank line. +tableFooter :: GenParser Char ParserState [Char] +tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines + +-- Parse a table separator - dashed line. +tableSep :: GenParser Char ParserState String +tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n" + +-- Parse a raw line and split it into chunks by indices. +rawTableLine :: [Int] + -> GenParser Char ParserState [String] +rawTableLine indices = do + notFollowedBy' (blanklines <|> tableFooter) + line <- many1Till anyChar newline + return $ map removeLeadingTrailingSpace $ tail $ + splitByIndices (init indices) line + +-- Parse a table line and return a list of lists of blocks (columns). +tableLine :: [Int] + -> GenParser Char ParserState [[Block]] +tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) + +-- Parse a multiline table row and return a list of blocks (columns). +multilineRow :: [Int] + -> GenParser Char ParserState [[Block]] +multilineRow indices = do + colLines <- many1 (rawTableLine indices) + optional blanklines + let cols = map unlines $ transpose colLines + mapM (parseFromString (many plain)) cols + +-- Calculate relative widths of table columns, based on indices +widthsFromIndices :: Int -- Number of columns on terminal + -> [Int] -- Indices + -> [Double] -- Fractional relative sizes of columns +widthsFromIndices _ [] = [] +widthsFromIndices numColumns indices = + let lengths = zipWith (-) indices (0:indices) + totLength = sum lengths + quotient = if totLength > numColumns + then fromIntegral totLength + else fromIntegral numColumns + fracs = map (\l -> (fromIntegral l) / quotient) lengths in + tail fracs + +-- Parses a table caption: inlines beginning with 'Table:' +-- and followed by blank lines. +tableCaption :: GenParser Char ParserState [Inline] +tableCaption = try $ do + nonindentSpaces + string "Table:" + result <- many1 inline + blanklines + return $ normalizeSpaces result + +-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. +tableWith :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) + -> ([Int] -> GenParser Char ParserState [[Block]]) + -> GenParser Char ParserState end + -> GenParser Char ParserState Block +tableWith headerParser lineParser footerParser = try $ do + (rawHeads, aligns, indices) <- headerParser + lines' <- many1Till (lineParser indices) footerParser + caption <- option [] tableCaption + heads <- mapM (parseFromString (many plain)) rawHeads + state <- getState + let numColumns = stateColumns state + let widths = widthsFromIndices numColumns indices + return $ Table caption aligns widths heads lines' + +-- Parse a simple table with '---' header and one line per row. +simpleTable :: GenParser Char ParserState Block +simpleTable = tableWith simpleTableHeader tableLine blanklines + +-- Parse a multiline table: starts with row of '-' on top, then header +-- (which may be multiline), then the rows, +-- which may be multiline, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +multilineTable :: GenParser Char ParserState Block +multilineTable = tableWith multilineTableHeader multilineRow tableFooter + +multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int]) +multilineTableHeader = try $ do + tableSep + rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines') = unzip dashes + let indices = scanl (+) (length initSp) lines' + let rawHeadsList = transpose $ map + (\ln -> tail $ splitByIndices (init indices) ln) + rawContent + let rawHeads = map (intercalate " ") rawHeadsList + let aligns = zipWith alignType rawHeadsList lengths + return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) + +-- Returns an alignment type for a table, based on a list of strings +-- (the rows of the column header) and a number (the length of the +-- dashed line under the rows. +alignType :: [String] + -> Int + -> Alignment +alignType [] _ = AlignDefault +alignType strLst len = + let s = head $ sortBy (comparing length) $ + map removeTrailingSpace strLst + leftSpace = if null s then False else (s !! 0) `elem` " \t" + rightSpace = length s < len || (s !! (len - 1)) `elem` " \t" + in case (leftSpace, rightSpace) of + (True, False) -> AlignRight + (False, True) -> AlignLeft + (True, True) -> AlignCenter + (False, False) -> AlignDefault + +table :: GenParser Char ParserState Block +table = simpleTable <|> multilineTable "table" + +-- +-- inline +-- + +inline :: GenParser Char ParserState Inline +inline = choice inlineParsers "inline" + +inlineParsers :: [GenParser Char ParserState Inline] +inlineParsers = [ abbrev + , str + , smartPunctuation + , whitespace + , endline + , code + , charRef + , strong + , emph + , note + , inlineNote + , link +#ifdef _CITEPROC + , inlineCitation +#endif + , image + , math + , strikeout + , superscript + , subscript + , autoLink + , rawHtmlInline' + , rawLaTeXInline' + , escapedChar + , symbol + , ltSign ] + +inlineNonLink :: GenParser Char ParserState Inline +inlineNonLink = (choice $ + map (\parser -> try (parser >>= failIfLink)) inlineParsers) + "inline (non-link)" + +failIfLink :: Inline -> GenParser tok st Inline +failIfLink (Link _ _) = pzero +failIfLink elt = return elt + +escapedChar :: GenParser Char ParserState Inline +escapedChar = do + char '\\' + state <- getState + result <- option '\\' $ if stateStrict state + then oneOf "\\`*_{}[]()>#+-.!~" + else satisfy (not . isAlphaNum) + let result' = if result == ' ' + then '\160' -- '\ ' is a nonbreaking space + else result + return $ Str [result'] + +ltSign :: GenParser Char ParserState Inline +ltSign = do + st <- getState + if stateStrict st + then char '<' + else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html + return $ Str ['<'] + +specialCharsMinusLt :: [Char] +specialCharsMinusLt = filter (/= '<') specialChars + +symbol :: GenParser Char ParserState Inline +symbol = do + result <- oneOf specialCharsMinusLt + return $ Str [result] + +-- parses inline code, between n `s and n `s +code :: GenParser Char ParserState Inline +code = try $ do + starts <- many1 (char '`') + skipSpaces + result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> + (char '\n' >> return " ")) + (try (skipSpaces >> count (length starts) (char '`') >> + notFollowedBy (char '`'))) + return $ Code $ removeLeadingTrailingSpace $ concat result + +mathWord :: GenParser Char st [Char] +mathWord = many1 ((noneOf " \t\n\\$") <|> + (try (char '\\') >>~ notFollowedBy (char '$'))) + +math :: GenParser Char ParserState Inline +math = (mathDisplay >>= return . Math DisplayMath) + <|> (mathInline >>= return . Math InlineMath) + +mathDisplay :: GenParser Char ParserState String +mathDisplay = try $ do + failIfStrict + string "$$" + many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$") + +mathInline :: GenParser Char ParserState String +mathInline = try $ do + failIfStrict + char '$' + notFollowedBy space + words' <- sepBy1 mathWord (many1 (spaceChar <|> (newline >>~ notFollowedBy' blankline))) + char '$' + notFollowedBy digit + return $ intercalate " " words' + +emph :: GenParser Char ParserState Inline +emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|> + (enclosed (char '_') (notFollowedBy' strong >> char '_' >> + notFollowedBy alphaNum) inline)) >>= + return . Emph . normalizeSpaces + +strong :: GenParser Char ParserState Inline +strong = ((enclosed (string "**") (try $ string "**") inline) <|> + (enclosed (string "__") (try $ string "__") inline)) >>= + return . Strong . normalizeSpaces + +strikeout :: GenParser Char ParserState Inline +strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>= + return . Strikeout . normalizeSpaces + +superscript :: GenParser Char ParserState Inline +superscript = failIfStrict >> enclosed (char '^') (char '^') + (notFollowedBy' whitespace >> inline) >>= -- may not contain Space + return . Superscript + +subscript :: GenParser Char ParserState Inline +subscript = failIfStrict >> enclosed (char '~') (char '~') + (notFollowedBy' whitespace >> inline) >>= -- may not contain Space + return . Subscript + +abbrev :: GenParser Char ParserState Inline +abbrev = failUnlessSmart >> + (assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160") + +-- an string of letters followed by a period that does not end a sentence +-- is assumed to be an abbreviation. It is assumed that sentences don't +-- start with lowercase letters or numerals. +assumedAbbrev :: GenParser Char ParserState [Char] +assumedAbbrev = try $ do + result <- many1 $ satisfy isAlpha + string ". " + lookAhead $ satisfy (\x -> isLower x || isDigit x) + return result + +-- these strings are treated as abbreviations even if they are followed +-- by a capital letter (such as a name). +knownAbbrev :: GenParser Char ParserState [Char] +knownAbbrev = try $ do + result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen", + "Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs", + "Sen", "Rep", "Pres", "Hon", "Rev" ] + string ". " + return result + +smartPunctuation :: GenParser Char ParserState Inline +smartPunctuation = failUnlessSmart >> + choice [ quoted, apostrophe, dash, ellipses ] + +apostrophe :: GenParser Char ParserState Inline +apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe + +quoted :: GenParser Char ParserState Inline +quoted = doubleQuoted <|> singleQuoted + +withQuoteContext :: QuoteContext + -> (GenParser Char ParserState Inline) + -> GenParser Char ParserState Inline +withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + +singleQuoted :: GenParser Char ParserState Inline +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= + return . Quoted SingleQuote . normalizeSpaces + +doubleQuoted :: GenParser Char ParserState Inline +doubleQuoted = try $ do + doubleQuoteStart + withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= + return . Quoted DoubleQuote . normalizeSpaces + +failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState () +failIfInQuoteContext context = do + st <- getState + if stateQuoteContext st == context + then fail "already inside quotes" + else return () + +singleQuoteStart :: GenParser Char ParserState Char +singleQuoteStart = do + failIfInQuoteContext InSingleQuote + char '\8216' <|> + (try $ do char '\'' + notFollowedBy (oneOf ")!],.;:-? \t\n") + notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> + satisfy (not . isAlphaNum))) + -- possess/contraction + return '\'') + +singleQuoteEnd :: GenParser Char st Char +singleQuoteEnd = try $ do + char '\8217' <|> char '\'' + notFollowedBy alphaNum + return '\'' + +doubleQuoteStart :: GenParser Char ParserState Char +doubleQuoteStart = do + failIfInQuoteContext InDoubleQuote + char '\8220' <|> + (try $ do char '"' + notFollowedBy (oneOf " \t\n") + return '"') + +doubleQuoteEnd :: GenParser Char st Char +doubleQuoteEnd = char '\8221' <|> char '"' + +ellipses :: GenParser Char st Inline +ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses + +dash :: GenParser Char st Inline +dash = enDash <|> emDash + +enDash :: GenParser Char st Inline +enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash + +emDash :: GenParser Char st Inline +emDash = oneOfStrings ["---", "--"] >> return EmDash + +whitespace :: GenParser Char ParserState Inline +whitespace = do + sps <- many1 (oneOf spaceChars) + if length sps >= 2 + then option Space (endline >> return LineBreak) + else return Space "whitespace" + +nonEndline :: GenParser Char st Char +nonEndline = satisfy (/='\n') + +strChar :: GenParser Char st Char +strChar = noneOf (specialChars ++ spaceChars ++ "\n") + +str :: GenParser Char st Inline +str = many1 strChar >>= return . Str + +-- an endline character that can be treated as a space, not a structural break +endline :: GenParser Char ParserState Inline +endline = try $ do + newline + notFollowedBy blankline + st <- getState + if stateStrict st + then do notFollowedBy emailBlockQuoteStart + notFollowedBy (char '#') -- atx header + else return () + -- parse potential list-starts differently if in a list: + if stateParserContext st == ListItemState + then notFollowedBy' (bulletListStart <|> + (anyOrderedListStart >> return ())) + else return () + return Space + +-- +-- links +-- + +-- a reference label for a link +reference :: GenParser Char ParserState [Inline] +reference = do notFollowedBy' (string "[^") -- footnote reference + result <- inlinesInBalancedBrackets inlineNonLink + return $ normalizeSpaces result + +-- source for a link, with optional title +source :: GenParser Char st (String, [Char]) +source = + (try $ charsInBalanced '(' ')' >>= parseFromString source') <|> + -- the following is needed for cases like: [ref](/url(a). + (enclosed (char '(') (char ')') anyChar >>= + parseFromString source') + +-- auxiliary function for source +source' :: GenParser Char st (String, [Char]) +source' = do + skipSpaces + let sourceURL excludes = many $ + optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' ')) + src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" + tit <- option "" linkTitle + skipSpaces + eof + return (intercalate "%20" $ words $ removeTrailingSpace src, tit) + +linkTitle :: GenParser Char st String +linkTitle = try $ do + (many1 spaceChar >> option '\n' newline) <|> newline + skipSpaces + delim <- oneOf "'\"" + tit <- manyTill (optional (char '\\') >> anyChar) + (try (char delim >> skipSpaces >> eof)) + return $ decodeCharacterReferences tit + +link :: GenParser Char ParserState Inline +link = try $ do + lab <- reference + src <- source <|> referenceLink lab + sanitize <- getState >>= return . stateSanitizeHTML + if sanitize && unsanitaryURI (fst src) + then fail "Unsanitary URI" + else return $ Link lab src + +-- a link like [this][ref] or [this][] or [this] +referenceLink :: [Inline] + -> GenParser Char ParserState (String, [Char]) +referenceLink lab = do + ref <- option [] (try (optional (char ' ') >> + optional (newline >> skipSpaces) >> reference)) + let ref' = if null ref then lab else ref + state <- getState + case lookupKeySrc (stateKeys state) ref' of + Nothing -> fail "no corresponding key" + Just target -> return target + +autoLink :: GenParser Char ParserState Inline +autoLink = try $ do + char '<' + src <- uri <|> (emailAddress >>= (return . ("mailto:" ++))) + char '>' + let src' = if "mailto:" `isPrefixOf` src + then drop 7 src + else src + st <- getState + let sanitize = stateSanitizeHTML st + if sanitize && unsanitaryURI src + then fail "Unsanitary URI" + else return $ if stateStrict st + then Link [Str src'] (src, "") + else Link [Code src'] (src, "") + +image :: GenParser Char ParserState Inline +image = try $ do + char '!' + (Link lab src) <- link + return $ Image lab src + +note :: GenParser Char ParserState Inline +note = try $ do + failIfStrict + ref <- noteMarker + state <- getState + let notes = stateNotes state + case lookup ref notes of + Nothing -> fail "note not found" + Just contents -> return $ Note contents + +inlineNote :: GenParser Char ParserState Inline +inlineNote = try $ do + failIfStrict + char '^' + contents <- inlinesInBalancedBrackets inline + return $ Note [Para contents] + +rawLaTeXInline' :: GenParser Char ParserState Inline +rawLaTeXInline' = do + failIfStrict + (rawConTeXtEnvironment' >>= return . TeX) + <|> (rawLaTeXEnvironment' >>= return . TeX) + <|> rawLaTeXInline + +rawConTeXtEnvironment' :: GenParser Char st String +rawConTeXtEnvironment' = try $ do + string "\\start" + completion <- inBrackets (letter <|> digit <|> spaceChar) + <|> (many1 letter) + contents <- manyTill (rawConTeXtEnvironment' <|> (count 1 anyChar)) + (try $ string "\\stop" >> string completion) + return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion + +inBrackets :: (GenParser Char st Char) -> GenParser Char st String +inBrackets parser = do + char '[' + contents <- many parser + char ']' + return $ "[" ++ contents ++ "]" + +rawHtmlInline' :: GenParser Char ParserState Inline +rawHtmlInline' = do + st <- getState + result <- if stateStrict st + then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] + else anyHtmlInlineTag + return $ HtmlInline result + +#ifdef _CITEPROC +inlineCitation :: GenParser Char ParserState Inline +inlineCitation = try $ do + failIfStrict + cit <- citeMarker + let citations = readWith parseCitation defaultParserState cit + mr <- mapM chkCit citations + if catMaybes mr /= [] + then return $ Cite citations [] + else fail "no citation found" + +chkCit :: Target -> GenParser Char ParserState (Maybe Target) +chkCit t = do + st <- getState + case lookupKeySrc (stateKeys st) [Str $ fst t] of + Just _ -> fail "This is a link" + Nothing -> if elem (fst t) $ stateCitations st + then return $ Just t + else return $ Nothing + +citeMarker :: GenParser Char ParserState String +citeMarker = char '[' >> manyTill ( noneOf "\n" <|> (newline >>~ notFollowedBy blankline) ) (char ']') + +parseCitation :: GenParser Char ParserState [(String,String)] +parseCitation = try $ sepBy (parseLabel) (oneOf ";") + +parseLabel :: GenParser Char ParserState (String,String) +parseLabel = try $ do + res <- sepBy (skipSpaces >> optional newline >> skipSpaces >> many1 (noneOf "@;")) (oneOf "@") + case res of + [lab,loc] -> return (lab, loc) + [lab] -> return (lab, "" ) + _ -> return ("" , "" ) + +#endif diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs new file mode 100644 index 000000000..255054c10 --- /dev/null +++ b/src/Text/Pandoc/Readers/RST.hs @@ -0,0 +1,707 @@ +{- +Copyright (C) 2006-8 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.RST + Copyright : Copyright (C) 2006-8 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion from reStructuredText to 'Pandoc' document. +-} +module Text.Pandoc.Readers.RST ( + readRST + ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.ParserCombinators.Parsec +import Control.Monad ( when ) +import Data.List ( findIndex, delete, intercalate ) + +-- | Parse reStructuredText string and return Pandoc document. +readRST :: ParserState -> String -> Pandoc +readRST state s = (readWith parseRST) state (s ++ "\n\n") + +-- +-- Constants and data structure definitions +--- + +bulletListMarkers :: [Char] +bulletListMarkers = "*+-" + +underlineChars :: [Char] +underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~" + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\\`|*_<>$:[-" + +-- +-- parsing documents +-- + +isHeader :: Int -> Block -> Bool +isHeader n (Header x _) = x == n +isHeader _ _ = False + +-- | Promote all headers in a list of blocks. (Part of +-- title transformation for RST.) +promoteHeaders :: Int -> [Block] -> [Block] +promoteHeaders num ((Header level text):rest) = + (Header (level - num) text):(promoteHeaders num rest) +promoteHeaders num (other:rest) = other:(promoteHeaders num rest) +promoteHeaders _ [] = [] + +-- | If list of blocks starts with a header (or a header and subheader) +-- of level that are not found elsewhere, return it as a title and +-- promote all the other headers. +titleTransform :: [Block] -- ^ list of blocks + -> ([Block], [Inline]) -- ^ modified list of blocks, title +titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title subtitle + if (any (isHeader 1) rest) || (any (isHeader 2) rest) + then ((Header 1 head1):(Header 2 head2):rest, []) + else ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2) +titleTransform ((Header 1 head1):rest) = -- title, no subtitle + if (any (isHeader 1) rest) + then ((Header 1 head1):rest, []) + else ((promoteHeaders 1 rest), head1) +titleTransform blocks = (blocks, []) + +parseRST :: GenParser Char ParserState Pandoc +parseRST = do + startPos <- getPosition + -- go through once just to get list of reference keys + -- docMinusKeys is the raw document with blanks where the keys were... + docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat + setInput docMinusKeys + setPosition startPos + st <- getState + let reversedKeys = stateKeys st + updateState $ \s -> s { stateKeys = reverse reversedKeys } + -- now parse it for real... + blocks <- parseBlocks + let blocks' = filter (/= Null) blocks + state <- getState + let (blocks'', title) = if stateStandalone state + then titleTransform blocks' + else (blocks', []) + let authors = stateAuthors state + let date = stateDate state + let title' = if (null title) then (stateTitle state) else title + return $ Pandoc (Meta title' authors date) blocks'' + +-- +-- parsing blocks +-- + +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = manyTill block eof + +block :: GenParser Char ParserState Block +block = choice [ codeBlock + , rawHtmlBlock + , rawLaTeXBlock + , fieldList + , blockQuote + , imageBlock + , unknownDirective + , header + , hrule + , list + , lineBlock + , lhsCodeBlock + , para + , plain + , nullBlock ] "block" + +-- +-- field list +-- + +fieldListItem :: String -> GenParser Char st ([Char], [Char]) +fieldListItem indent = try $ do + string indent + char ':' + name <- many1 alphaNum + string ": " + skipSpaces + first <- manyTill anyChar newline + rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >> + indentedBlock + return (name, intercalate " " (first:(lines rest))) + +fieldList :: GenParser Char ParserState Block +fieldList = try $ do + indent <- lookAhead $ many (oneOf " \t") + items <- many1 $ fieldListItem indent + blanklines + let authors = case lookup "Authors" items of + Just auth -> [auth] + Nothing -> map snd (filter (\(x,_) -> x == "Author") items) + if null authors + then return () + else updateState $ \st -> st {stateAuthors = authors} + case (lookup "Date" items) of + Just dat -> updateState $ \st -> st {stateDate = dat} + Nothing -> return () + case (lookup "Title" items) of + Just tit -> parseFromString (many inline) tit >>= + \t -> updateState $ \st -> st {stateTitle = t} + Nothing -> return () + let remaining = filter (\(x,_) -> (x /= "Authors") && (x /= "Author") && + (x /= "Date") && (x /= "Title")) items + if null remaining + then return Null + else do terms <- mapM (return . (:[]) . Str . fst) remaining + defs <- mapM (parseFromString (many block) . snd) + remaining + return $ DefinitionList $ zip terms defs + +-- +-- line block +-- + +lineBlockLine :: GenParser Char ParserState [Inline] +lineBlockLine = try $ do + string "| " + white <- many (oneOf " \t") + line <- manyTill inline newline + return $ (if null white then [] else [Str white]) ++ line ++ [LineBreak] + +lineBlock :: GenParser Char ParserState Block +lineBlock = try $ do + lines' <- many1 lineBlockLine + blanklines + return $ Para (concat lines') + +-- +-- paragraph block +-- + +para :: GenParser Char ParserState Block +para = paraBeforeCodeBlock <|> paraNormal "paragraph" + +codeBlockStart :: GenParser Char st Char +codeBlockStart = string "::" >> blankline >> blankline + +-- paragraph that ends in a :: starting a code block +paraBeforeCodeBlock :: GenParser Char ParserState Block +paraBeforeCodeBlock = try $ do + result <- many1 (notFollowedBy' codeBlockStart >> inline) + lookAhead (string "::") + return $ Para $ if last result == Space + then normalizeSpaces result + else (normalizeSpaces result) ++ [Str ":"] + +-- regular paragraph +paraNormal :: GenParser Char ParserState Block +paraNormal = try $ do + result <- many1 inline + newline + blanklines + return $ Para $ normalizeSpaces result + +plain :: GenParser Char ParserState Block +plain = many1 inline >>= return . Plain . normalizeSpaces + +-- +-- image block +-- + +imageBlock :: GenParser Char st Block +imageBlock = try $ do + string ".. image:: " + src <- manyTill anyChar newline + fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t") + many1 $ fieldListItem indent + optional blanklines + case lookup "alt" fields of + Just alt -> return $ Plain [Image [Str alt] (src, alt)] + Nothing -> return $ Plain [Image [Str "image"] (src, "")] +-- +-- header blocks +-- + +header :: GenParser Char ParserState Block +header = doubleHeader <|> singleHeader "header" + +-- a header with lines on top and bottom +doubleHeader :: GenParser Char ParserState Block +doubleHeader = try $ do + c <- oneOf underlineChars + rest <- many (char c) -- the top line + let lenTop = length (c:rest) + skipSpaces + newline + txt <- many1 (notFollowedBy blankline >> inline) + pos <- getPosition + let len = (sourceColumn pos) - 1 + if (len > lenTop) then fail "title longer than border" else return () + blankline -- spaces and newline + count lenTop (char c) -- the bottom line + blanklines + -- check to see if we've had this kind of header before. + -- if so, get appropriate level. if not, add to list. + state <- getState + let headerTable = stateHeaderTable state + let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return $ Header level (normalizeSpaces txt) + +-- a header with line on the bottom only +singleHeader :: GenParser Char ParserState Block +singleHeader = try $ do + notFollowedBy' whitespace + txt <- many1 (do {notFollowedBy blankline; inline}) + pos <- getPosition + let len = (sourceColumn pos) - 1 + blankline + c <- oneOf underlineChars + count (len - 1) (char c) + many (char c) + blanklines + state <- getState + let headerTable = stateHeaderTable state + let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return $ Header level (normalizeSpaces txt) + +-- +-- hrule block +-- + +hrule :: GenParser Char st Block +hrule = try $ do + chr <- oneOf underlineChars + count 3 (char chr) + skipMany (char chr) + blankline + blanklines + return HorizontalRule + +-- +-- code blocks +-- + +-- read a line indented by a given string +indentedLine :: String -> GenParser Char st [Char] +indentedLine indents = try $ do + string indents + result <- manyTill anyChar newline + return $ result ++ "\n" + +-- two or more indented lines, possibly separated by blank lines. +-- any amount of indentation will work. +indentedBlock :: GenParser Char st [Char] +indentedBlock = do + indents <- lookAhead $ many1 (oneOf " \t") + lns <- many $ choice $ [ indentedLine indents, + try $ do b <- blanklines + l <- indentedLine indents + return (b ++ l) ] + optional blanklines + return $ concat lns + +codeBlock :: GenParser Char st Block +codeBlock = try $ do + codeBlockStart + result <- indentedBlock + return $ CodeBlock ("",[],[]) $ stripTrailingNewlines result + +lhsCodeBlock :: GenParser Char ParserState Block +lhsCodeBlock = try $ do + failUnlessLHS + pos <- getPosition + when (sourceColumn pos /= 1) $ fail "Not in first column" + lns <- many1 birdTrackLine + -- if (as is normal) there is always a space after >, drop it + let lns' = if all (\ln -> null ln || take 1 ln == " ") lns + then map (drop 1) lns + else lns + blanklines + return $ CodeBlock ("", ["sourceCode", "haskell"], []) $ intercalate "\n" lns' + +birdTrackLine :: GenParser Char st [Char] +birdTrackLine = do + char '>' + manyTill anyChar newline + +-- +-- raw html +-- + +rawHtmlBlock :: GenParser Char st Block +rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >> + indentedBlock >>= return . RawHtml + +-- +-- raw latex +-- + +rawLaTeXBlock :: GenParser Char st Block +rawLaTeXBlock = try $ do + string ".. raw:: latex" + blanklines + result <- indentedBlock + return $ Para [(TeX result)] + +-- +-- block quotes +-- + +blockQuote :: GenParser Char ParserState Block +blockQuote = do + raw <- indentedBlock + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ raw ++ "\n\n" + return $ BlockQuote contents + +-- +-- list blocks +-- + +list :: GenParser Char ParserState Block +list = choice [ bulletList, orderedList, definitionList ] "list" + +definitionListItem :: GenParser Char ParserState ([Inline], [Block]) +definitionListItem = try $ do + -- avoid capturing a directive or comment + notFollowedBy (try $ char '.' >> char '.') + term <- many1Till inline endline + raw <- indentedBlock + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ raw ++ "\n\n" + return (normalizeSpaces term, contents) + +definitionList :: GenParser Char ParserState Block +definitionList = many1 definitionListItem >>= return . DefinitionList + +-- parses bullet list start and returns its length (inc. following whitespace) +bulletListStart :: GenParser Char st Int +bulletListStart = try $ do + notFollowedBy' hrule -- because hrules start out just like lists + marker <- oneOf bulletListMarkers + white <- many1 spaceChar + return $ length (marker:white) + +-- parses ordered list start and returns its length (inc following whitespace) +orderedListStart :: ListNumberStyle + -> ListNumberDelim + -> GenParser Char st Int +orderedListStart style delim = try $ do + (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) + white <- many1 spaceChar + return $ markerLen + length white + +-- parse a line of a list item +listLine :: Int -> GenParser Char ParserState [Char] +listLine markerLength = try $ do + notFollowedBy blankline + indentWith markerLength + line <- manyTill anyChar newline + return $ line ++ "\n" + +-- indent by specified number of spaces (or equiv. tabs) +indentWith :: Int -> GenParser Char ParserState [Char] +indentWith num = do + state <- getState + let tabStop = stateTabStop state + if (num < tabStop) + then count num (char ' ') + else choice [ try (count num (char ' ')), + (try (char '\t' >> count (num - tabStop) (char ' '))) ] + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem :: GenParser Char ParserState Int + -> GenParser Char ParserState (Int, [Char]) +rawListItem start = try $ do + markerLength <- start + firstLine <- manyTill anyChar newline + restLines <- many (listLine markerLength) + return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) + +-- continuation of a list item - indented and separated by blankline or +-- (in compact lists) endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int -> GenParser Char ParserState [Char] +listContinuation markerLength = try $ do + blanks <- many1 blankline + result <- many1 (listLine markerLength) + return $ blanks ++ concat result + +listItem :: GenParser Char ParserState Int + -> GenParser Char ParserState [Block] +listItem start = try $ do + (markerLength, first) <- rawListItem start + rest <- many (listContinuation markerLength) + blanks <- choice [ try (many blankline >>~ lookAhead start), + many1 blankline ] -- whole list must end with blank. + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let oldContext = stateParserContext state + setState $ state {stateParserContext = ListItemState} + -- parse the extracted block, which may itself contain block elements + parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks + updateState (\st -> st {stateParserContext = oldContext}) + return parsed + +orderedList :: GenParser Char ParserState Block +orderedList = try $ do + (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) + items <- many1 (listItem (orderedListStart style delim)) + let items' = compactify items + return $ OrderedList (start, style, delim) items' + +bulletList :: GenParser Char ParserState Block +bulletList = many1 (listItem bulletListStart) >>= + return . BulletList . compactify + +-- +-- unknown directive (e.g. comment) +-- + +unknownDirective :: GenParser Char st Block +unknownDirective = try $ do + string ".." + notFollowedBy (noneOf " \t\n") + manyTill anyChar newline + many $ blanklines <|> (oneOf " \t" >> manyTill anyChar newline) + return Null + +-- +-- reference key +-- + +quotedReferenceName :: GenParser Char ParserState [Inline] +quotedReferenceName = try $ do + char '`' >> notFollowedBy (char '`') -- `` means inline code! + label' <- many1Till inline (char '`') + return label' + +unquotedReferenceName :: GenParser Char ParserState [Inline] +unquotedReferenceName = try $ do + label' <- many1Till inline (lookAhead $ char ':') + return label' + +isolated :: Char -> GenParser Char st Char +isolated ch = try $ char ch >>~ notFollowedBy (char ch) + +simpleReferenceName :: GenParser Char st [Inline] +simpleReferenceName = do + raw <- many1 (alphaNum <|> isolated '-' <|> isolated '.' <|> + (try $ char '_' >>~ lookAhead alphaNum)) + return [Str raw] + +referenceName :: GenParser Char ParserState [Inline] +referenceName = quotedReferenceName <|> + (try $ simpleReferenceName >>~ lookAhead (char ':')) <|> + unquotedReferenceName + +referenceKey :: GenParser Char ParserState [Char] +referenceKey = do + startPos <- getPosition + key <- choice [imageKey, anonymousKey, regularKey] + st <- getState + let oldkeys = stateKeys st + updateState $ \s -> s { stateKeys = key : oldkeys } + optional blanklines + endPos <- getPosition + -- return enough blanks to replace key + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +targetURI :: GenParser Char st [Char] +targetURI = do + skipSpaces + optional newline + contents <- many1 (try (many spaceChar >> newline >> + many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n") + blanklines + return contents + +imageKey :: GenParser Char ParserState ([Inline], (String, [Char])) +imageKey = try $ do + string ".. |" + ref <- manyTill inline (char '|') + skipSpaces + string "image::" + src <- targetURI + return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + +anonymousKey :: GenParser Char st ([Inline], (String, [Char])) +anonymousKey = try $ do + oneOfStrings [".. __:", "__"] + src <- targetURI + return ([Str "_"], (removeLeadingTrailingSpace src, "")) + +regularKey :: GenParser Char ParserState ([Inline], (String, [Char])) +regularKey = try $ do + string ".. _" + ref <- referenceName + char ':' + src <- targetURI + return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + + -- + -- inline + -- + +inline :: GenParser Char ParserState Inline +inline = choice [ link + , str + , whitespace + , endline + , strong + , emph + , code + , image + , hyphens + , superscript + , subscript + , escapedChar + , symbol ] "inline" + +hyphens :: GenParser Char ParserState Inline +hyphens = do + result <- many1 (char '-') + option Space endline + -- don't want to treat endline after hyphen or dash as a space + return $ Str result + +escapedChar :: GenParser Char st Inline +escapedChar = escaped anyChar + +symbol :: GenParser Char ParserState Inline +symbol = do + result <- oneOf specialChars + return $ Str [result] + +-- parses inline code, between codeStart and codeEnd +code :: GenParser Char ParserState Inline +code = try $ do + string "``" + result <- manyTill anyChar (try (string "``")) + return $ Code $ removeLeadingTrailingSpace $ intercalate " " $ lines result + +emph :: GenParser Char ParserState Inline +emph = enclosed (char '*') (char '*') inline >>= + return . Emph . normalizeSpaces + +strong :: GenParser Char ParserState Inline +strong = enclosed (string "**") (try $ string "**") inline >>= + return . Strong . normalizeSpaces + +interpreted :: [Char] -> GenParser Char st [Inline] +interpreted role = try $ do + optional $ try $ string "\\ " + result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar + try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "") + return [Str result] + +superscript :: GenParser Char ParserState Inline +superscript = interpreted "sup" >>= (return . Superscript) + +subscript :: GenParser Char ParserState Inline +subscript = interpreted "sub" >>= (return . Subscript) + +whitespace :: GenParser Char ParserState Inline +whitespace = many1 spaceChar >> return Space "whitespace" + +str :: GenParser Char ParserState Inline +str = many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str + +-- an endline character that can be treated as a space, not a structural break +endline :: GenParser Char ParserState Inline +endline = try $ do + newline + notFollowedBy blankline + -- parse potential list-starts at beginning of line differently in a list: + st <- getState + if (stateParserContext st) == ListItemState + then notFollowedBy (anyOrderedListMarker >> spaceChar) >> + notFollowedBy' bulletListStart + else return () + return Space + +-- +-- links +-- + +link :: GenParser Char ParserState Inline +link = choice [explicitLink, referenceLink, autoLink] "link" + +explicitLink :: GenParser Char ParserState Inline +explicitLink = try $ do + char '`' + notFollowedBy (char '`') -- `` marks start of inline code + label' <- manyTill (notFollowedBy (char '`') >> inline) + (try (spaces >> char '<')) + src <- manyTill (noneOf ">\n ") (char '>') + skipSpaces + string "`_" + return $ Link (normalizeSpaces label') (removeLeadingTrailingSpace src, "") + +referenceLink :: GenParser Char ParserState Inline +referenceLink = try $ do + label' <- (quotedReferenceName <|> simpleReferenceName) >>~ char '_' + key <- option label' (do{char '_'; return [Str "_"]}) -- anonymous link + state <- getState + let keyTable = stateKeys state + src <- case lookupKeySrc keyTable key of + Nothing -> fail "no corresponding key" + Just target -> return target + -- if anonymous link, remove first anon key so it won't be used again + let keyTable' = if (key == [Str "_"]) -- anonymous link? + then delete ([Str "_"], src) keyTable -- remove first anon key + else keyTable + setState $ state { stateKeys = keyTable' } + return $ Link (normalizeSpaces label') src + +autoURI :: GenParser Char ParserState Inline +autoURI = do + src <- uri + return $ Link [Str src] (src, "") + +autoEmail :: GenParser Char ParserState Inline +autoEmail = do + src <- emailAddress + return $ Link [Str src] ("mailto:" ++ src, "") + +autoLink :: GenParser Char ParserState Inline +autoLink = autoURI <|> autoEmail + +-- For now, we assume that all substitution references are for images. +image :: GenParser Char ParserState Inline +image = try $ do + char '|' + ref <- manyTill inline (char '|') + state <- getState + let keyTable = stateKeys state + src <- case lookupKeySrc keyTable ref of + Nothing -> fail "no corresponding key" + Just target -> return target + return $ Image (normalizeSpaces ref) src + diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs new file mode 100644 index 000000000..04b0f3b8f --- /dev/null +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -0,0 +1,233 @@ +{- +Copyright (C) 2007 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.TeXMath + Copyright : Copyright (C) 2007 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of TeX math to a list of 'Pandoc' inline elements. +-} +module Text.Pandoc.Readers.TeXMath ( + readTeXMath + ) where + +import Text.ParserCombinators.Parsec +import Text.Pandoc.Definition + +-- | Converts a string of raw TeX math to a list of 'Pandoc' inlines. +readTeXMath :: String -> [Inline] +readTeXMath inp = case parse teXMath ("formula: " ++ inp) inp of + Left _ -> [Str inp] -- if unparseable, just include original + Right res -> res + +teXMath :: GenParser Char st [Inline] +teXMath = manyTill mathPart eof >>= return . concat + +mathPart :: GenParser Char st [Inline] +mathPart = whitespace <|> superscript <|> subscript <|> symbol <|> + argument <|> digits <|> letters <|> misc + +whitespace :: GenParser Char st [Inline] +whitespace = many1 space >> return [] + +symbol :: GenParser Char st [Inline] +symbol = try $ do + char '\\' + res <- many1 letter + case lookup res teXsymbols of + Just m -> return [Str m] + Nothing -> return [Str $ "\\" ++ res] + +argument :: GenParser Char st [Inline] +argument = try $ do + char '{' + res <- many mathPart + char '}' + return $ if null res + then [Str " "] + else [Str "{"] ++ concat res ++ [Str "}"] + +digits :: GenParser Char st [Inline] +digits = do + res <- many1 digit + return [Str res] + +letters :: GenParser Char st [Inline] +letters = do + res <- many1 letter + return [Emph [Str res]] + +misc :: GenParser Char st [Inline] +misc = do + res <- noneOf "}" + return [Str [res]] + +scriptArg :: GenParser Char st [Inline] +scriptArg = try $ do + (try (do{char '{'; r <- many mathPart; char '}'; return $ concat r})) + <|> symbol + <|> (do{c <- (letter <|> digit); return [Str [c]]}) + +superscript :: GenParser Char st [Inline] +superscript = try $ do + char '^' + arg <- scriptArg + return [Superscript arg] + +subscript :: GenParser Char st [Inline] +subscript = try $ do + char '_' + arg <- scriptArg + return [Subscript arg] + +withThinSpace :: String -> String +withThinSpace str = "\x2009" ++ str ++ "\x2009" + +teXsymbols :: [(String, String)] +teXsymbols = + [("alpha","\x3B1") + ,("beta", "\x3B2") + ,("chi", "\x3C7") + ,("delta", "\x3B4") + ,("Delta", "\x394") + ,("epsilon", "\x3B5") + ,("varepsilon", "\x25B") + ,("eta", "\x3B7") + ,("gamma", "\x3B3") + ,("Gamma", "\x393") + ,("iota", "\x3B9") + ,("kappa", "\x3BA") + ,("lambda", "\x3BB") + ,("Lambda", "\x39B") + ,("mu", "\x3BC") + ,("nu", "\x3BD") + ,("omega", "\x3C9") + ,("Omega", "\x3A9") + ,("phi", "\x3C6") + ,("varphi", "\x3D5") + ,("Phi", "\x3A6") + ,("pi", "\x3C0") + ,("Pi", "\x3A0") + ,("psi", "\x3C8") + ,("Psi", "\x3A8") + ,("rho", "\x3C1") + ,("sigma", "\x3C3") + ,("Sigma", "\x3A3") + ,("tau", "\x3C4") + ,("theta", "\x3B8") + ,("vartheta", "\x3D1") + ,("Theta", "\x398") + ,("upsilon", "\x3C5") + ,("xi", "\x3BE") + ,("Xi", "\x39E") + ,("zeta", "\x3B6") + ,("ne", "\x2260") + ,("lt", withThinSpace "<") + ,("le", withThinSpace "\x2264") + ,("leq", withThinSpace "\x2264") + ,("ge", withThinSpace "\x2265") + ,("geq", withThinSpace "\x2265") + ,("prec", withThinSpace "\x227A") + ,("succ", withThinSpace "\x227B") + ,("preceq", withThinSpace "\x2AAF") + ,("succeq", withThinSpace "\x2AB0") + ,("in", withThinSpace "\x2208") + ,("notin", withThinSpace "\x2209") + ,("subset", withThinSpace "\x2282") + ,("supset", withThinSpace "\x2283") + ,("subseteq", withThinSpace "\x2286") + ,("supseteq", withThinSpace "\x2287") + ,("equiv", withThinSpace "\x2261") + ,("cong", withThinSpace "\x2245") + ,("approx", withThinSpace "\x2248") + ,("propto", withThinSpace "\x221D") + ,("cdot", withThinSpace "\x22C5") + ,("star", withThinSpace "\x22C6") + ,("backslash", "\\") + ,("times", withThinSpace "\x00D7") + ,("divide", withThinSpace "\x00F7") + ,("circ", withThinSpace "\x2218") + ,("oplus", withThinSpace "\x2295") + ,("otimes", withThinSpace "\x2297") + ,("odot", withThinSpace "\x2299") + ,("sum", "\x2211") + ,("prod", "\x220F") + ,("wedge", withThinSpace "\x2227") + ,("bigwedge", withThinSpace "\x22C0") + ,("vee", withThinSpace "\x2228") + ,("bigvee", withThinSpace "\x22C1") + ,("cap", withThinSpace "\x2229") + ,("bigcap", withThinSpace "\x22C2") + ,("cup", withThinSpace "\x222A") + ,("bigcup", withThinSpace "\x22C3") + ,("neg", "\x00AC") + ,("implies", withThinSpace "\x21D2") + ,("iff", withThinSpace "\x21D4") + ,("forall", "\x2200") + ,("exists", "\x2203") + ,("bot", "\x22A5") + ,("top", "\x22A4") + ,("vdash", "\x22A2") + ,("models", withThinSpace "\x22A8") + ,("uparrow", "\x2191") + ,("downarrow", "\x2193") + ,("rightarrow", withThinSpace "\x2192") + ,("to", withThinSpace "\x2192") + ,("rightarrowtail", "\x21A3") + ,("twoheadrightarrow", withThinSpace "\x21A0") + ,("twoheadrightarrowtail", withThinSpace "\x2916") + ,("mapsto", withThinSpace "\x21A6") + ,("leftarrow", withThinSpace "\x2190") + ,("leftrightarrow", withThinSpace "\x2194") + ,("Rightarrow", withThinSpace "\x21D2") + ,("Leftarrow", withThinSpace "\x21D0") + ,("Leftrightarrow", withThinSpace "\x21D4") + ,("partial", "\x2202") + ,("nabla", "\x2207") + ,("pm", "\x00B1") + ,("emptyset", "\x2205") + ,("infty", "\x221E") + ,("aleph", "\x2135") + ,("ldots", "...") + ,("therefore", "\x2234") + ,("angle", "\x2220") + ,("quad", "\x00A0\x00A0") + ,("cdots", "\x22EF") + ,("vdots", "\x22EE") + ,("ddots", "\x22F1") + ,("diamond", "\x22C4") + ,("Box", "\x25A1") + ,("lfloor", "\x230A") + ,("rfloor", "\x230B") + ,("lceiling", "\x2308") + ,("rceiling", "\x2309") + ,("langle", "\x2329") + ,("rangle", "\x232A") + ,("{", "{") + ,("}", "}") + ,("[", "[") + ,("]", "]") + ,("|", "|") + ,("||", "||") + ] + -- cgit v1.2.3