diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/LaTeX.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 585 |
1 files changed, 585 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs new file mode 100644 index 000000000..3bf3dfd23 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -0,0 +1,585 @@ +-- | Converts LaTeX to 'Pandoc' document. +module Text.Pandoc.Readers.LaTeX ( + readLaTeX, + rawLaTeXInline, + rawLaTeXEnvironment + ) where + +import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Pandoc +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Maybe ( fromMaybe ) +import Char ( chr ) + +-- | Parse LaTeX from string and return 'Pandoc' document. +readLaTeX :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse + -> Pandoc +readLaTeX = readWith parseLaTeX + +-- for testing +testString = testStringWith parseLaTeX + +-- characters with special meaning +specialChars = "\\$%&^&_~#{}\n \t|<>" + +-- +-- utility functions +-- + +-- | Change quotation marks in a string back to "basic" quotes. +normalizeQuotes :: String -> String +normalizeQuotes = gsub "''" "\"" . gsub "`" "'" + +-- | Change LaTeX En dashes between digits to hyphens. +normalizeDashes :: String -> String +normalizeDashes = gsub "([0-9])--([0-9])" "\\1-\\2" + +normalizePunctuation :: String -> String +normalizePunctuation = normalizeDashes . normalizeQuotes + +-- | Returns command option (between []) if any, or empty string. +commandOpt = option "" (between (char '[') (char ']') (many1 (noneOf "]"))) + +-- | Returns text between brackets and its matching pair. +bracketedText = try (do + char '{' + result <- many (choice [ try (do{ char '\\'; + b <- oneOf "{}"; + return (['\\', b])}), -- escaped bracket + count 1 (noneOf "{}"), + do {text <- bracketedText; return ("{" ++ text ++ "}")} ]) + char '}' + return (concat result)) + +-- | Parses list of arguments of LaTeX command. +commandArgs = many bracketedText + +-- | Parses LaTeX command, returns (name, star, option, list of arguments). +command = try (do + char '\\' + name <- many1 alphaNum + star <- option "" (string "*") -- some commands have starred versions + opt <- commandOpt + args <- commandArgs + return (name, star, opt, args)) + +begin name = try (do + string "\\begin{" + string name + char '}' + option "" commandOpt + option [] commandArgs + spaces + return name) + +end name = try (do + string "\\end{" + string name + char '}' + spaces + return name) + +-- | Returns a list of block elements containing the contents of an environment. +environment name = try (do + begin name + spaces + contents <- manyTill block (end name) + return contents) + +anyEnvironment = try (do + string "\\begin{" + name <- many alphaNum + star <- option "" (string "*") -- some environments have starred variants + char '}' + option "" commandOpt + option [] commandArgs + spaces + contents <- manyTill block (end (name ++ star)) + return (BlockQuote contents)) + +-- +-- parsing documents +-- + +-- | Skip everything up through \begin{document} +skipLaTeXHeader = try (do + manyTill anyChar (begin "document") + spaces + return "") + +-- | Parse LaTeX and return 'Pandoc'. +parseLaTeX = do + option "" skipLaTeXHeader -- if parsing a fragment, this might not be present + blocks <- parseBlocks + spaces + option "" (string "\\end{document}") -- if parsing a fragment, this might not be present + spaces + eof + state <- getState + let keyBlocks = stateKeyBlocks state + let noteBlocks = stateNoteBlocks state + let blocks' = filter (/= Null) blocks + return (Pandoc (Meta [] [] "") (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks))) + +-- +-- parsing blocks +-- + +parseBlocks = do + spaces + result <- many block + return result + +block = choice [ hrule, codeBlock, header, list, blockQuote, mathBlock, comment, + bibliographic, para, specialEnvironment, itemBlock, unknownEnvironment, + unknownCommand ] <?> "block" + +-- +-- header blocks +-- + +header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" + +headerLevel n = try (do + let subs = concat $ replicate (n - 1) "sub" + string ("\\" ++ subs ++ "section") + option ' ' (char '*') + char '{' + title <- manyTill inline (char '}') + spaces + return (Header n (normalizeSpaces title))) + +-- +-- hrule block +-- + +hrule = try (do + oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", "\\newpage" ] + spaces + return HorizontalRule) + +-- +-- code blocks +-- + +codeBlock = try (do + string "\\begin{verbatim}" -- don't use begin function because it gobbles whitespace + option "" blanklines -- we want to gobble blank lines, but not leading space + contents <- manyTill anyChar (try (string "\\end{verbatim}")) + spaces + return (CodeBlock (stripTrailingNewlines contents))) + +-- +-- block quotes +-- + +blockQuote = choice [ blockQuote1, blockQuote2 ] <?> "blockquote" + +blockQuote1 = try (do + blocks <- environment "quote" + spaces + return (BlockQuote blocks)) + +blockQuote2 = try (do + blocks <- environment "quotation" + spaces + return (BlockQuote blocks)) + +-- +-- math block +-- + +mathBlock = mathBlockWith (begin "equation") (end "equation") <|> + mathBlockWith (begin "displaymath") (end "displaymath") <|> + mathBlockWith (string "\\[") (string "\\]") <?> "math block" + +mathBlockWith start end = try (do + start + spaces + result <- manyTill anyChar end + spaces + return (BlockQuote [Para [TeX ("$" ++ result ++ "$")]])) + +-- +-- list blocks +-- + +list = bulletList <|> orderedList <?> "list" + +listItem = try (do + ("item", _, _, _) <- command + spaces + state <- getState + let oldParserContext = stateParserContext state + updateState (\state -> state {stateParserContext = ListItemState}) + blocks <- many block + updateState (\state -> state {stateParserContext = oldParserContext}) + return blocks) + +orderedList = try (do + begin "enumerate" + spaces + items <- many listItem + end "enumerate" + spaces + return (OrderedList items)) + +bulletList = try (do + begin "itemize" + spaces + items <- many listItem + end "itemize" + spaces + return (BulletList items)) + +-- +-- paragraph block +-- + +para = try (do + result <- many1 inline + spaces + return (Para (normalizeSpaces result))) + +-- +-- title authors date +-- + +bibliographic = choice [ maketitle, title, authors, date ] + +maketitle = try (do + string "\\maketitle" + spaces + return Null) + +title = try (do + string "\\title{" + tit <- manyTill inline (char '}') + spaces + updateState (\state -> state { stateTitle = tit }) + return Null) + +authors = try (do + string "\\author{" + authors <- manyTill anyChar (char '}') + spaces + let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\" "\n" authors + updateState (\state -> state { stateAuthors = authors' }) + return Null) + +date = try (do + string "\\date{" + date' <- manyTill anyChar (char '}') + spaces + updateState (\state -> state { stateDate = date' }) + return Null) + +-- +-- item block +-- for use in unknown environments that aren't being parsed as raw latex +-- + +-- this forces items to be parsed in different blocks +itemBlock = try (do + ("item", _, opt, _) <- command + state <- getState + if (stateParserContext state == ListItemState) then + fail "item should be handled by list block" + else + if null opt then + return Null + else + return (Plain [Str opt])) + +-- +-- raw LaTeX +-- + +specialEnvironment = do -- these are always parsed as raw + followedBy' (choice (map (\name -> begin name) ["tabular", "figure", "tabbing", "eqnarry", + "picture", "table", "verse", "theorem"])) + rawLaTeXEnvironment + +-- | Parse any LaTeX environment and return a Para block containing +-- the whole literal environment as raw TeX. +rawLaTeXEnvironment :: GenParser Char st Block +rawLaTeXEnvironment = try (do + string "\\begin" + char '{' + name <- many1 alphaNum + star <- option "" (string "*") -- for starred variants + let name' = name ++ star + char '}' + opt <- option "" commandOpt + args <- option [] commandArgs + let optStr = if (null opt) then "" else "[" ++ opt ++ "]" + let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args + contents <- manyTill (choice [(many1 (noneOf "\\")), + (do{ (Para [TeX str]) <- rawLaTeXEnvironment; return str }), + string "\\"]) (end name') + spaces + return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ optStr ++ argStr ++ + (concat contents) ++ "\\end{" ++ name' ++ "}")])) + +unknownEnvironment = try (do + state <- getState + result <- if stateParseRaw state then -- check to see whether we should include raw TeX + rawLaTeXEnvironment -- if so, get the whole raw environment + else + anyEnvironment -- otherwise just the contents + return result) + +unknownCommand = try (do + notFollowedBy' (string "\\end{itemize}") + notFollowedBy' (string "\\end{enumerate}") + notFollowedBy' (string "\\end{document}") + (name, star, opt, args) <- command + spaces + let optStr = if null opt then "" else "[" ++ opt ++ "]" + let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args + state <- getState + if (name == "item") && ((stateParserContext state) == ListItemState) then + fail "should not be parsed as raw" + else + string "" + if stateParseRaw state then + return (Plain [TeX ("\\" ++ name ++ star ++ optStr ++ argStr)]) + else + return (Plain [Str (joinWithSep " " args)])) + +-- latex comment +comment = try (do + char '%' + result <- manyTill anyChar newline + spaces + return Null) + +-- +-- inline +-- + +inline = choice [ strong, emph, ref, lab, code, linebreak, math, ldots, accentedChar, + specialChar, specialInline, escapedChar, unescapedChar, str, + endline, whitespace ] <?> "inline" + +specialInline = choice [ link, image, footnote, rawLaTeXInline ] <?> + "link, raw TeX, note, or image" + +ldots = try (do + string "\\ldots" + return (Str "...")) + +accentedChar = normalAccentedChar <|> specialAccentedChar + +normalAccentedChar = try (do + char '\\' + accent <- oneOf "'`^\"~" + character <- choice [ between (char '{') (char '}') anyChar, anyChar ] + let table = fromMaybe [] $ lookup character accentTable + let result = case lookup accent table of + Just num -> chr num + Nothing -> '?' + return (Str [result])) + +-- an association list of letters and association list of accents +-- and decimal character numbers. +accentTable = + [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), + ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), + ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), + ('N', [('~', 209)]), + ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), + ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), + ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), + ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), + ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), + ('n', [('~', 241)]), + ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), + ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] + +specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, oslash, pound, + euro, copyright, sect ] + +ccedil = try (do + char '\\' + letter <- choice [try (string "cc"), try (string "cC")] + let num = if letter == "cc" then 231 else 199 + return (Str [chr num])) + +aring = try (do + char '\\' + letter <- choice [try (string "aa"), try (string "AA")] + let num = if letter == "aa" then 229 else 197 + return (Str [chr num])) + +iuml = try (do + string "\\\"" + choice [try (string "\\i"), try (string "{\\i}")] + return (Str [chr 239])) + +icirc = try (do + string "\\^" + choice [try (string "\\i"), try (string "{\\i}")] + return (Str [chr 238])) + +szlig = try (do + string "\\ss" + return (Str [chr 223])) + +oslash = try (do + char '\\' + letter <- choice [char 'o', char 'O'] + let num = if letter == 'o' then 248 else 216 + return (Str [chr num])) + +aelig = try (do + char '\\' + letter <- choice [try (string "ae"), try (string "AE")] + let num = if letter == "ae" then 230 else 198 + return (Str [chr num])) + +pound = try (do + string "\\pounds" + return (Str [chr 163])) + +euro = try (do + string "\\euro" + return (Str [chr 8364])) + +copyright = try (do + string "\\copyright" + return (Str [chr 169])) + +sect = try (do + string "\\S" + return (Str [chr 167])) + +escapedChar = escaped (oneOf " $%^&_#{}") + +unescapedChar = do -- ignore standalone, nonescaped special characters + oneOf "$^&_#{}|<>" + return (Str "") + +specialChar = choice [ backslash, bar, lt, gt ] + +backslash = try (do + string "\\textbackslash" + return (Str "\\")) + +bar = try (do + string "\\textbar" + return (Str "\\")) + +lt = try (do + string "\\textless" + return (Str "<")) + +gt = try (do + string "\\textgreater" + return (Str ">")) + +code = try (do + string "\\verb" + marker <- anyChar + result <- manyTill anyChar (char marker) + let result' = removeLeadingTrailingSpace result + return (Code result')) + +emph = try (do + oneOfStrings [ "\\emph{", "\\textit{" ] + result <- manyTill inline (char '}') + return (Emph result)) + +lab = try (do + string "\\label{" + result <- manyTill anyChar (char '}') + return (Str ("(" ++ result ++ ")"))) + +ref = try (do + string "\\ref{" + result <- manyTill anyChar (char '}') + return (Str (result))) + +strong = try (do + string "\\textbf{" + result <- manyTill inline (char '}') + return (Strong result)) + +whitespace = do + many1 (oneOf "~ \t") + return Space + +-- hard line break +linebreak = try (do + string "\\\\" + return LineBreak) + +str = do + result <- many1 (noneOf specialChars) + return (Str (normalizePunctuation result)) + +-- endline internal to paragraph +endline = try (do + newline + notFollowedBy blankline + return Space) + +-- math +math = math1 <|> math2 <?> "math" + +math1 = try (do + char '$' + result <- many (noneOf "$") + char '$' + return (TeX ("$" ++ result ++ "$"))) + +math2 = try (do + string "\\(" + result <- many (noneOf "$") + string "\\)" + return (TeX ("$" ++ result ++ "$"))) + +-- +-- links and images +-- + +link = try (do + string "\\href{" + url <- manyTill anyChar (char '}') + char '{' + label <- manyTill inline (char '}') + ref <- generateReference url "" + return (Link (normalizeSpaces label) ref)) + +image = try (do + ("includegraphics", _, _, (src:lst)) <- command + return (Image [Str "image"] (Src src ""))) + +footnote = try (do + ("footnote", _, _, (contents:[])) <- command + let blocks = case runParser parseBlocks defaultParserState "footnote" contents of + Left err -> error $ "Input:\n" ++ show contents ++ + "\nError:\n" ++ show err + Right result -> result + state <- getState + let notes = stateNoteBlocks state + let nextRef = case notes of + [] -> "1" + (Note ref body):rest -> (show ((read ref) + 1)) + setState (state { stateNoteBlocks = (Note nextRef blocks):notes }) + return (NoteRef nextRef)) + +-- | Parse any LaTeX command and return it in a raw TeX inline element. +rawLaTeXInline :: GenParser Char ParserState Inline +rawLaTeXInline = try (do + (name, star, opt, args) <- command + let optStr = if (null opt) then "" else "[" ++ opt ++ "]" + let argStr = concatMap (\arg -> "{" ++ arg ++ "}") args + state <- getState + if ((name == "begin") || (name == "end") || (name == "item")) then + fail "not an inline command" + else + string "" + return (TeX ("\\" ++ name ++ star ++ optStr ++ argStr))) |