summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/LaTeX.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/LaTeX.hs')
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs585
1 files changed, 585 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
new file mode 100644
index 000000000..3bf3dfd23
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -0,0 +1,585 @@
+-- | Converts LaTeX to 'Pandoc' document.
+module Text.Pandoc.Readers.LaTeX (
+ readLaTeX,
+ rawLaTeXInline,
+ rawLaTeXEnvironment
+ ) where
+
+import Text.ParserCombinators.Parsec
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Maybe ( fromMaybe )
+import Char ( chr )
+
+-- | Parse LaTeX from string and return 'Pandoc' document.
+readLaTeX :: ParserState -- ^ Parser state, including options for parser
+ -> String -- ^ String to parse
+ -> Pandoc
+readLaTeX = readWith parseLaTeX
+
+-- for testing
+testString = testStringWith parseLaTeX
+
+-- characters with special meaning
+specialChars = "\\$%&^&_~#{}\n \t|<>"
+
+--
+-- utility functions
+--
+
+-- | Change quotation marks in a string back to "basic" quotes.
+normalizeQuotes :: String -> String
+normalizeQuotes = gsub "''" "\"" . gsub "`" "'"
+
+-- | Change LaTeX En dashes between digits to hyphens.
+normalizeDashes :: String -> String
+normalizeDashes = gsub "([0-9])--([0-9])" "\\1-\\2"
+
+normalizePunctuation :: String -> String
+normalizePunctuation = normalizeDashes . normalizeQuotes
+
+-- | Returns command option (between []) if any, or empty string.
+commandOpt = option "" (between (char '[') (char ']') (many1 (noneOf "]")))
+
+-- | Returns text between brackets and its matching pair.
+bracketedText = try (do
+ char '{'
+ result <- many (choice [ try (do{ char '\\';
+ b <- oneOf "{}";
+ return (['\\', b])}), -- escaped bracket
+ count 1 (noneOf "{}"),
+ do {text <- bracketedText; return ("{" ++ text ++ "}")} ])
+ char '}'
+ return (concat result))
+
+-- | Parses list of arguments of LaTeX command.
+commandArgs = many bracketedText
+
+-- | Parses LaTeX command, returns (name, star, option, list of arguments).
+command = try (do
+ char '\\'
+ name <- many1 alphaNum
+ star <- option "" (string "*") -- some commands have starred versions
+ opt <- commandOpt
+ args <- commandArgs
+ return (name, star, opt, args))
+
+begin name = try (do
+ string "\\begin{"
+ string name
+ char '}'
+ option "" commandOpt
+ option [] commandArgs
+ spaces
+ return name)
+
+end name = try (do
+ string "\\end{"
+ string name
+ char '}'
+ spaces
+ return name)
+
+-- | Returns a list of block elements containing the contents of an environment.
+environment name = try (do
+ begin name
+ spaces
+ contents <- manyTill block (end name)
+ return contents)
+
+anyEnvironment = try (do
+ string "\\begin{"
+ name <- many alphaNum
+ star <- option "" (string "*") -- some environments have starred variants
+ char '}'
+ option "" commandOpt
+ option [] commandArgs
+ spaces
+ contents <- manyTill block (end (name ++ star))
+ return (BlockQuote contents))
+
+--
+-- parsing documents
+--
+
+-- | Skip everything up through \begin{document}
+skipLaTeXHeader = try (do
+ manyTill anyChar (begin "document")
+ spaces
+ return "")
+
+-- | Parse LaTeX and return 'Pandoc'.
+parseLaTeX = do
+ option "" skipLaTeXHeader -- if parsing a fragment, this might not be present
+ blocks <- parseBlocks
+ spaces
+ option "" (string "\\end{document}") -- if parsing a fragment, this might not be present
+ spaces
+ eof
+ state <- getState
+ let keyBlocks = stateKeyBlocks state
+ let noteBlocks = stateNoteBlocks state
+ let blocks' = filter (/= Null) blocks
+ return (Pandoc (Meta [] [] "") (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks)))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+ spaces
+ result <- many block
+ return result
+
+block = choice [ hrule, codeBlock, header, list, blockQuote, mathBlock, comment,
+ bibliographic, para, specialEnvironment, itemBlock, unknownEnvironment,
+ unknownCommand ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel n = try (do
+ let subs = concat $ replicate (n - 1) "sub"
+ string ("\\" ++ subs ++ "section")
+ option ' ' (char '*')
+ char '{'
+ title <- manyTill inline (char '}')
+ spaces
+ return (Header n (normalizeSpaces title)))
+
+--
+-- hrule block
+--
+
+hrule = try (do
+ oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", "\\newpage" ]
+ spaces
+ return HorizontalRule)
+
+--
+-- code blocks
+--
+
+codeBlock = try (do
+ string "\\begin{verbatim}" -- don't use begin function because it gobbles whitespace
+ option "" blanklines -- we want to gobble blank lines, but not leading space
+ contents <- manyTill anyChar (try (string "\\end{verbatim}"))
+ spaces
+ return (CodeBlock (stripTrailingNewlines contents)))
+
+--
+-- block quotes
+--
+
+blockQuote = choice [ blockQuote1, blockQuote2 ] <?> "blockquote"
+
+blockQuote1 = try (do
+ blocks <- environment "quote"
+ spaces
+ return (BlockQuote blocks))
+
+blockQuote2 = try (do
+ blocks <- environment "quotation"
+ spaces
+ return (BlockQuote blocks))
+
+--
+-- math block
+--
+
+mathBlock = mathBlockWith (begin "equation") (end "equation") <|>
+ mathBlockWith (begin "displaymath") (end "displaymath") <|>
+ mathBlockWith (string "\\[") (string "\\]") <?> "math block"
+
+mathBlockWith start end = try (do
+ start
+ spaces
+ result <- manyTill anyChar end
+ spaces
+ return (BlockQuote [Para [TeX ("$" ++ result ++ "$")]]))
+
+--
+-- list blocks
+--
+
+list = bulletList <|> orderedList <?> "list"
+
+listItem = try (do
+ ("item", _, _, _) <- command
+ spaces
+ state <- getState
+ let oldParserContext = stateParserContext state
+ updateState (\state -> state {stateParserContext = ListItemState})
+ blocks <- many block
+ updateState (\state -> state {stateParserContext = oldParserContext})
+ return blocks)
+
+orderedList = try (do
+ begin "enumerate"
+ spaces
+ items <- many listItem
+ end "enumerate"
+ spaces
+ return (OrderedList items))
+
+bulletList = try (do
+ begin "itemize"
+ spaces
+ items <- many listItem
+ end "itemize"
+ spaces
+ return (BulletList items))
+
+--
+-- paragraph block
+--
+
+para = try (do
+ result <- many1 inline
+ spaces
+ return (Para (normalizeSpaces result)))
+
+--
+-- title authors date
+--
+
+bibliographic = choice [ maketitle, title, authors, date ]
+
+maketitle = try (do
+ string "\\maketitle"
+ spaces
+ return Null)
+
+title = try (do
+ string "\\title{"
+ tit <- manyTill inline (char '}')
+ spaces
+ updateState (\state -> state { stateTitle = tit })
+ return Null)
+
+authors = try (do
+ string "\\author{"
+ authors <- manyTill anyChar (char '}')
+ spaces
+ let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\" "\n" authors
+ updateState (\state -> state { stateAuthors = authors' })
+ return Null)
+
+date = try (do
+ string "\\date{"
+ date' <- manyTill anyChar (char '}')
+ spaces
+ updateState (\state -> state { stateDate = date' })
+ return Null)
+
+--
+-- item block
+-- for use in unknown environments that aren't being parsed as raw latex
+--
+
+-- this forces items to be parsed in different blocks
+itemBlock = try (do
+ ("item", _, opt, _) <- command
+ state <- getState
+ if (stateParserContext state == ListItemState) then
+ fail "item should be handled by list block"
+ else
+ if null opt then
+ return Null
+ else
+ return (Plain [Str opt]))
+
+--
+-- raw LaTeX
+--
+
+specialEnvironment = do -- these are always parsed as raw
+ followedBy' (choice (map (\name -> begin name) ["tabular", "figure", "tabbing", "eqnarry",
+ "picture", "table", "verse", "theorem"]))
+ rawLaTeXEnvironment
+
+-- | Parse any LaTeX environment and return a Para block containing
+-- the whole literal environment as raw TeX.
+rawLaTeXEnvironment :: GenParser Char st Block
+rawLaTeXEnvironment = try (do
+ string "\\begin"
+ char '{'
+ name <- many1 alphaNum
+ star <- option "" (string "*") -- for starred variants
+ let name' = name ++ star
+ char '}'
+ opt <- option "" commandOpt
+ args <- option [] commandArgs
+ let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
+ let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+ contents <- manyTill (choice [(many1 (noneOf "\\")),
+ (do{ (Para [TeX str]) <- rawLaTeXEnvironment; return str }),
+ string "\\"]) (end name')
+ spaces
+ return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ optStr ++ argStr ++
+ (concat contents) ++ "\\end{" ++ name' ++ "}")]))
+
+unknownEnvironment = try (do
+ state <- getState
+ result <- if stateParseRaw state then -- check to see whether we should include raw TeX
+ rawLaTeXEnvironment -- if so, get the whole raw environment
+ else
+ anyEnvironment -- otherwise just the contents
+ return result)
+
+unknownCommand = try (do
+ notFollowedBy' (string "\\end{itemize}")
+ notFollowedBy' (string "\\end{enumerate}")
+ notFollowedBy' (string "\\end{document}")
+ (name, star, opt, args) <- command
+ spaces
+ let optStr = if null opt then "" else "[" ++ opt ++ "]"
+ let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+ state <- getState
+ if (name == "item") && ((stateParserContext state) == ListItemState) then
+ fail "should not be parsed as raw"
+ else
+ string ""
+ if stateParseRaw state then
+ return (Plain [TeX ("\\" ++ name ++ star ++ optStr ++ argStr)])
+ else
+ return (Plain [Str (joinWithSep " " args)]))
+
+-- latex comment
+comment = try (do
+ char '%'
+ result <- manyTill anyChar newline
+ spaces
+ return Null)
+
+--
+-- inline
+--
+
+inline = choice [ strong, emph, ref, lab, code, linebreak, math, ldots, accentedChar,
+ specialChar, specialInline, escapedChar, unescapedChar, str,
+ endline, whitespace ] <?> "inline"
+
+specialInline = choice [ link, image, footnote, rawLaTeXInline ] <?>
+ "link, raw TeX, note, or image"
+
+ldots = try (do
+ string "\\ldots"
+ return (Str "..."))
+
+accentedChar = normalAccentedChar <|> specialAccentedChar
+
+normalAccentedChar = try (do
+ char '\\'
+ accent <- oneOf "'`^\"~"
+ character <- choice [ between (char '{') (char '}') anyChar, anyChar ]
+ let table = fromMaybe [] $ lookup character accentTable
+ let result = case lookup accent table of
+ Just num -> chr num
+ Nothing -> '?'
+ return (Str [result]))
+
+-- an association list of letters and association list of accents
+-- and decimal character numbers.
+accentTable =
+ [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
+ ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
+ ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
+ ('N', [('~', 209)]),
+ ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
+ ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
+ ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
+ ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
+ ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
+ ('n', [('~', 241)]),
+ ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
+ ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]
+
+specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, oslash, pound,
+ euro, copyright, sect ]
+
+ccedil = try (do
+ char '\\'
+ letter <- choice [try (string "cc"), try (string "cC")]
+ let num = if letter == "cc" then 231 else 199
+ return (Str [chr num]))
+
+aring = try (do
+ char '\\'
+ letter <- choice [try (string "aa"), try (string "AA")]
+ let num = if letter == "aa" then 229 else 197
+ return (Str [chr num]))
+
+iuml = try (do
+ string "\\\""
+ choice [try (string "\\i"), try (string "{\\i}")]
+ return (Str [chr 239]))
+
+icirc = try (do
+ string "\\^"
+ choice [try (string "\\i"), try (string "{\\i}")]
+ return (Str [chr 238]))
+
+szlig = try (do
+ string "\\ss"
+ return (Str [chr 223]))
+
+oslash = try (do
+ char '\\'
+ letter <- choice [char 'o', char 'O']
+ let num = if letter == 'o' then 248 else 216
+ return (Str [chr num]))
+
+aelig = try (do
+ char '\\'
+ letter <- choice [try (string "ae"), try (string "AE")]
+ let num = if letter == "ae" then 230 else 198
+ return (Str [chr num]))
+
+pound = try (do
+ string "\\pounds"
+ return (Str [chr 163]))
+
+euro = try (do
+ string "\\euro"
+ return (Str [chr 8364]))
+
+copyright = try (do
+ string "\\copyright"
+ return (Str [chr 169]))
+
+sect = try (do
+ string "\\S"
+ return (Str [chr 167]))
+
+escapedChar = escaped (oneOf " $%^&_#{}")
+
+unescapedChar = do -- ignore standalone, nonescaped special characters
+ oneOf "$^&_#{}|<>"
+ return (Str "")
+
+specialChar = choice [ backslash, bar, lt, gt ]
+
+backslash = try (do
+ string "\\textbackslash"
+ return (Str "\\"))
+
+bar = try (do
+ string "\\textbar"
+ return (Str "\\"))
+
+lt = try (do
+ string "\\textless"
+ return (Str "<"))
+
+gt = try (do
+ string "\\textgreater"
+ return (Str ">"))
+
+code = try (do
+ string "\\verb"
+ marker <- anyChar
+ result <- manyTill anyChar (char marker)
+ let result' = removeLeadingTrailingSpace result
+ return (Code result'))
+
+emph = try (do
+ oneOfStrings [ "\\emph{", "\\textit{" ]
+ result <- manyTill inline (char '}')
+ return (Emph result))
+
+lab = try (do
+ string "\\label{"
+ result <- manyTill anyChar (char '}')
+ return (Str ("(" ++ result ++ ")")))
+
+ref = try (do
+ string "\\ref{"
+ result <- manyTill anyChar (char '}')
+ return (Str (result)))
+
+strong = try (do
+ string "\\textbf{"
+ result <- manyTill inline (char '}')
+ return (Strong result))
+
+whitespace = do
+ many1 (oneOf "~ \t")
+ return Space
+
+-- hard line break
+linebreak = try (do
+ string "\\\\"
+ return LineBreak)
+
+str = do
+ result <- many1 (noneOf specialChars)
+ return (Str (normalizePunctuation result))
+
+-- endline internal to paragraph
+endline = try (do
+ newline
+ notFollowedBy blankline
+ return Space)
+
+-- math
+math = math1 <|> math2 <?> "math"
+
+math1 = try (do
+ char '$'
+ result <- many (noneOf "$")
+ char '$'
+ return (TeX ("$" ++ result ++ "$")))
+
+math2 = try (do
+ string "\\("
+ result <- many (noneOf "$")
+ string "\\)"
+ return (TeX ("$" ++ result ++ "$")))
+
+--
+-- links and images
+--
+
+link = try (do
+ string "\\href{"
+ url <- manyTill anyChar (char '}')
+ char '{'
+ label <- manyTill inline (char '}')
+ ref <- generateReference url ""
+ return (Link (normalizeSpaces label) ref))
+
+image = try (do
+ ("includegraphics", _, _, (src:lst)) <- command
+ return (Image [Str "image"] (Src src "")))
+
+footnote = try (do
+ ("footnote", _, _, (contents:[])) <- command
+ let blocks = case runParser parseBlocks defaultParserState "footnote" contents of
+ Left err -> error $ "Input:\n" ++ show contents ++
+ "\nError:\n" ++ show err
+ Right result -> result
+ state <- getState
+ let notes = stateNoteBlocks state
+ let nextRef = case notes of
+ [] -> "1"
+ (Note ref body):rest -> (show ((read ref) + 1))
+ setState (state { stateNoteBlocks = (Note nextRef blocks):notes })
+ return (NoteRef nextRef))
+
+-- | Parse any LaTeX command and return it in a raw TeX inline element.
+rawLaTeXInline :: GenParser Char ParserState Inline
+rawLaTeXInline = try (do
+ (name, star, opt, args) <- command
+ let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
+ let argStr = concatMap (\arg -> "{" ++ arg ++ "}") args
+ state <- getState
+ if ((name == "begin") || (name == "end") || (name == "item")) then
+ fail "not an inline command"
+ else
+ string ""
+ return (TeX ("\\" ++ name ++ star ++ optStr ++ argStr)))