diff options
Diffstat (limited to 'src/Text/Pandoc/Parsing.hs')
-rw-r--r-- | src/Text/Pandoc/Parsing.hs | 714 |
1 files changed, 457 insertions, 257 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 110e34c6a..82abcb440 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -1,11 +1,12 @@ -{-# LANGUAGE - FlexibleContexts -, GeneralizedNewtypeDeriving -, TypeSynonymInstances -, MultiParamTypeClasses -, FlexibleInstances #-} +{-# LANGUAGE ExplicitForAll #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE IncoherentInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE TypeSynonymInstances #-} {- -Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -24,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Parsing - Copyright : Copyright (C) 2006-2016 John MacFarlane + Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -33,8 +34,14 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA A utility library with parsers used in pandoc readers. -} -module Text.Pandoc.Parsing ( anyLine, +module Text.Pandoc.Parsing ( takeWhileP, + takeP, + anyLine, + anyLineNewline, + indentWith, many1Till, + manyUntil, + sepBy1', notFollowedBy', oneOfStrings, oneOfStringsCI, @@ -43,9 +50,12 @@ module Text.Pandoc.Parsing ( anyLine, skipSpaces, blankline, blanklines, + gobbleSpaces, + gobbleAtMostSpaces, enclosed, stringAnyCase, parseFromString, + parseFromString', lineClump, charsInBalanced, romanNumeral, @@ -57,6 +67,11 @@ module Text.Pandoc.Parsing ( anyLine, withRaw, escaped, characterReference, + upperRoman, + lowerRoman, + decimal, + lowerAlpha, + upperAlpha, anyOrderedListMarker, orderedListMarker, charRef, @@ -64,20 +79,24 @@ module Text.Pandoc.Parsing ( anyLine, tableWith, widthsFromIndices, gridTableWith, + gridTableWith', readWith, - readWithWarnings, readWithM, testStringWith, guardEnabled, guardDisabled, updateLastStrPos, notAfterString, + logMessage, + reportLogMessages, ParserState (..), HasReaderOptions (..), HasHeaderMap (..), HasIdentifierList (..), HasMacros (..), + HasLogMessages (..), HasLastStrPosition (..), + HasIncludeFiles (..), defaultParserState, HeaderType (..), ParserContext (..), @@ -100,20 +119,26 @@ module Text.Pandoc.Parsing ( anyLine, dash, nested, citeKey, - macro, - applyMacros', Parser, ParserT, - F(..), + F, + Future(..), runF, askF, asksF, + returnF, + trimInlinesF, token, + (<+?>), + extractIdClass, + insertIncludedFile, + insertIncludedFileF, -- * Re-exports from Text.Pandoc.Parsec Stream, runParser, runParserT, parse, + tokenPrim, anyToken, getInput, setInput, @@ -162,61 +187,106 @@ module Text.Pandoc.Parsing ( anyLine, sourceLine, setSourceColumn, setSourceLine, + incSourceColumn, newPos, - addWarning, - (<+?>), - extractIdClass + Line, + Column ) where +import Control.Monad.Identity +import Control.Monad.Reader +import Data.Char (chr, isAlphaNum, isAscii, isAsciiUpper, isHexDigit, + isPunctuation, isSpace, ord, toLower, toUpper) +import Data.Default +import Data.List (intercalate, isSuffixOf, transpose) +import qualified Data.Map as M +import Data.Maybe (mapMaybe, fromMaybe) +import Data.Monoid ((<>)) +import qualified Data.Set as Set +import Data.Text (Text) +import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Asciify (toAsciiChar) +import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad, readFileFromDirs, report) import Text.Pandoc.Definition +import Text.Pandoc.Logging import Text.Pandoc.Options -import Text.Pandoc.Builder (Blocks, Inlines, rawBlock, HasMeta(..)) -import qualified Text.Pandoc.Builder as B -import Text.Pandoc.XML (fromEntities) +import Text.Pandoc.Readers.LaTeX.Types (Macro) +import Text.Pandoc.Shared import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) +import Text.Pandoc.XML (fromEntities) import Text.Parsec hiding (token) -import Text.Parsec.Pos (newPos) -import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum, - isHexDigit, isSpace, isPunctuation ) -import Data.List ( intercalate, transpose, isSuffixOf ) -import Text.Pandoc.Shared -import qualified Data.Map as M -import Text.TeXMath.Readers.TeX.Macros (applyMacros, Macro, - parseMacroDefinitions) -import Text.HTML.TagSoup.Entity ( lookupEntity ) -import Text.Pandoc.Asciify (toAsciiChar) -import Data.Monoid ((<>)) -import Data.Default -import qualified Data.Set as Set -import Control.Monad.Reader -import Control.Monad.Identity -import Data.Maybe (catMaybes) +import Text.Parsec.Pos (initialPos, newPos, updatePosString) +import Control.Monad.Except import Text.Pandoc.Error type Parser t s = Parsec t s type ParserT = ParsecT -newtype F a = F { unF :: Reader ParserState a } deriving (Monad, Applicative, Functor) +-- | Reader monad wrapping the parser state. This is used to possibly delay +-- evaluation until all relevant information has been parsed and made available +-- in the parser state. +newtype Future s a = Future { runDelayed :: Reader s a } + deriving (Monad, Applicative, Functor) + +type F = Future ParserState -runF :: F a -> ParserState -> a -runF = runReader . unF +runF :: Future s a -> s -> a +runF = runReader . runDelayed -askF :: F ParserState -askF = F ask +askF :: Future s s +askF = Future ask -asksF :: (ParserState -> a) -> F a -asksF f = F $ asks f +asksF :: (s -> a) -> Future s a +asksF f = Future $ asks f -instance Monoid a => Monoid (F a) where +returnF :: Monad m => a -> m (Future s a) +returnF = return . return + +trimInlinesF :: Future s Inlines -> Future s Inlines +trimInlinesF = liftM trimInlines + +instance Monoid a => Monoid (Future s a) where mempty = return mempty mappend = liftM2 mappend mconcat = liftM mconcat . sequence +-- | Parse characters while a predicate is true. +takeWhileP :: Monad m + => (Char -> Bool) + -> ParserT [Char] st m [Char] +takeWhileP f = do + -- faster than 'many (satisfy f)' + inp <- getInput + pos <- getPosition + let (xs, rest) = span f inp + -- needed to persuade parsec that this won't match an empty string: + anyChar + setInput rest + setPosition $ updatePosString pos xs + return xs + +-- Parse n characters of input (or the rest of the input if +-- there aren't n characters). +takeP :: Monad m => Int -> ParserT [Char] st m [Char] +takeP n = do + guard (n > 0) + -- faster than 'count n anyChar' + inp <- getInput + pos <- getPosition + let (xs, rest) = splitAt n inp + -- needed to persuade parsec that this won't match an empty string: + anyChar + setInput rest + setPosition $ updatePosString pos xs + return xs + -- | Parse any line of text -anyLine :: Stream [Char] m Char => ParserT [Char] st m [Char] +anyLine :: Monad m => ParserT [Char] st m [Char] anyLine = do -- This is much faster than: -- manyTill anyChar newline @@ -231,16 +301,54 @@ anyLine = do return this _ -> mzero +-- | Parse any line, include the final newline in the output +anyLineNewline :: Monad m => ParserT [Char] st m [Char] +anyLineNewline = (++ "\n") <$> anyLine + +-- | Parse indent by specified number of spaces (or equiv. tabs) +indentWith :: Stream s m Char + => HasReaderOptions st + => Int -> ParserT s st m [Char] +indentWith num = do + tabStop <- getOption readerTabStop + if num < tabStop + then count num (char ' ') + else choice [ try (count num (char ' ')) + , try (char '\t' >> indentWith (num - tabStop)) ] + -- | Like @manyTill@, but reads at least one item. -many1Till :: Stream s m t +many1Till :: (Show end, Stream s m t) => ParserT s st m a -> ParserT s st m end -> ParserT s st m [a] many1Till p end = do + notFollowedBy' end first <- p rest <- manyTill p end return (first:rest) +-- | Like @manyTill@, but also returns the result of end parser. +manyUntil :: (Stream s m t) + => ParserT s u m a + -> ParserT s u m b + -> ParserT s u m ([a], b) +manyUntil p end = scan + where scan = + (do e <- end + return ([], e) + ) <|> + (do x <- p + (xs, e) <- scan + return (x:xs, e)) + +-- | Like @sepBy1@ from Parsec, +-- but does not fail if it @sep@ succeeds and @p@ fails. +sepBy1' :: (Stream s m t) + => ParsecT s u m a + -> ParsecT s u m sep + -> ParsecT s u m [a] +sepBy1' p sep = (:) <$> p <*> many (try $ sep >> p) + -- | A more general form of @notFollowedBy@. This one allows any -- type of parser to be specified, and succeeds only if that parser fails. -- It does not consume any input. @@ -276,7 +384,7 @@ oneOfStringsCI = oneOfStrings' ciMatch -- this optimizes toLower by checking common ASCII case -- first, before calling the expensive unicode-aware -- function: - toLower' c | c >= 'A' && c <= 'Z' = chr (ord c + 32) + toLower' c | isAsciiUpper c = chr (ord c + 32) | isAscii c = c | otherwise = toLower c @@ -300,8 +408,38 @@ blankline = try $ skipSpaces >> newline blanklines :: Stream s m Char => ParserT s st m [Char] blanklines = many1 blankline +-- | Gobble n spaces; if tabs are encountered, expand them +-- and gobble some or all of their spaces, leaving the rest. +gobbleSpaces :: (HasReaderOptions st, Monad m) + => Int -> ParserT [Char] st m () +gobbleSpaces 0 = return () +gobbleSpaces n + | n < 0 = error "gobbleSpaces called with negative number" + | otherwise = try $ do + char ' ' <|> eatOneSpaceOfTab + gobbleSpaces (n - 1) + +eatOneSpaceOfTab :: (HasReaderOptions st, Monad m) => ParserT [Char] st m Char +eatOneSpaceOfTab = do + char '\t' + tabstop <- getOption readerTabStop + inp <- getInput + setInput $ replicate (tabstop - 1) ' ' ++ inp + return ' ' + +-- | Gobble up to n spaces; if tabs are encountered, expand them +-- and gobble some or all of their spaces, leaving the rest. +gobbleAtMostSpaces :: (HasReaderOptions st, Monad m) + => Int -> ParserT [Char] st m Int +gobbleAtMostSpaces 0 = return 0 +gobbleAtMostSpaces n + | n < 0 = error "gobbleAtMostSpaces called with negative number" + | otherwise = option 0 $ do + char ' ' <|> eatOneSpaceOfTab + (+ 1) <$> gobbleAtMostSpaces (n - 1) + -- | Parses material enclosed between start and end parsers. -enclosed :: Stream s m Char => ParserT s st m t -- ^ start parser +enclosed :: (Show end, Stream s m Char) => ParserT s st m t -- ^ start parser -> ParserT s st m end -- ^ end parser -> ParserT s st m a -- ^ content parser (to be used repeatedly) -> ParserT s st m [a] @@ -317,9 +455,13 @@ stringAnyCase (x:xs) = do return (firstChar:rest) -- | Parse contents of 'str' using 'parser' and return result. -parseFromString :: Monad m => ParserT String st m a -> String -> ParserT String st m a +parseFromString :: Monad m + => ParserT [Char] st m a + -> String + -> ParserT [Char] st m a parseFromString parser str = do oldPos <- getPosition + setPosition $ initialPos "chunk" oldInput <- getInput setInput str result <- parser @@ -329,10 +471,22 @@ parseFromString parser str = do setPosition oldPos return result +-- | Like 'parseFromString' but specialized for 'ParserState'. +-- This resets 'stateLastStrPos', which is almost always what we want. +parseFromString' :: Monad m + => ParserT String ParserState m a + -> String + -> ParserT String ParserState m a +parseFromString' parser str = do + oldStrPos <- stateLastStrPos <$> getState + res <- parseFromString parser str + updateState $ \st -> st{ stateLastStrPos = oldStrPos } + return res + -- | Parse raw line block up to and including blank lines. -lineClump :: Stream [Char] m Char => ParserT [Char] st m String +lineClump :: Monad m => ParserT [Char] st m String lineClump = blanklines - <|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines) + <|> (unlines <$> many1 (notFollowedBy blankline >> anyLine)) -- | Parse a string of characters between an open character -- and a close character, including text between balanced @@ -373,19 +527,19 @@ romanNumeral upperCase = do lookAhead $ oneOf romanDigits let [one, five, ten, fifty, hundred, fivehundred, thousand] = map char romanDigits - thousands <- many thousand >>= (return . (1000 *) . length) + thousands <- ((1000 *) . length) <$> many thousand ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900 - fivehundreds <- many fivehundred >>= (return . (500 *) . length) + fivehundreds <- ((500 *) . length) <$> many fivehundred fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400 - hundreds <- many hundred >>= (return . (100 *) . length) + hundreds <- ((100 *) . length) <$> many hundred nineties <- option 0 $ try $ ten >> hundred >> return 90 - fifties <- many fifty >>= (return . (50 *) . length) + fifties <- ((50 *) . length) <$> many fifty forties <- option 0 $ try $ ten >> fifty >> return 40 - tens <- many ten >>= (return . (10 *) . length) + tens <- ((10 *) . length) <$> many ten nines <- option 0 $ try $ one >> ten >> return 9 - fives <- many five >>= (return . (5 *) . length) + fives <- ((5 *) . length) <$> many five fours <- option 0 $ try $ one >> five >> return 4 - ones <- many one >>= (return . length) + ones <- length <$> many one let total = thousands + ninehundreds + fivehundreds + fourhundreds + hundreds + nineties + fifties + forties + tens + nines + fives + fours + ones @@ -401,8 +555,8 @@ emailAddress :: Stream s m Char => ParserT s st m (String, String) emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) where toResult mbox dom = let full = fromEntities $ mbox ++ '@':dom in (full, escapeURI $ "mailto:" ++ full) - mailbox = intercalate "." <$> (emailWord `sepby1` dot) - domain = intercalate "." <$> (subdomain `sepby1` dot) + mailbox = intercalate "." <$> (emailWord `sepBy1'` dot) + domain = intercalate "." <$> (subdomain `sepBy1'` dot) dot = char '.' subdomain = many1 $ alphaNum <|> innerPunct -- this excludes some valid email addresses, since an @@ -419,59 +573,33 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) return (x:xs) isEmailChar c = isAlphaNum c || isEmailPunct c isEmailPunct c = c `elem` "!\"#$%&'*+-/=?^_{|}~;" - -- note: sepBy1 from parsec consumes input when sep - -- succeeds and p fails, so we use this variant here. - sepby1 p sep = (:) <$> p <*> (many (try $ sep >> p)) - - --- Schemes from http://www.iana.org/assignments/uri-schemes.html plus --- the unofficial schemes coap, doi, javascript, isbn, pmid -schemes :: [String] -schemes = ["coap","doi","javascript","aaa","aaas","about","acap","cap","cid", - "crid","data","dav","dict","dns","file","ftp","geo","go","gopher", - "h323","http","https","iax","icap","im","imap","info","ipp","iris", - "iris.beep","iris.xpc","iris.xpcs","iris.lwz","ldap","mailto","mid", - "msrp","msrps","mtqp","mupdate","news","nfs","ni","nih","nntp", - "opaquelocktoken","pop","pres","rtsp","service","session","shttp","sieve", - "sip","sips","sms","snmp","soap.beep","soap.beeps","tag","tel","telnet", - "tftp","thismessage","tn3270","tip","tv","urn","vemmi","ws","wss","xcon", - "xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r","z39.50s", - "adiumxtra","afp","afs","aim","apt","attachment","aw","beshare","bitcoin", - "bolo","callto","chrome","chrome-extension","com-eventbrite-attendee", - "content", "cvs","dlna-playsingle","dlna-playcontainer","dtn","dvb", - "ed2k","facetime","feed","finger","fish","gg","git","gizmoproject", - "gtalk","hcp","icon","ipn","irc","irc6","ircs","itms","jar","jms", - "keyparc","lastfm","ldaps","magnet","maps","market","message","mms", - "ms-help","msnim","mumble","mvn","notes","oid","palm","paparazzi", - "platform","proxy","psyc","query","res","resource","rmi","rsync", - "rtmp","secondlife","sftp","sgn","skype","smb","soldat","spotify", - "ssh","steam","svn","teamspeak","things","udp","unreal","ut2004", - "ventrilo","view-source","webcal","wtai","wyciwyg","xfire","xri", - "ymsgr", "isbn", "pmid"] + uriScheme :: Stream s m Char => ParserT s st m String -uriScheme = oneOfStringsCI schemes +uriScheme = oneOfStringsCI (Set.toList schemes) -- | Parses a URI. Returns pair of original and URI-escaped version. -uri :: Stream [Char] m Char => ParserT [Char] st m (String, String) +uri :: Monad m => ParserT [Char] st m (String, String) uri = try $ do scheme <- uriScheme char ':' + -- Avoid parsing e.g. "**Notes:**" as a raw URI: + notFollowedBy (oneOf "*_]") -- We allow sentence punctuation except at the end, since -- we don't want the trailing '.' in 'http://google.com.' We want to allow -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) -- as a URL, while NOT picking up the closing paren in -- (http://wikipedia.org). So we include balanced parens in the URL. - let isWordChar c = isAlphaNum c || c `elem` "#$%*+/@\\_-" + let isWordChar c = isAlphaNum c || c `elem` "#$%*+/@\\_-&=" let wordChar = satisfy isWordChar let percentEscaped = try $ char '%' >> skipMany1 (satisfy isHexDigit) let entity = () <$ characterReference let punct = skipMany1 (char ',') - <|> () <$ (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) + <|> () <$ satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>') let uriChunk = skipMany1 wordChar <|> percentEscaped <|> entity - <|> (try $ punct >> + <|> try (punct >> lookAhead (void (satisfy isWordChar) <|> percentEscaped)) str <- snd <$> withRaw (skipMany1 ( () <$ (enclosed (char '(') (char ')') uriChunk @@ -485,7 +613,7 @@ uri = try $ do mathInlineWith :: Stream s m Char => String -> String -> ParserT s st m String mathInlineWith op cl = try $ do string op - notFollowedBy space + when (op == "$") $ notFollowedBy space words' <- many1Till (count 1 (noneOf " \t\n\\") <|> (char '\\' >> -- This next clause is needed because \text{..} can @@ -499,7 +627,7 @@ mathInlineWith op cl = try $ do return " " ) (try $ string cl) notFollowedBy digit -- to prevent capture of $5 - return $ concat words' + return $ trim $ concat words' where inBalancedBraces :: Stream s m Char => Int -> String -> ParserT s st m String inBalancedBraces 0 "" = do @@ -556,7 +684,9 @@ withHorizDisplacement parser = do -- | Applies a parser and returns the raw string that was parsed, -- along with the value produced by the parser. -withRaw :: Stream [Char] m Char => ParsecT [Char] st m a -> ParsecT [Char] st m (a, [Char]) +withRaw :: Monad m + => ParsecT [Char] st m a + -> ParsecT [Char] st m (a, [Char]) withRaw parser = do pos1 <- getPosition inp <- getInput @@ -566,9 +696,9 @@ withRaw parser = do let (l2,c2) = (sourceLine pos2, sourceColumn pos2) let inplines = take ((l2 - l1) + 1) $ lines inp let raw = case inplines of - [] -> "" - [l] -> take (c2 - c1) l - ls -> unlines (init ls) ++ take (c2 - 1) (last ls) + [] -> "" + [l] -> take (c2 - c1) l + ls -> unlines (init ls) ++ take (c2 - 1) (last ls) return (result, raw) -- | Parses backslash, then applies character parser. @@ -584,11 +714,11 @@ characterReference = try $ do ent <- many1Till nonspaceChar (char ';') let ent' = case ent of '#':'X':xs -> '#':'x':xs -- workaround tagsoup bug - '#':_ -> ent - _ -> ent ++ ";" + '#':_ -> ent + _ -> ent ++ ";" case lookupEntity ent' of - Just (c : _) -> return c - _ -> fail "entity not found" + Just (c : _) -> return c + _ -> fail "entity not found" -- | Parses an uppercase roman numeral and returns (UpperRoman, number). upperRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) @@ -651,7 +781,7 @@ romanOne = (char 'i' >> return (LowerRoman, 1)) <|> -- | Parses an ordered list marker and returns list attributes. anyOrderedListMarker :: Stream s m Char => ParserT s ParserState m ListAttributes -anyOrderedListMarker = choice $ +anyOrderedListMarker = choice [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], numParser <- [decimal, exampleNum, defaultNum, romanOne, lowerAlpha, lowerRoman, upperAlpha, upperRoman]] @@ -717,7 +847,7 @@ charRef = do c <- characterReference return $ Str [c] -lineBlockLine :: Stream [Char] m Char => ParserT [Char] st m String +lineBlockLine :: Monad m => ParserT [Char] st m String lineBlockLine = try $ do char '|' char ' ' @@ -727,33 +857,48 @@ lineBlockLine = try $ do continuations <- many (try $ char ' ' >> anyLine) return $ white ++ unwords (line : continuations) -blankLineBlockLine :: Stream [Char] m Char => ParserT [Char] st m Char +blankLineBlockLine :: Stream s m Char => ParserT s st m Char blankLineBlockLine = try (char '|' >> blankline) -- | Parses an RST-style line block and returns a list of strings. -lineBlockLines :: Stream [Char] m Char => ParserT [Char] st m [String] +lineBlockLines :: Monad m => ParserT [Char] st m [String] lineBlockLines = try $ do lines' <- many1 (lineBlockLine <|> ((:[]) <$> blankLineBlockLine)) - skipMany1 $ blankline <|> blankLineBlockLine + skipMany blankline return lines' -- | Parse a table using 'headerParser', 'rowParser', -- 'lineParser', and 'footerParser'. -tableWith :: Stream s m Char - => ParserT s ParserState m ([[Block]], [Alignment], [Int]) - -> ([Int] -> ParserT s ParserState m [[Block]]) - -> ParserT s ParserState m sep - -> ParserT s ParserState m end - -> ParserT s ParserState m Block +tableWith :: (Stream s m Char, HasReaderOptions st, + Functor mf, Applicative mf, Monad mf) + => ParserT s st m (mf [Blocks], [Alignment], [Int]) + -> ([Int] -> ParserT s st m (mf [Blocks])) + -> ParserT s st m sep + -> ParserT s st m end + -> ParserT s st m (mf Blocks) tableWith headerParser rowParser lineParser footerParser = try $ do + (aligns, widths, heads, rows) <- tableWith' headerParser rowParser + lineParser footerParser + return $ B.table mempty (zip aligns widths) <$> heads <*> rows + +type TableComponents mf = ([Alignment], [Double], mf [Blocks], mf [[Blocks]]) + +tableWith' :: (Stream s m Char, HasReaderOptions st, + Functor mf, Applicative mf, Monad mf) + => ParserT s st m (mf [Blocks], [Alignment], [Int]) + -> ([Int] -> ParserT s st m (mf [Blocks])) + -> ParserT s st m sep + -> ParserT s st m end + -> ParserT s st m (TableComponents mf) +tableWith' headerParser rowParser lineParser footerParser = try $ do (heads, aligns, indices) <- headerParser - lines' <- rowParser indices `sepEndBy1` lineParser + lines' <- sequence <$> rowParser indices `sepEndBy1` lineParser footerParser numColumns <- getOption readerColumns - let widths = if (indices == []) + let widths = if null indices then replicate (length aligns) 0.0 else widthsFromIndices numColumns indices - return $ Table [] aligns widths heads lines' + return (aligns, widths, heads, lines') -- Calculate relative widths of table columns, based on indices widthsFromIndices :: Int -- Number of columns on terminal @@ -777,7 +922,7 @@ widthsFromIndices numColumns' indices = quotient = if totLength > numColumns then fromIntegral totLength else fromIntegral numColumns - fracs = map (\l -> (fromIntegral l) / quotient) lengths in + fracs = map (\l -> fromIntegral l / quotient) lengths in tail fracs --- @@ -786,25 +931,44 @@ widthsFromIndices numColumns' indices = -- (which may be grid), then the rows, -- which may be grid, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). -gridTableWith :: Stream [Char] m Char - => ParserT [Char] ParserState m [Block] -- ^ Block list parser - -> Bool -- ^ Headerless table - -> ParserT [Char] ParserState m Block +gridTableWith :: (Monad m, HasReaderOptions st, + Functor mf, Applicative mf, Monad mf) + => ParserT [Char] st m (mf Blocks) -- ^ Block list parser + -> Bool -- ^ Headerless table + -> ParserT [Char] st m (mf Blocks) gridTableWith blocks headless = tableWith (gridTableHeader headless blocks) (gridTableRow blocks) (gridTableSep '-') gridTableFooter +gridTableWith' :: (Monad m, HasReaderOptions st, + Functor mf, Applicative mf, Monad mf) + => ParserT [Char] st m (mf Blocks) -- ^ Block list parser + -> Bool -- ^ Headerless table + -> ParserT [Char] st m (TableComponents mf) +gridTableWith' blocks headless = + tableWith' (gridTableHeader headless blocks) (gridTableRow blocks) + (gridTableSep '-') gridTableFooter + gridTableSplitLine :: [Int] -> String -> [String] gridTableSplitLine indices line = map removeFinalBar $ tail $ splitStringByIndices (init indices) $ trimr line -gridPart :: Stream s m Char => Char -> ParserT s st m (Int, Int) +gridPart :: Stream s m Char => Char -> ParserT s st m ((Int, Int), Alignment) gridPart ch = do + leftColon <- option False (True <$ char ':') dashes <- many1 (char ch) + rightColon <- option False (True <$ char ':') char '+' - return (length dashes, length dashes + 1) - -gridDashedLines :: Stream s m Char => Char -> ParserT s st m [(Int,Int)] + let lengthDashes = length dashes + (if leftColon then 1 else 0) + + (if rightColon then 1 else 0) + let alignment = case (leftColon, rightColon) of + (True, True) -> AlignCenter + (True, False) -> AlignLeft + (False, True) -> AlignRight + (False, False) -> AlignDefault + return ((lengthDashes, lengthDashes + 1), alignment) + +gridDashedLines :: Stream s m Char => Char -> ParserT s st m [((Int, Int), Alignment)] gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline removeFinalBar :: String -> String @@ -812,14 +976,14 @@ removeFinalBar = reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse -- | Separator between rows of grid table. -gridTableSep :: Stream s m Char => Char -> ParserT s ParserState m Char +gridTableSep :: Stream s m Char => Char -> ParserT s st m Char gridTableSep ch = try $ gridDashedLines ch >> return '\n' -- | Parse header for a grid table. -gridTableHeader :: Stream [Char] m Char +gridTableHeader :: (Monad m, Functor mf, Applicative mf, Monad mf) => Bool -- ^ Headerless table - -> ParserT [Char] ParserState m [Block] - -> ParserT [Char] ParserState m ([[Block]], [Alignment], [Int]) + -> ParserT [Char] st m (mf Blocks) + -> ParserT [Char] st m (mf [Blocks], [Alignment], [Int]) gridTableHeader headless blocks = try $ do optional blanklines dashes <- gridDashedLines '-' @@ -828,62 +992,63 @@ gridTableHeader headless blocks = try $ do else many1 (notFollowedBy (gridTableSep '=') >> char '|' >> many1Till anyChar newline) - if headless - then return () - else gridTableSep '=' >> return () - let lines' = map snd dashes + underDashes <- if headless + then return dashes + else gridDashedLines '=' + guard $ length dashes == length underDashes + let lines' = map (snd . fst) underDashes let indices = scanl (+) 0 lines' - let aligns = replicate (length lines') AlignDefault - -- RST does not have a notion of alignments + let aligns = map snd underDashes let rawHeads = if headless - then replicate (length dashes) "" - else map (intercalate " ") $ transpose + then replicate (length underDashes) "" + else map (unlines . map trim) $ transpose $ map (gridTableSplitLine indices) rawContent - heads <- mapM (parseFromString blocks) $ map trim rawHeads + heads <- sequence <$> mapM (parseFromString blocks . trim) rawHeads return (heads, aligns, indices) -gridTableRawLine :: Stream s m Char => [Int] -> ParserT s ParserState m [String] +gridTableRawLine :: Stream s m Char => [Int] -> ParserT s st m [String] gridTableRawLine indices = do char '|' line <- many1Till anyChar newline return (gridTableSplitLine indices line) -- | Parse row of grid table. -gridTableRow :: Stream [Char] m Char - => ParserT [Char] ParserState m [Block] +gridTableRow :: (Monad m, Functor mf, Applicative mf, Monad mf) + => ParserT [Char] st m (mf Blocks) -> [Int] - -> ParserT [Char] ParserState m [[Block]] + -> ParserT [Char] st m (mf [Blocks]) gridTableRow blocks indices = do colLines <- many1 (gridTableRawLine indices) let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ transpose colLines - mapM (liftM compactifyCell . parseFromString blocks) cols + compactifyCell bs = case compactify [bs] of + [] -> mempty + x:_ -> x + cells <- sequence <$> mapM (parseFromString blocks) cols + return $ fmap (map compactifyCell) cells removeOneLeadingSpace :: [String] -> [String] removeOneLeadingSpace xs = if all startsWithSpace xs then map (drop 1) xs else xs - where startsWithSpace "" = True + where startsWithSpace "" = True startsWithSpace (y:_) = y == ' ' -compactifyCell :: [Block] -> [Block] -compactifyCell bs = head $ compactify [bs] - -- | Parse footer for a grid table. -gridTableFooter :: Stream s m Char => ParserT s ParserState m [Char] +gridTableFooter :: Stream s m Char => ParserT s st m [Char] gridTableFooter = blanklines --- -- | Removes the ParsecT layer from the monad transformer stack -readWithM :: (Monad m) - => ParserT [Char] st m a -- ^ parser +readWithM :: Monad m + => ParserT [Char] st m a -- ^ parser -> st -- ^ initial state -> String -- ^ input -> m (Either PandocError a) readWithM parser state input = - mapLeft (ParsecError input) `liftM` runParserT parser state "source" input + mapLeft (PandocParsecError input) `liftM` runParserT parser state "source" input -- | Parse a string with a given parser and state @@ -893,17 +1058,8 @@ readWith :: Parser [Char] st a -> Either PandocError a readWith p t inp = runIdentity $ readWithM p t inp -readWithWarnings :: Parser [Char] ParserState a - -> ParserState - -> String - -> Either PandocError (a, [String]) -readWithWarnings p = readWith $ do - doc <- p - warnings <- stateWarnings <$> getState - return (doc, warnings) - -- | Parse a string with @parser@ (for testing). -testStringWith :: (Show a) +testStringWith :: Show a => ParserT [Char] ParserState Identity a -> [Char] -> IO () @@ -912,34 +1068,37 @@ testStringWith parser str = UTF8.putStrLn $ show $ -- | Parsing options. data ParserState = ParserState - { stateOptions :: ReaderOptions, -- ^ User options - stateParserContext :: ParserContext, -- ^ Inside list? - stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? - stateAllowLinks :: Bool, -- ^ Allow parsing of links - stateMaxNestingLevel :: Int, -- ^ Max # of nested Strong/Emph - stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed - stateKeys :: KeyTable, -- ^ List of reference keys - stateHeaderKeys :: KeyTable, -- ^ List of implicit header ref keys - stateSubstitutions :: SubstTable, -- ^ List of substitution references - stateNotes :: NoteTable, -- ^ List of notes (raw bodies) - stateNotes' :: NoteTable', -- ^ List of notes (parsed bodies) - stateMeta :: Meta, -- ^ Document metadata - stateMeta' :: F Meta, -- ^ Document metadata - stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used - stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links) - stateIdentifiers :: Set.Set String, -- ^ Header identifiers used - stateNextExample :: Int, -- ^ Number of next example - stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers - stateHasChapters :: Bool, -- ^ True if \chapter encountered - stateMacros :: [Macro], -- ^ List of macros defined so far - stateRstDefaultRole :: String, -- ^ Current rST default interpreted text role - stateRstCustomRoles :: M.Map String (String, Maybe String, Attr), -- ^ Current rST custom text roles + { stateOptions :: ReaderOptions, -- ^ User options + stateParserContext :: ParserContext, -- ^ Inside list? + stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? + stateAllowLinks :: Bool, -- ^ Allow parsing of links + stateMaxNestingLevel :: Int, -- ^ Max # of nested Strong/Emph + stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed + stateKeys :: KeyTable, -- ^ List of reference keys + stateHeaderKeys :: KeyTable, -- ^ List of implicit header ref keys + stateSubstitutions :: SubstTable, -- ^ List of substitution references + stateNotes :: NoteTable, -- ^ List of notes (raw bodies) + stateNotes' :: NoteTable', -- ^ List of notes (parsed bodies) + stateNoteRefs :: Set.Set String, -- ^ List of note references used + stateMeta :: Meta, -- ^ Document metadata + stateMeta' :: F Meta, -- ^ Document metadata + stateCitations :: M.Map String String, -- ^ RST-style citations + stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used + stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links) + stateIdentifiers :: Set.Set String, -- ^ Header identifiers used + stateNextExample :: Int, -- ^ Number of next example + stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers + stateMacros :: M.Map Text Macro, -- ^ Table of macros defined so far + stateRstDefaultRole :: String, -- ^ Current rST default interpreted text role + stateRstCustomRoles :: M.Map String (String, Maybe String, Attr), -- ^ Current rST custom text roles -- Triple represents: 1) Base role, 2) Optional format (only for :raw: -- roles), 3) Additional classes (rest of Attr is unused)). - stateCaption :: Maybe Inlines, -- ^ Caption in current environment - stateInHtmlBlock :: Maybe String, -- ^ Tag type of HTML block being parsed - stateMarkdownAttribute :: Bool, -- ^ True if in markdown=1 context - stateWarnings :: [String] -- ^ Warnings generated by the parser + stateCaption :: Maybe Inlines, -- ^ Caption in current environment + stateInHtmlBlock :: Maybe String, -- ^ Tag type of HTML block being parsed + stateFencedDivLevel :: Int, -- ^ Depth of fenced div + stateContainers :: [String], -- ^ parent include files + stateLogMessages :: [LogMessage], -- ^ log messages + stateMarkdownAttribute :: Bool -- ^ True if in markdown=1 context } instance Default ParserState where @@ -957,6 +1116,9 @@ class HasReaderOptions st where -- default getOption f = (f . extractReaderOptions) <$> getState +instance HasReaderOptions ParserState where + extractReaderOptions = stateOptions + class HasQuoteContext st m where getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a @@ -972,9 +1134,6 @@ instance Monad m => HasQuoteContext ParserState m where setState newState { stateQuoteContext = oldQuoteContext } return result -instance HasReaderOptions ParserState where - extractReaderOptions = stateOptions - class HasHeaderMap st where extractHeaderMap :: st -> M.Map Inlines String updateHeaderMap :: (M.Map Inlines String -> M.Map Inlines String) -> @@ -993,8 +1152,8 @@ instance HasIdentifierList ParserState where updateIdentifierList f st = st{ stateIdentifiers = f $ stateIdentifiers st } class HasMacros st where - extractMacros :: st -> [Macro] - updateMacros :: ([Macro] -> [Macro]) -> st -> st + extractMacros :: st -> M.Map Text Macro + updateMacros :: (M.Map Text Macro -> M.Map Text Macro) -> st -> st instance HasMacros ParserState where extractMacros = stateMacros @@ -1008,6 +1167,24 @@ instance HasLastStrPosition ParserState where setLastStrPos pos st = st{ stateLastStrPos = Just pos } getLastStrPos st = stateLastStrPos st +class HasLogMessages st where + addLogMessage :: LogMessage -> st -> st + getLogMessages :: st -> [LogMessage] + +instance HasLogMessages ParserState where + addLogMessage msg st = st{ stateLogMessages = msg : stateLogMessages st } + getLogMessages st = reverse $ stateLogMessages st + +class HasIncludeFiles st where + getIncludeFiles :: st -> [String] + addIncludeFile :: String -> st -> st + dropLatestIncludeFile :: st -> st + +instance HasIncludeFiles ParserState where + getIncludeFiles = stateContainers + addIncludeFile f s = s{ stateContainers = f : stateContainers s } + dropLatestIncludeFile s = s { stateContainers = drop 1 $ stateContainers s } + defaultParserState :: ParserState defaultParserState = ParserState { stateOptions = def, @@ -1020,30 +1197,45 @@ defaultParserState = stateHeaderKeys = M.empty, stateSubstitutions = M.empty, stateNotes = [], - stateNotes' = [], + stateNotes' = M.empty, + stateNoteRefs = Set.empty, stateMeta = nullMeta, stateMeta' = return nullMeta, + stateCitations = M.empty, stateHeaderTable = [], stateHeaders = M.empty, stateIdentifiers = Set.empty, stateNextExample = 1, stateExamples = M.empty, - stateHasChapters = False, - stateMacros = [], + stateMacros = M.empty, stateRstDefaultRole = "title-reference", stateRstCustomRoles = M.empty, stateCaption = Nothing, stateInHtmlBlock = Nothing, - stateMarkdownAttribute = False, - stateWarnings = []} + stateFencedDivLevel = 0, + stateContainers = [], + stateLogMessages = [], + stateMarkdownAttribute = False + } + +-- | Add a log message. +logMessage :: (Stream s m a, HasLogMessages st) + => LogMessage -> ParserT s st m () +logMessage msg = updateState (addLogMessage msg) + +-- | Report all the accumulated log messages, according to verbosity level. +reportLogMessages :: (PandocMonad m, HasLogMessages st) => ParserT s st m () +reportLogMessages = do + msgs <- getLogMessages <$> getState + mapM_ report msgs -- | Succeed only if the extension is enabled. guardEnabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () -guardEnabled ext = getOption readerExtensions >>= guard . Set.member ext +guardEnabled ext = getOption readerExtensions >>= guard . extensionEnabled ext -- | Succeed only if the extension is disabled. guardDisabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () -guardDisabled ext = getOption readerExtensions >>= guard . not . Set.member ext +guardDisabled ext = getOption readerExtensions >>= guard . not . extensionEnabled ext -- | Update the position on which the last string ended. updateLastStrPos :: (Stream s m a, HasLastStrPosition st) => ParserT s st m () @@ -1074,7 +1266,8 @@ data QuoteContext type NoteTable = [(String, String)] -type NoteTable' = [(String, F Blocks)] -- used in markdown reader +type NoteTable' = M.Map String (SourcePos, F Blocks) +-- used in markdown reader newtype Key = Key String deriving (Show, Read, Eq, Ord) @@ -1091,37 +1284,40 @@ type SubstTable = M.Map Key Inlines -- with its associated identifier. If the identifier is null -- and the auto_identifers extension is set, generate a new -- unique identifier, and update the list of identifiers --- in state. -registerHeader :: (Stream s m a, HasReaderOptions st, HasHeaderMap st, HasIdentifierList st) +-- in state. Issue a warning if an explicit identifier +-- is encountered that duplicates an earlier identifier +-- (explict or automatically generated). +registerHeader :: (Stream s m a, HasReaderOptions st, + HasHeaderMap st, HasLogMessages st, HasIdentifierList st) => Attr -> Inlines -> ParserT s st m Attr registerHeader (ident,classes,kvs) header' = do ids <- extractIdentifierList <$> getState exts <- getOption readerExtensions let insert' = M.insertWith (\_new old -> old) - if null ident && Ext_auto_identifiers `Set.member` exts + if null ident && Ext_auto_identifiers `extensionEnabled` exts then do let id' = uniqueIdent (B.toList header') ids - let id'' = if Ext_ascii_identifiers `Set.member` exts - then catMaybes $ map toAsciiChar id' + let id'' = if Ext_ascii_identifiers `extensionEnabled` exts + then mapMaybe toAsciiChar id' else id' updateState $ updateIdentifierList $ Set.insert id' updateState $ updateIdentifierList $ Set.insert id'' updateState $ updateHeaderMap $ insert' header' id' return (id'',classes,kvs) else do - unless (null ident) $ + unless (null ident) $ do + when (ident `Set.member` ids) $ do + pos <- getPosition + logMessage $ DuplicateIdentifier ident pos + updateState $ updateIdentifierList $ Set.insert ident updateState $ updateHeaderMap $ insert' header' ident return (ident,classes,kvs) --- | Fail unless we're in "smart typography" mode. -failUnlessSmart :: (Stream s m a, HasReaderOptions st) => ParserT s st m () -failUnlessSmart = getOption readerSmart >>= guard - smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) => ParserT s st m Inlines -> ParserT s st m Inlines smartPunctuation inlineParser = do - failUnlessSmart + guardEnabled Ext_smart choice [ quoted inlineParser, apostrophe, dash, ellipses ] apostrophe :: Stream s m Char => ParserT s st m Inlines @@ -1153,9 +1349,7 @@ failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) -> ParserT s st m () failIfInQuoteContext context = do context' <- getQuoteContext - if context' == context - then fail "already inside quotes" - else return () + when (context' == context) $ fail "already inside quotes" charOrRef :: Stream s m Char => String -> ParserT s st m Char charOrRef cs = @@ -1195,7 +1389,7 @@ ellipses = try (string "..." >> return (B.str "\8230")) dash :: (HasReaderOptions st, Stream s m Char) => ParserT s st m Inlines dash = try $ do - oldDashes <- getOption readerOldDashes + oldDashes <- extensionEnabled Ext_old_dashes <$> getOption readerExtensions if oldDashes then do char '-' @@ -1241,43 +1435,6 @@ token :: (Stream s m t) -> ParsecT s st m a token pp pos match = tokenPrim pp (\_ t _ -> pos t) match --- --- Macros --- - --- | Parse a \newcommand or \renewcommand macro definition. -macro :: (Stream [Char] m Char, HasMacros st, HasReaderOptions st) - => ParserT [Char] st m Blocks -macro = do - apply <- getOption readerApplyMacros - inp <- getInput - case parseMacroDefinitions inp of - ([], _) -> mzero - (ms, rest) -> do def' <- count (length inp - length rest) anyChar - if apply - then do - updateState $ \st -> - updateMacros (ms ++) st - return mempty - else return $ rawBlock "latex" def' - --- | Apply current macros to string. -applyMacros' :: (HasReaderOptions st, HasMacros st, Stream [Char] m Char) - => String - -> ParserT [Char] st m String -applyMacros' target = do - apply <- getOption readerApplyMacros - if apply - then do macros <- extractMacros <$> getState - return $ applyMacros macros target - else return target - --- | Append a warning to the log. -addWarning :: Maybe SourcePos -> String -> Parser [Char] ParserState () -addWarning mbpos msg = - updateState $ \st -> st{ - stateWarnings = (msg ++ maybe "" (\pos -> " " ++ show pos) mbpos) : - stateWarnings st } infixr 5 <+?> (<+?>) :: (Monoid a) => ParserT s st m a -> ParserT s st m a -> ParserT s st m a a <+?> b = a >>= flip fmap (try b <|> return mempty) . (<>) @@ -1285,10 +1442,53 @@ a <+?> b = a >>= flip fmap (try b <|> return mempty) . (<>) extractIdClass :: Attr -> Attr extractIdClass (ident, cls, kvs) = (ident', cls', kvs') where - ident' = case (lookup "id" kvs) of - Just v -> v - Nothing -> ident - cls' = case (lookup "class" kvs) of + ident' = fromMaybe ident (lookup "id" kvs) + cls' = case lookup "class" kvs of Just cl -> words cl Nothing -> cls kvs' = filter (\(k,_) -> k /= "id" || k /= "class") kvs + +insertIncludedFile' :: (PandocMonad m, HasIncludeFiles st, + Functor mf, Applicative mf, Monad mf) + => ParserT [a] st m (mf Blocks) + -> (String -> [a]) + -> [FilePath] -> FilePath + -> ParserT [a] st m (mf Blocks) +insertIncludedFile' blocks totoks dirs f = do + oldPos <- getPosition + oldInput <- getInput + containers <- getIncludeFiles <$> getState + when (f `elem` containers) $ + throwError $ PandocParseError $ "Include file loop at " ++ show oldPos + updateState $ addIncludeFile f + mbcontents <- readFileFromDirs dirs f + contents <- case mbcontents of + Just s -> return s + Nothing -> do + report $ CouldNotLoadIncludeFile f oldPos + return "" + setPosition $ newPos f 1 1 + setInput $ totoks contents + bs <- blocks + setInput oldInput + setPosition oldPos + updateState dropLatestIncludeFile + return bs + +-- | Parse content of include file as blocks. Circular includes result in an +-- @PandocParseError@. +insertIncludedFile :: (PandocMonad m, HasIncludeFiles st) + => ParserT [a] st m Blocks + -> (String -> [a]) + -> [FilePath] -> FilePath + -> ParserT [a] st m Blocks +insertIncludedFile blocks totoks dirs f = + runIdentity <$> insertIncludedFile' (Identity <$> blocks) totoks dirs f + +-- | Parse content of include file as future blocks. Circular includes result in +-- an @PandocParseError@. +insertIncludedFileF :: (PandocMonad m, HasIncludeFiles st) + => ParserT String st m (Future st Blocks) + -> [FilePath] -> FilePath + -> ParserT String st m (Future st Blocks) +insertIncludedFileF p = insertIncludedFile' p id |