summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorAlexander Krotov <ilabdsf@gmail.com>2018-02-06 03:17:31 +0300
committerAlexander Krotov <ilabdsf@gmail.com>2018-02-12 17:30:57 +0300
commit8aed3652c2cb1811aa5685bbeb7c97b097b2eed4 (patch)
tree0e6e43e5c539c1c09d7d64be4abd07c61ba427b9 /src/Text/Pandoc
parent10c8b9f4bbd78de75ebd134547445e9f1df13248 (diff)
Muse reader: refactor to avoid reparsing
Lists are parsed in linear instead of exponential time now. Contents of block tags, such as <quote>, is parsed directly, without storing it in a string and parsing with parseFromString. Fixed a bug: headers did not terminate lists.
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Readers/Muse.hs273
1 files changed, 195 insertions, 78 deletions
diff --git a/src/Text/Pandoc/Readers/Muse.hs b/src/Text/Pandoc/Readers/Muse.hs
index 1385533b3..c8ebe1883 100644
--- a/src/Text/Pandoc/Readers/Muse.hs
+++ b/src/Text/Pandoc/Readers/Muse.hs
@@ -47,7 +47,7 @@ import Data.List (stripPrefix, intercalate)
import Data.List.Split (splitOn)
import qualified Data.Map as M
import qualified Data.Set as Set
-import Data.Maybe (fromMaybe)
+import Data.Maybe (fromMaybe, isNothing)
import Data.Text (Text, unpack)
import System.FilePath (takeExtension)
import Text.HTML.TagSoup
@@ -82,6 +82,7 @@ data MuseState = MuseState { museMeta :: F Meta -- ^ Document metadata
, museLogMessages :: [LogMessage]
, museNotes :: M.Map String (SourcePos, F Blocks)
, museInLink :: Bool
+ , museInPara :: Bool
}
instance Default MuseState where
@@ -96,6 +97,7 @@ defaultMuseState = MuseState { museMeta = return nullMeta
, museLogMessages = []
, museNotes = M.empty
, museInLink = False
+ , museInPara = False
}
type MuseParser = ParserT String MuseState
@@ -149,6 +151,12 @@ htmlElement tag = try $ do
where
endtag = void $ htmlTag (~== TagClose tag)
+htmlBlock :: PandocMonad m => String -> MuseParser m (Attr, String)
+htmlBlock tag = try $ do
+ res <- htmlElement tag
+ manyTill spaceChar eol
+ return res
+
htmlAttrToPandoc :: [Attribute String] -> Attr
htmlAttrToPandoc attrs = (ident, classes, keyvals)
where
@@ -159,13 +167,13 @@ htmlAttrToPandoc attrs = (ident, classes, keyvals)
parseHtmlContent :: PandocMonad m
=> String -> MuseParser m (Attr, F Blocks)
parseHtmlContent tag = do
- (attr, content) <- htmlElement tag
- parsedContent <- parseContent (content ++ "\n")
+ (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag [])
+ manyTill spaceChar eol
+ content <- parseBlocksTill (manyTill spaceChar endtag)
manyTill spaceChar eol -- closing tag must be followed by optional whitespace and newline
- return (attr, mconcat parsedContent)
+ return (htmlAttrToPandoc attr, content)
where
- parseContent = parseFromString $ manyTill parseBlock endOfContent
- endOfContent = try $ skipMany blankline >> skipSpaces >> eof
+ endtag = void $ htmlTag (~== TagClose tag)
commonPrefix :: String -> String -> String
commonPrefix _ [] = []
@@ -248,19 +256,85 @@ directive = do
parseBlocks :: PandocMonad m
=> MuseParser m (F Blocks)
parseBlocks =
- try (mempty <$ eof) <|>
+ try parseEnd <|>
try blockStart <|>
+ try listStart <|>
try paraStart
where
+ parseEnd = mempty <$ eof
blockStart = do first <- header <|> blockElements <|> amuseNoteBlock <|> emacsNoteBlock
rest <- parseBlocks
return $ first B.<> rest
+ listStart = do
+ st <- getState
+ setState $ st{ museInPara = False }
+ (first, rest) <- anyListUntil parseBlocks
+ return $ first B.<> rest
paraStart = do
indent <- length <$> many spaceChar
- (first, rest) <- paraUntil ((mempty <$ eof) <|> blockStart)
+ (first, rest) <- paraUntil parseBlocks
let first' = if indent >= 2 && indent < 6 then B.blockQuote <$> first else first
return $ first' B.<> rest
+parseBlocksTill :: PandocMonad m
+ => MuseParser m a
+ -> MuseParser m (F Blocks)
+parseBlocksTill end =
+ try parseEnd <|>
+ try blockStart <|>
+ try listStart <|>
+ try paraStart
+ where
+ parseEnd = mempty <$ end
+ blockStart = do first <- blockElements
+ rest <- continuation
+ return $ first B.<> rest
+ listStart = do
+ st <- getState
+ setState $ st{ museInPara = False }
+ (first, e) <- anyListUntil ((Left <$> end) <|> (Right <$> continuation))
+ case e of
+ Left _ -> return first
+ Right rest -> return $ first B.<> rest
+ paraStart = do (first, e) <- paraUntil ((Left <$> end) <|> (Right <$> continuation))
+ case e of
+ Left _ -> return $ first
+ Right rest -> return $ first B.<> rest
+ continuation = parseBlocksTill end
+
+listItemContentsUntil :: PandocMonad m
+ => Int
+ -> MuseParser m a
+ -> MuseParser m (F Blocks, a)
+listItemContentsUntil col end =
+ try blockStart <|>
+ try listStart <|>
+ try paraStart
+ where
+ parseEnd = do e <- end
+ return (mempty, e)
+ paraStart = do
+ (first, e) <- paraUntil ((Right <$> continuation) <|> (Left <$> end))
+ case e of
+ Left ee -> return (first, ee)
+ Right (rest, ee) -> return (first B.<> rest, ee)
+ blockStart = do first <- blockElements
+ (rest, e) <- continuation <|> parseEnd
+ return (first B.<> rest, e)
+ listStart = do
+ st <- getState
+ setState $ st{ museInPara = False }
+ (first, e) <- anyListUntil ((Right <$> continuation) <|> (Left <$> end))
+ case e of
+ Left ee -> return (first, ee)
+ Right (rest, ee) -> return $ (first B.<> rest, ee)
+ continuation = try $ do blank <- optionMaybe blankline
+ skipMany blankline
+ indentWith col
+ st <- getState
+ setState $ st{ museInPara = museInPara st && isNothing blank }
+ listItemContentsUntil col end
+
parseBlock :: PandocMonad m => MuseParser m (F Blocks)
parseBlock = do
res <- blockElements <|> para
@@ -269,24 +343,24 @@ parseBlock = do
where para = fst <$> paraUntil (try (eof <|> void (lookAhead blockElements)))
blockElements :: PandocMonad m => MuseParser m (F Blocks)
-blockElements = choice [ mempty <$ blankline
- , comment
- , separator
- , example
- , exampleTag
- , literalTag
- , centerTag
- , rightTag
- , quoteTag
- , divTag
- , verseTag
- , lineBlock
- , bulletList
- , orderedList
- , definitionList
- , table
- , commentTag
- ]
+blockElements = do
+ st <- getState
+ setState $ st{ museInPara = False }
+ choice [ mempty <$ blankline
+ , comment
+ , separator
+ , example
+ , exampleTag
+ , literalTag
+ , centerTag
+ , rightTag
+ , quoteTag
+ , divTag
+ , verseTag
+ , lineBlock
+ , table
+ , commentTag
+ ]
comment :: PandocMonad m => MuseParser m (F Blocks)
comment = try $ do
@@ -343,13 +417,13 @@ dropSpacePrefix lns =
exampleTag :: PandocMonad m => MuseParser m (F Blocks)
exampleTag = try $ do
many spaceChar
- (attr, contents) <- htmlElement "example"
+ (attr, contents) <- htmlBlock "example"
return $ return $ B.codeBlockWith attr $ rchop $ intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop contents
literalTag :: PandocMonad m => MuseParser m (F Blocks)
literalTag = do
guardDisabled Ext_amuse -- Text::Amuse does not support <literal>
- (return . rawBlock) <$> htmlElement "literal"
+ (return . rawBlock) <$> htmlBlock "literal"
where
-- FIXME: Emacs Muse inserts <literal> without style into all output formats, but we assume HTML
format (_, _, kvs) = fromMaybe "html" $ lookup "style" kvs
@@ -385,18 +459,22 @@ verseLines = do
verseTag :: PandocMonad m => MuseParser m (F Blocks)
verseTag = do
- (_, content) <- htmlElement "verse"
+ (_, content) <- htmlBlock "verse"
parseFromString verseLines (intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop content)
commentTag :: PandocMonad m => MuseParser m (F Blocks)
-commentTag = htmlElement "comment" >> return mempty
+commentTag = htmlBlock "comment" >> return mempty
-- Indented paragraph is either center, right or quote
paraUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
paraUntil end = do
+ state <- getState
+ guard $ not $ museInPara state
+ setState $ state{ museInPara = True }
(l, e) <- someUntil inline $ try (manyTill spaceChar eol >> end)
+ updateState (\st -> st { museInPara = False })
return (fmap (B.para) $ trimInlinesF $ mconcat l, e)
noteMarker :: PandocMonad m => MuseParser m String
@@ -413,6 +491,8 @@ amuseNoteBlock = try $ do
guardEnabled Ext_amuse
pos <- getPosition
ref <- noteMarker <* spaceChar
+ st <- getState
+ setState $ st{ museInPara = False }
content <- listItemContents
oldnotes <- museNotes <$> getState
case M.lookup ref oldnotes of
@@ -465,35 +545,36 @@ lineBlock = try $ do
-- lists
--
-listItemContents' :: PandocMonad m => Int -> MuseParser m (F Blocks)
-listItemContents' col =
- mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col)
+bulletListItemsUntil :: PandocMonad m
+ => Int
+ -> MuseParser m a
+ -> MuseParser m ([F Blocks], a)
+bulletListItemsUntil indent end = try $ do
+ char '-'
+ void spaceChar <|> lookAhead eol
+ st <- getState
+ setState $ st{ museInPara = False }
+ (x, e) <- listItemContentsUntil (indent + 2) ((Right <$> try (optional blankline >> indentWith indent >> bulletListItemsUntil indent end)) <|> (Left <$> end))
+ case e of
+ Left ee -> return ([x], ee)
+ Right (xs, ee) -> return (x:xs, ee)
+
+bulletListUntil :: PandocMonad m
+ => MuseParser m a
+ -> MuseParser m (F Blocks, a)
+bulletListUntil end = try $ do
+ many spaceChar
+ pos <- getPosition
+ let indent = sourceColumn pos - 1
+ guard $ indent /= 0
+ (items, e) <- bulletListItemsUntil indent end
+ return $ (B.bulletList <$> sequence items, e)
listItemContents :: PandocMonad m => MuseParser m (F Blocks)
listItemContents = do
pos <- getPosition
let col = sourceColumn pos - 1
- listItemContents' col
-
-listItem :: PandocMonad m => Int -> MuseParser m a -> MuseParser m (F Blocks)
-listItem n p = try $ do
- optional blankline
- count n spaceChar
- p
- void spaceChar <|> lookAhead eol
- listItemContents
-
-bulletList :: PandocMonad m => MuseParser m (F Blocks)
-bulletList = try $ do
- many spaceChar
- pos <- getPosition
- let col = sourceColumn pos
- guard $ col /= 1
- char '-'
- void spaceChar <|> lookAhead eol
- first <- listItemContents
- rest <- many $ listItem (col - 1) (char '-')
- return $ B.bulletList <$> sequence (first : rest)
+ mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col)
-- | Parses an ordered list marker and returns list attributes.
anyMuseOrderedListMarker :: PandocMonad m => MuseParser m ListAttributes
@@ -516,38 +597,74 @@ museOrderedListMarker style = do
char '.'
return start
-orderedList :: PandocMonad m => MuseParser m (F Blocks)
-orderedList = try $ do
+orderedListItemsUntil :: PandocMonad m
+ => Int
+ -> ListNumberStyle
+ -> MuseParser m a
+ -> MuseParser m ([F Blocks], a)
+orderedListItemsUntil indent style end =
+ continuation
+ where
+ continuation = try $ do
+ pos <- getPosition
+ void spaceChar <|> lookAhead eol
+ st <- getState
+ setState $ st{ museInPara = False }
+ (x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optionMaybe blankline >> indentWith indent >> museOrderedListMarker style >> continuation)) <|> (Left <$> end))
+ case e of
+ Left ee -> return ([x], ee)
+ Right (xs, ee) -> return (x:xs, ee)
+
+orderedListUntil :: PandocMonad m
+ => MuseParser m a
+ -> MuseParser m (F Blocks, a)
+orderedListUntil end = try $ do
many spaceChar
pos <- getPosition
- let col = sourceColumn pos
- guard $ col /= 1
+ let indent = sourceColumn pos - 1
+ guard $ indent /= 0
p@(_, style, _) <- anyMuseOrderedListMarker
guard $ style `elem` [Decimal, LowerAlpha, UpperAlpha, LowerRoman, UpperRoman]
- void spaceChar <|> lookAhead eol
- first <- listItemContents
- rest <- many $ listItem (col - 1) (museOrderedListMarker style)
- return $ B.orderedListWith p <$> sequence (first : rest)
-
-definitionListItem :: PandocMonad m => MuseParser m (F (Inlines, [Blocks]))
-definitionListItem = try $ do
- pos <- getPosition
- term <- trimInlinesF . mconcat <$> manyTill (choice inlineList) (string "::")
- void spaceChar <|> lookAhead eol
- contents <- listItemContents' $ sourceColumn pos
- pure $ do lineContent' <- contents
- term' <- term
- pure (term', [lineContent'])
-
-definitionList :: PandocMonad m => MuseParser m (F Blocks)
-definitionList = try $ do
+ (items, e) <- orderedListItemsUntil indent style end
+ return $ (B.orderedListWith p <$> sequence items, e)
+
+definitionListItemsUntil :: PandocMonad m
+ => Int
+ -> MuseParser m a
+ -> MuseParser m ([F (Inlines, [Blocks])], a)
+definitionListItemsUntil indent end =
+ continuation
+ where continuation = try $ do
+ pos <- getPosition
+ term <- trimInlinesF . mconcat <$> manyTill (choice inlineList) (string "::")
+ void spaceChar <|> lookAhead eol
+ st <- getState
+ setState $ st{ museInPara = False }
+ (x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optional blankline >> count indent spaceChar >> continuation)) <|> (Left <$> end))
+ let xx = do
+ term' <- term
+ x' <- x
+ (return (term', [x']))::(F (Inlines, [Blocks]))
+ case e of
+ Left ee -> return $ ([xx], ee)
+ Right (xs, ee) -> return $ (xx : xs, ee)
+
+definitionListUntil :: PandocMonad m
+ => MuseParser m a
+ -> MuseParser m (F Blocks, a)
+definitionListUntil end = try $ do
many spaceChar
pos <- getPosition
let indent = sourceColumn pos - 1
guardDisabled Ext_amuse <|> guard (indent /= 0) -- Initial space is required by Amusewiki, but not Emacs Muse
- first <- definitionListItem
- rest <- many $ try (optional blankline >> count indent spaceChar >> definitionListItem)
- return $ B.definitionList <$> sequence (first : rest)
+ (items, e) <- definitionListItemsUntil indent end
+ return (B.definitionList <$> sequence items, e)
+
+anyListUntil :: PandocMonad m
+ => MuseParser m a
+ -> MuseParser m (F Blocks, a)
+anyListUntil end =
+ bulletListUntil end <|> orderedListUntil end <|> definitionListUntil end
--
-- tables