diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
48 files changed, 10595 insertions, 5719 deletions
diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index d20d386e7..6fbc09c17 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2015-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.CommonMark - Copyright : Copyright (C) 2015 John MacFarlane + Copyright : Copyright (C) 2015-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -32,48 +32,94 @@ CommonMark is a strongly specified variant of Markdown: http://commonmark.org. module Text.Pandoc.Readers.CommonMark (readCommonMark) where -import CMark -import Data.Text (unpack, pack) +import CMarkGFM +import Control.Monad.State +import Data.Char (isAlphaNum, isLetter, isSpace, toLower) import Data.List (groupBy) +import qualified Data.Map as Map +import Data.Text (Text, unpack) +import Text.Pandoc.Class (PandocMonad) import Text.Pandoc.Definition +import Text.Pandoc.Emoji (emojis) import Text.Pandoc.Options -import Text.Pandoc.Error +import Text.Pandoc.Shared (stringify) +import Text.Pandoc.Walk (walkM) -- | Parse a CommonMark formatted string into a 'Pandoc' structure. -readCommonMark :: ReaderOptions -> String -> Either PandocError Pandoc -readCommonMark opts = Right . nodeToPandoc . commonmarkToNode opts' . pack - where opts' = if readerSmart opts - then [optNormalize, optSmart] - else [optNormalize] - -nodeToPandoc :: Node -> Pandoc -nodeToPandoc (Node _ DOCUMENT nodes) = - Pandoc nullMeta $ foldr addBlock [] nodes -nodeToPandoc n = -- shouldn't happen - Pandoc nullMeta $ foldr addBlock [] [n] - -addBlocks :: [Node] -> [Block] -addBlocks = foldr addBlock [] - -addBlock :: Node -> [Block] -> [Block] -addBlock (Node _ PARAGRAPH nodes) = - (Para (addInlines nodes) :) -addBlock (Node _ THEMATIC_BREAK _) = +readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readCommonMark opts s = return $ + (if isEnabled Ext_gfm_auto_identifiers opts + then addHeaderIdentifiers + else id) $ + nodeToPandoc opts $ commonmarkToNode opts' exts s + where opts' = [ optSmart | isEnabled Ext_smart opts ] + exts = [ extStrikethrough | isEnabled Ext_strikeout opts ] ++ + [ extTable | isEnabled Ext_pipe_tables opts ] ++ + [ extAutolink | isEnabled Ext_autolink_bare_uris opts ] + +convertEmojis :: String -> String +convertEmojis (':':xs) = + case break (==':') xs of + (ys,':':zs) -> + case Map.lookup ys emojis of + Just s -> s ++ convertEmojis zs + Nothing -> ':' : ys ++ convertEmojis (':':zs) + _ -> ':':xs +convertEmojis (x:xs) = x : convertEmojis xs +convertEmojis [] = [] + +addHeaderIdentifiers :: Pandoc -> Pandoc +addHeaderIdentifiers doc = evalState (walkM addHeaderId doc) mempty + +addHeaderId :: Block -> State (Map.Map String Int) Block +addHeaderId (Header lev (_,classes,kvs) ils) = do + idmap <- get + let ident = toIdent ils + ident' <- case Map.lookup ident idmap of + Nothing -> do + put (Map.insert ident 1 idmap) + return ident + Just i -> do + put (Map.adjust (+ 1) ident idmap) + return (ident ++ "-" ++ show i) + return $ Header lev (ident',classes,kvs) ils +addHeaderId x = return x + +toIdent :: [Inline] -> String +toIdent = map (\c -> if isSpace c then '-' else c) + . filter (\c -> isLetter c || isAlphaNum c || isSpace c || + c == '_' || c == '-') + . map toLower . stringify + +nodeToPandoc :: ReaderOptions -> Node -> Pandoc +nodeToPandoc opts (Node _ DOCUMENT nodes) = + Pandoc nullMeta $ foldr (addBlock opts) [] nodes +nodeToPandoc opts n = -- shouldn't happen + Pandoc nullMeta $ foldr (addBlock opts) [] [n] + +addBlocks :: ReaderOptions -> [Node] -> [Block] +addBlocks opts = foldr (addBlock opts) [] + +addBlock :: ReaderOptions -> Node -> [Block] -> [Block] +addBlock opts (Node _ PARAGRAPH nodes) = + (Para (addInlines opts nodes) :) +addBlock _ (Node _ THEMATIC_BREAK _) = (HorizontalRule :) -addBlock (Node _ BLOCK_QUOTE nodes) = - (BlockQuote (addBlocks nodes) :) -addBlock (Node _ (HTML_BLOCK t) _) = - (RawBlock (Format "html") (unpack t) :) +addBlock opts (Node _ BLOCK_QUOTE nodes) = + (BlockQuote (addBlocks opts nodes) :) +addBlock opts (Node _ (HTML_BLOCK t) _) + | isEnabled Ext_raw_html opts = (RawBlock (Format "html") (unpack t) :) + | otherwise = id -- Note: the cmark parser will never generate CUSTOM_BLOCK, -- so we don't need to handle it: -addBlock (Node _ (CUSTOM_BLOCK _onEnter _onExit) _nodes) = +addBlock _ (Node _ (CUSTOM_BLOCK _onEnter _onExit) _nodes) = id -addBlock (Node _ (CODE_BLOCK info t) _) = +addBlock _ (Node _ (CODE_BLOCK info t) _) = (CodeBlock ("", take 1 (words (unpack info)), []) (unpack t) :) -addBlock (Node _ (HEADING lev) nodes) = - (Header lev ("",[],[]) (addInlines nodes) :) -addBlock (Node _ (LIST listAttrs) nodes) = - (constructor (map (setTightness . addBlocks . children) nodes) :) +addBlock opts (Node _ (HEADING lev) nodes) = + (Header lev ("",[],[]) (addInlines opts nodes) :) +addBlock opts (Node _ (LIST listAttrs) nodes) = + (constructor (map (setTightness . addBlocks opts . children) nodes) :) where constructor = case listType listAttrs of BULLET_LIST -> BulletList ORDERED_LIST -> OrderedList @@ -82,46 +128,108 @@ addBlock (Node _ (LIST listAttrs) nodes) = setTightness = if listTight listAttrs then map paraToPlain else id - paraToPlain (Para xs) = Plain (xs) + paraToPlain (Para xs) = Plain xs paraToPlain x = x delim = case listDelim listAttrs of - PERIOD_DELIM -> Period - PAREN_DELIM -> OneParen -addBlock (Node _ ITEM _) = id -- handled in LIST -addBlock _ = id + PERIOD_DELIM -> Period + PAREN_DELIM -> OneParen +addBlock opts (Node _ (TABLE alignments) nodes) = + (Table [] aligns widths headers rows :) + where aligns = map fromTableCellAlignment alignments + fromTableCellAlignment NoAlignment = AlignDefault + fromTableCellAlignment LeftAligned = AlignLeft + fromTableCellAlignment RightAligned = AlignRight + fromTableCellAlignment CenterAligned = AlignCenter + widths = replicate numcols 0.0 + numcols = if null rows' + then 0 + else maximum $ map length rows' + rows' = map toRow $ filter isRow nodes + (headers, rows) = case rows' of + (h:rs) -> (h, rs) + [] -> ([], []) + isRow (Node _ TABLE_ROW _) = True + isRow _ = False + isCell (Node _ TABLE_CELL _) = True + isCell _ = False + toRow (Node _ TABLE_ROW ns) = map toCell $ filter isCell ns + toRow (Node _ t _) = error $ "toRow encountered non-row " ++ show t + toCell (Node _ TABLE_CELL []) = [] + toCell (Node _ TABLE_CELL (n:ns)) + | isBlockNode n = addBlocks opts (n:ns) + | otherwise = [Plain (addInlines opts (n:ns))] + toCell (Node _ t _) = error $ "toCell encountered non-cell " ++ show t +addBlock _ (Node _ TABLE_ROW _) = id -- handled in TABLE +addBlock _ (Node _ TABLE_CELL _) = id -- handled in TABLE +addBlock _ _ = id + +isBlockNode :: Node -> Bool +isBlockNode (Node _ nodetype _) = + case nodetype of + DOCUMENT -> True + THEMATIC_BREAK -> True + PARAGRAPH -> True + BLOCK_QUOTE -> True + HTML_BLOCK _ -> True + CUSTOM_BLOCK _ _ -> True + CODE_BLOCK _ _ -> True + HEADING _ -> True + LIST _ -> True + ITEM -> True + TEXT _ -> False + SOFTBREAK -> False + LINEBREAK -> False + HTML_INLINE _ -> False + CUSTOM_INLINE _ _ -> False + CODE _ -> False + EMPH -> False + STRONG -> False + LINK _ _ -> False + IMAGE _ _ -> False + STRIKETHROUGH -> False + TABLE _ -> False + TABLE_ROW -> False + TABLE_CELL -> False children :: Node -> [Node] children (Node _ _ ns) = ns -addInlines :: [Node] -> [Inline] -addInlines = foldr addInline [] +addInlines :: ReaderOptions -> [Node] -> [Inline] +addInlines opts = foldr (addInline opts) [] -addInline :: Node -> [Inline] -> [Inline] -addInline (Node _ (TEXT t) _) = (map toinl clumps ++) +addInline :: ReaderOptions -> Node -> [Inline] -> [Inline] +addInline opts (Node _ (TEXT t) _) = (map toinl clumps ++) where raw = unpack t clumps = groupBy samekind raw samekind ' ' ' ' = True samekind ' ' _ = False samekind _ ' ' = False samekind _ _ = True - toinl (' ':_) = Space - toinl xs = Str xs -addInline (Node _ LINEBREAK _) = (LineBreak :) -addInline (Node _ SOFTBREAK _) = (SoftBreak :) -addInline (Node _ (HTML_INLINE t) _) = - (RawInline (Format "html") (unpack t) :) + toinl (' ':_) = Space + toinl xs = Str $ if isEnabled Ext_emoji opts + then convertEmojis xs + else xs +addInline _ (Node _ LINEBREAK _) = (LineBreak :) +addInline opts (Node _ SOFTBREAK _) + | isEnabled Ext_hard_line_breaks opts = (LineBreak :) + | otherwise = (SoftBreak :) +addInline opts (Node _ (HTML_INLINE t) _) + | isEnabled Ext_raw_html opts = (RawInline (Format "html") (unpack t) :) + | otherwise = id -- Note: the cmark parser will never generate CUSTOM_BLOCK, -- so we don't need to handle it: -addInline (Node _ (CUSTOM_INLINE _onEnter _onExit) _nodes) = +addInline _ (Node _ (CUSTOM_INLINE _onEnter _onExit) _nodes) = id -addInline (Node _ (CODE t) _) = +addInline _ (Node _ (CODE t) _) = (Code ("",[],[]) (unpack t) :) -addInline (Node _ EMPH nodes) = - (Emph (addInlines nodes) :) -addInline (Node _ STRONG nodes) = - (Strong (addInlines nodes) :) -addInline (Node _ (LINK url title) nodes) = - (Link nullAttr (addInlines nodes) (unpack url, unpack title) :) -addInline (Node _ (IMAGE url title) nodes) = - (Image nullAttr (addInlines nodes) (unpack url, unpack title) :) -addInline _ = id +addInline opts (Node _ EMPH nodes) = + (Emph (addInlines opts nodes) :) +addInline opts (Node _ STRONG nodes) = + (Strong (addInlines opts nodes) :) +addInline opts (Node _ STRIKETHROUGH nodes) = + (Strikeout (addInlines opts nodes) :) +addInline opts (Node _ (LINK url title) nodes) = + (Link nullAttr (addInlines opts nodes) (unpack url, unpack title) :) +addInline opts (Node _ (IMAGE url title) nodes) = + (Image nullAttr (addInlines opts nodes) (unpack url, unpack title) :) +addInline _ _ = id diff --git a/src/Text/Pandoc/Readers/Creole.hs b/src/Text/Pandoc/Readers/Creole.hs new file mode 100644 index 000000000..505d1686d --- /dev/null +++ b/src/Text/Pandoc/Readers/Creole.hs @@ -0,0 +1,320 @@ +{- + Copyright (C) 2017 Sascha Wilde <wilde@sha-bang.de> + + partly based on all the other readers, especialy the work by + John MacFarlane <jgm@berkeley.edu> and + Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> + all bugs are solely created by me. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Creole + Copyright : Copyright (C) 2017 Sascha Wilde + License : GNU GPL, version 2 or above + + Maintainer : Sascha Wilde <wilde@sha-bang.de> + Stability : alpha + Portability : portable + +Conversion of creole text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Creole ( readCreole + ) where + +import Control.Monad.Except (guard, liftM2, throwError) +import qualified Data.Foldable as F +import Data.Maybe (fromMaybe) +import Data.Monoid +import Data.Text (Text) +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (enclosed) +import Text.Pandoc.Shared (crFilter) + + +-- | Read creole from an input string and return a Pandoc document. +readCreole :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readCreole opts s = do + res <- readWithM parseCreole def{ stateOptions = opts } + (T.unpack (crFilter s) ++ "\n\n") + case res of + Left e -> throwError e + Right d -> return d + +type CRLParser = ParserT [Char] ParserState + +-- +-- Utility functions +-- + +(<+>) :: (Monad m, Monoid a) => m a -> m a -> m a +(<+>) = liftM2 (<>) + +-- we have to redefine `enclosed' from Text.Pandoc.Parsing, because it +-- assumes, that there can't be a space after the start parser, but +-- with creole this is possible. +enclosed :: (Show end, PandocMonad m) => CRLParser m start -- ^ start parser + -> CRLParser m end -- ^ end parser + -> CRLParser m a -- ^ content parser (to be used repeatedly) + -> CRLParser m [a] +enclosed start end parser = try $ start >> many1Till parser end + +-- +-- main parser +-- + +specialChars :: [Char] +specialChars = "*/~{}\\|[]()<>\"'" + +parseCreole :: PandocMonad m => CRLParser m Pandoc +parseCreole = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + + +-- +-- block parsers +-- + +block :: PandocMonad m => CRLParser m B.Blocks +block = do + res <- mempty <$ skipMany1 blankline + <|> nowiki + <|> header + <|> horizontalRule + <|> anyList 1 + <|> table + <|> para + skipMany blankline + return res + +nowiki :: PandocMonad m => CRLParser m B.Blocks +nowiki = try $ fmap (B.codeBlock . mconcat) (nowikiStart + >> manyTill content nowikiEnd) + where + content = brackets <|> line + brackets = try $ option "" ((:[]) <$> newline) + <+> (char ' ' >> (many (char ' ') <+> string "}}}") <* eol) + line = option "" ((:[]) <$> newline) <+> manyTill anyChar eol + eol = lookAhead $ try $ nowikiEnd <|> newline + nowikiStart = optional newline >> string "{{{" >> skipMany spaceChar >> newline + nowikiEnd = try $ linebreak >> string "}}}" >> skipMany spaceChar >> newline + +header :: PandocMonad m => CRLParser m B.Blocks +header = try $ do + skipSpaces + level <- + fmap length (many1 (char '=')) + guard $ level <= 6 + skipSpaces + content <- B.str <$> manyTill (noneOf "\n") headerEnd + return $ B.header level content + where + headerEnd = try $ skipSpaces >> many (char '=') >> skipSpaces >> newline + +unorderedList :: PandocMonad m => Int -> CRLParser m B.Blocks +unorderedList = list '*' B.bulletList + +orderedList :: PandocMonad m => Int -> CRLParser m B.Blocks +orderedList = list '#' B.orderedList + +anyList :: PandocMonad m => Int -> CRLParser m B.Blocks +anyList n = unorderedList n <|> orderedList n + +anyListItem :: PandocMonad m => Int -> CRLParser m B.Blocks +anyListItem n = listItem '*' n <|> listItem '#' n + +list :: PandocMonad m => Char -> ([B.Blocks] -> B.Blocks) -> Int -> CRLParser m B.Blocks +list c f n = + fmap f (many1 (itemPlusSublist <|> listItem c n)) + where itemPlusSublist = try $ listItem c n <+> anyList (n+1) + +listItem :: PandocMonad m => Char -> Int -> CRLParser m B.Blocks +listItem c n = + fmap (B.plain . B.trimInlines .mconcat) (listStart >> many1Till inline itemEnd) + where + listStart = try $ skipSpaces >> optional newline >> skipSpaces + >> count n (char c) + >> lookAhead (noneOf [c]) >> skipSpaces + itemEnd = endOfParaElement <|> nextItem n + <|> if n < 3 then nextItem (n+1) + else nextItem (n+1) <|> nextItem (n-1) + nextItem x = lookAhead $ try $ blankline >> anyListItem x >> return mempty + +table :: PandocMonad m => CRLParser m B.Blocks +table = try $ do + headers <- optionMaybe headerRow + rows <- many1 row + return $ B.simpleTable (fromMaybe [mempty] headers) rows + where + headerRow = try $ skipSpaces >> many1Till headerCell rowEnd + headerCell = B.plain . B.trimInlines . mconcat + <$> (string "|=" >> many1Till inline cellEnd) + row = try $ skipSpaces >> many1Till cell rowEnd + cell = B.plain . B.trimInlines . mconcat + <$> (char '|' >> many1Till inline cellEnd) + rowEnd = try $ optional (char '|') >> skipSpaces >> newline + cellEnd = lookAhead $ try $ char '|' <|> rowEnd + +para :: PandocMonad m => CRLParser m B.Blocks +para = fmap (result . mconcat) (many1Till inline endOfParaElement) + where + result content = if F.all (==Space) content + then mempty + else B.para $ B.trimInlines content + +endOfParaElement :: PandocMonad m => CRLParser m () +endOfParaElement = lookAhead $ endOfInput <|> endOfPara + <|> startOfList <|> startOfTable + <|> startOfHeader <|> hr <|> startOfNowiki + where + endOfInput = try $ skipMany blankline >> skipSpaces >> eof + endOfPara = try $ blankline >> skipMany1 blankline + startOf :: PandocMonad m => CRLParser m a -> CRLParser m () + startOf p = try $ blankline >> p >> return mempty + startOfList = startOf $ anyListItem 1 + startOfTable = startOf table + startOfHeader = startOf header + startOfNowiki = startOf nowiki + hr = startOf horizontalRule + +horizontalRule :: PandocMonad m => CRLParser m B.Blocks +horizontalRule = try $ skipSpaces >> string "----" >> skipSpaces >> newline + >> return B.horizontalRule + +-- +-- inline parsers +-- + +inline :: PandocMonad m => CRLParser m B.Inlines +inline = choice [ whitespace + , escapedLink + , escapedChar + , link + , inlineNowiki + , placeholder + , image + , forcedLinebreak + , bold + , finalBold + , italics + , finalItalics + , str + , symbol + ] <?> "inline" + +escapedChar :: PandocMonad m => CRLParser m B.Inlines +escapedChar = + fmap (B.str . (:[])) (try $ char '~' >> noneOf "\t\n ") + +escapedLink :: PandocMonad m => CRLParser m B.Inlines +escapedLink = try $ do + char '~' + (orig, _) <- uri + return $ B.str orig + +image :: PandocMonad m => CRLParser m B.Inlines +image = try $ do + (orig, src) <- wikiImg + return $ B.image src "" (B.str orig) + where + linkSrc = many $ noneOf "|}\n\r\t" + linkDsc = char '|' >> many (noneOf "}\n\r\t") + wikiImg = try $ do + string "{{" + src <- linkSrc + dsc <- option "" linkDsc + string "}}" + return (dsc, src) + +link :: PandocMonad m => CRLParser m B.Inlines +link = try $ do + (orig, src) <- uriLink <|> wikiLink + return $ B.link src "" orig + where + linkSrc = many $ noneOf "|]\n\r\t" + linkDsc :: PandocMonad m => String -> CRLParser m B.Inlines + linkDsc otxt = B.str + <$> try (option otxt + (char '|' >> many (noneOf "]\n\r\t"))) + linkImg = try $ char '|' >> image + wikiLink = try $ do + string "[[" + src <- linkSrc + dsc <- linkImg <|> linkDsc src + string "]]" + return (dsc, src) + uriLink = try $ do + (orig, src) <- uri + return (B.str orig, src) + +inlineNowiki :: PandocMonad m => CRLParser m B.Inlines +inlineNowiki = B.code <$> (start >> manyTill (noneOf "\n\r") end) + where + start = try $ string "{{{" + end = try $ string "}}}" >> lookAhead (noneOf "}") + +placeholder :: PandocMonad m => CRLParser m B.Inlines +-- The semantics of the placeholder is basicallly implementation +-- dependent, so there is no way to DTRT for all cases. +-- So for now we just drop them. +placeholder = B.text <$> try (string "<<<" >> manyTill anyChar (string ">>>") + >> return "") + +whitespace :: PandocMonad m => CRLParser m B.Inlines +whitespace = lb <|> regsp + where lb = try $ skipMany spaceChar >> linebreak >> return B.space + regsp = try $ skipMany1 spaceChar >> return B.space + +linebreak :: PandocMonad m => CRLParser m B.Inlines +linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) + where lastNewline = eof >> return mempty + innerNewline = return B.space + +symbol :: PandocMonad m => CRLParser m B.Inlines +symbol = fmap (B.str . (:[])) (oneOf specialChars) + +str :: PandocMonad m => CRLParser m B.Inlines +str = let strChar = noneOf ("\t\n " ++ specialChars) in + fmap B.str (many1 strChar) + +bold :: PandocMonad m => CRLParser m B.Inlines +bold = B.strong . mconcat <$> + enclosed (string "**") (try $ string "**") inline + +italics :: PandocMonad m => CRLParser m B.Inlines +italics = B.emph . mconcat <$> + enclosed (string "//") (try $ string "//") inline + +finalBold :: PandocMonad m => CRLParser m B.Inlines +finalBold = B.strong . mconcat <$> + try (string "**" >> many1Till inline endOfParaElement) + +finalItalics :: PandocMonad m => CRLParser m B.Inlines +finalItalics = B.emph . mconcat <$> + try (string "//" >> many1Till inline endOfParaElement) + +forcedLinebreak :: PandocMonad m => CRLParser m B.Inlines +forcedLinebreak = try $ string "\\\\" >> return B.linebreak diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index 68552ccb3..728f77a05 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -1,22 +1,22 @@ +{-# LANGUAGE ExplicitForAll #-} module Text.Pandoc.Readers.DocBook ( readDocBook ) where -import Data.Char (toUpper) -import Text.Pandoc.Shared (safeRead) -import Text.Pandoc.Options -import Text.Pandoc.Definition -import Text.Pandoc.Builder -import Text.XML.Light -import Text.HTML.TagSoup.Entity (lookupEntity) +import Control.Monad.State.Strict +import Data.Char (isSpace, toUpper) +import Data.Default import Data.Either (rights) +import Data.Foldable (asum) import Data.Generics -import Data.Char (isSpace) -import Control.Monad.State import Data.List (intersperse) import Data.Maybe (fromMaybe) +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Builder +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Options +import Text.Pandoc.Shared (crFilter, safeRead) import Text.TeXMath (readMathML, writeTeX) -import Text.Pandoc.Error (PandocError) -import Control.Monad.Except -import Data.Default -import Data.Foldable (asum) +import Text.XML.Light {- @@ -50,7 +50,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] author - The name of an individual author [ ] authorblurb - A short description or note about an author [x] authorgroup - Wrapper for author information when a document has - multiple authors or collabarators + multiple authors or collaborators [x] authorinitials - The initials or other short identifier for an author [o] beginpage - The location of a page break in a print version of the document [ ] bibliocoverage - The spatial or temporal coverage of a document @@ -502,7 +502,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] ?asciidoc-br? - line break from asciidoc docbook output -} -type DB = ExceptT PandocError (State DBState) +type DB m = StateT DBState m data DBState = DBState{ dbSectionLevel :: Int , dbQuoteType :: QuoteType @@ -523,10 +523,12 @@ instance Default DBState where , dbContent = [] } -readDocBook :: ReaderOptions -> String -> Either PandocError Pandoc -readDocBook _ inp = (\blocks -> Pandoc (dbMeta st') (toList . mconcat $ blocks)) <$> bs - where (bs , st') = flip runState (def{ dbContent = tree }) . runExceptT . mapM parseBlock $ tree - tree = normalizeTree . parseXML . handleInstructions $ inp +readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readDocBook _ inp = do + let tree = normalizeTree . parseXML . handleInstructions + $ T.unpack $ crFilter inp + (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree + return $ Pandoc (dbMeta st') (toList . mconcat $ bs) -- We treat <?asciidoc-br?> specially (issue #1236), converting it -- to <br/>, since xml-light doesn't parse the instruction correctly. @@ -536,12 +538,12 @@ handleInstructions ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>': handleInstructions xs = case break (=='<') xs of (ys, []) -> ys ([], '<':zs) -> '<' : handleInstructions zs - (ys, zs) -> ys ++ handleInstructions zs + (ys, zs) -> ys ++ handleInstructions zs -getFigure :: Element -> DB Blocks +getFigure :: PandocMonad m => Element -> DB m Blocks getFigure e = do tit <- case filterChild (named "title") e of - Just t -> getInlines t + Just t -> getInlines t Nothing -> return mempty modify $ \st -> st{ dbFigureTitle = tit } res <- getBlocks e @@ -564,14 +566,12 @@ normalizeTree = everywhere (mkT go) go xs = xs convertEntity :: String -> String -convertEntity e = maybe (map toUpper e) id (lookupEntity e) +convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) -- convenience function to get an attribute value, defaulting to "" attrValue :: String -> Element -> String attrValue attr elt = - case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of - Just z -> z - Nothing -> "" + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- convenience function named :: String -> Element -> Bool @@ -579,20 +579,20 @@ named s e = qName (elName e) == s -- -acceptingMetadata :: DB a -> DB a +acceptingMetadata :: PandocMonad m => DB m a -> DB m a acceptingMetadata p = do modify (\s -> s { dbAcceptsMeta = True } ) res <- p modify (\s -> s { dbAcceptsMeta = False }) return res -checkInMeta :: Monoid a => DB () -> DB a +checkInMeta :: (PandocMonad m, Monoid a) => DB m () -> DB m a checkInMeta p = do accepts <- dbAcceptsMeta <$> get when accepts p return mempty -addMeta :: ToMetaValue a => String -> a -> DB () +addMeta :: PandocMonad m => ToMetaValue a => String -> a -> DB m () addMeta field val = modify (setMeta field val) instance HasMeta DBState where @@ -631,7 +631,7 @@ addToStart toadd bs = -- function that is used by both mediaobject (in parseBlock) -- and inlinemediaobject (in parseInline) -- A DocBook mediaobject is a wrapper around a set of alternative presentations -getMediaobject :: Element -> DB Inlines +getMediaobject :: PandocMonad m => Element -> DB m Inlines getMediaobject e = do (imageUrl, attr) <- case filterChild (named "imageobject") e of @@ -651,18 +651,20 @@ getMediaobject e = do || named "textobject" x || named "alt" x) el of Nothing -> return mempty - Just z -> mconcat <$> (mapM parseInline $ elContent z) + Just z -> mconcat <$> + mapM parseInline (elContent z) figTitle <- gets dbFigureTitle let (caption, title) = if isNull figTitle then (getCaption e, "") else (return figTitle, "fig:") - liftM (imageWith attr imageUrl title) caption + fmap (imageWith attr imageUrl title) caption -getBlocks :: Element -> DB Blocks -getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) +getBlocks :: PandocMonad m => Element -> DB m Blocks +getBlocks e = mconcat <$> + mapM parseBlock (elContent e) -parseBlock :: Content -> DB Blocks +parseBlock :: PandocMonad m => Content -> DB m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE parseBlock (Text (CData _ s _)) = if all isSpace s then return mempty @@ -795,15 +797,16 @@ parseBlock (Elem e) = return $ p <> b <> x codeBlockWithLang = do let classes' = case attrValue "language" e of - "" -> [] - x -> [x] + "" -> [] + x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty Just z -> (para . (str "— " <>) . mconcat) - <$> (mapM parseInline $ elContent z) + <$> + mapM parseInline (elContent z) contents <- getBlocks e return $ blockQuote (contents <> attrib) listitems = mapM getBlocks $ filterChildren (named "listitem") e @@ -868,11 +871,11 @@ parseBlock (Elem e) = || x == '.') w Nothing -> 0 :: Double let numrows = case bodyrows of - [] -> 0 - xs -> maximum $ map length xs + [] -> 0 + xs -> maximum $ map length xs let aligns = case colspecs of - [] -> replicate numrows AlignDefault - cs -> map toAlignment cs + [] -> replicate numrows AlignDefault + cs -> map toAlignment cs let widths = case colspecs of [] -> replicate numrows 0 cs -> let ws = map toWidth cs @@ -892,7 +895,7 @@ parseBlock (Elem e) = headerText <- case filterChild (named "title") e `mplus` (filterChild (named "info") e >>= filterChild (named "title")) of - Just t -> getInlines t + Just t -> getInlines t Nothing -> return mempty modify $ \st -> st{ dbSectionLevel = n } b <- getBlocks e @@ -902,8 +905,9 @@ parseBlock (Elem e) = lineItems = mapM getInlines $ filterChildren (named "line") e metaBlock = acceptingMetadata (getBlocks e) >> return mempty -getInlines :: Element -> DB Inlines -getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e') +getInlines :: PandocMonad m => Element -> DB m Inlines +getInlines e' = (trimInlines . mconcat) <$> + mapM parseInline (elContent e') strContentRecursive :: Element -> String strContentRecursive = strContent . @@ -913,10 +917,10 @@ elementToStr :: Content -> Content elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x -parseInline :: Content -> DB Inlines +parseInline :: PandocMonad m => Content -> DB m Inlines parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = - return $ maybe (text $ map toUpper ref) (text) $ lookupEntity ref + return $ maybe (text $ map toUpper ref) text $ lookupEntity ref parseInline (Elem e) = case qName (elName e) of "equation" -> equation displayMath @@ -957,8 +961,10 @@ parseInline (Elem e) = "userinput" -> codeWithLang "varargs" -> return $ code "(...)" "keycap" -> return (str $ strContent e) - "keycombo" -> keycombo <$> (mapM parseInline $ elContent e) - "menuchoice" -> menuchoice <$> (mapM parseInline $ + "keycombo" -> keycombo <$> + mapM parseInline (elContent e) + "menuchoice" -> menuchoice <$> + mapM parseInline ( filter isGuiMenu $ elContent e) "xref" -> do content <- dbContent <$> get @@ -977,17 +983,18 @@ parseInline (Elem e) = ils <- innerInlines let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of Just h -> h - _ -> ('#' : attrValue "linkend" e) + _ -> '#' : attrValue "linkend" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, words $ attrValue "role" e, []) return $ linkWith attr href "" ils' "foreignphrase" -> emph <$> innerInlines "emphasis" -> case attrValue "role" e of - "bold" -> strong <$> innerInlines - "strong" -> strong <$> innerInlines + "bold" -> strong <$> innerInlines + "strong" -> strong <$> innerInlines "strikethrough" -> strikeout <$> innerInlines - _ -> emph <$> innerInlines - "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e) + _ -> emph <$> innerInlines + "footnote" -> (note . mconcat) <$> + mapM parseBlock (elContent e) "title" -> return mempty "affiliation" -> return mempty -- Note: this isn't a real docbook tag; it's what we convert @@ -996,7 +1003,7 @@ parseInline (Elem e) = "br" -> return linebreak _ -> innerInlines where innerInlines = (trimInlines . mconcat) <$> - (mapM parseInline $ elContent e) + mapM parseInline (elContent e) equation constructor = return $ mconcat $ map (constructor . writeTeX) $ rights diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 595c805bf..e58b0a905 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -1,7 +1,8 @@ -{-# LANGUAGE PatternGuards, OverloadedStrings, CPP #-} - +{-# LANGUAGE CPP #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternGuards #-} {- -Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> +Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Docx - Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal License : GNU GPL, version 2 or above Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> @@ -50,12 +51,13 @@ implemented, [-] means partially implemented): * Inlines - [X] Str - - [X] Emph (italics and underline both read as Emph) + - [X] Emph - [X] Strong - [X] Strikeout - [X] Superscript - [X] Subscript - [X] SmallCaps + - [-] Underline (was previously converted to Emph) - [ ] Quoted - [ ] Cite - [X] Code (styled with `VerbatimChar`) @@ -64,88 +66,91 @@ implemented, [-] means partially implemented): - [X] Math - [X] Link (links to an arbitrary bookmark create a span with the target as id and "anchor" class) - - [X] Image + - [X] Image - [X] Note (Footnotes and Endnotes are silently combined.) -} module Text.Pandoc.Readers.Docx - ( readDocxWithWarnings - , readDocx + ( readDocx ) where import Codec.Archive.Zip -import Text.Pandoc.Definition -import Text.Pandoc.Options +import Control.Monad.Reader +import Control.Monad.State.Strict +import qualified Data.ByteString.Lazy as B +import Data.Default (Default) +import Data.List (delete, intersect) +import qualified Data.Map as M +import Data.Maybe (isJust, fromMaybe) +import Data.Sequence (ViewL (..), viewl) +import qualified Data.Sequence as Seq +import qualified Data.Set as Set import Text.Pandoc.Builder -import Text.Pandoc.Walk -import Text.Pandoc.Readers.Docx.Parse -import Text.Pandoc.Readers.Docx.Lists +-- import Text.Pandoc.Definition +import Text.Pandoc.MediaBag (MediaBag) +import Text.Pandoc.Options import Text.Pandoc.Readers.Docx.Combine +import Text.Pandoc.Readers.Docx.Lists +import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Shared -import Text.Pandoc.MediaBag (insertMedia, MediaBag) -import Data.List (delete, intersect) +import Text.Pandoc.Walk import Text.TeXMath (writeTeX) -import Data.Default (Default) -import qualified Data.ByteString.Lazy as B -import qualified Data.Map as M -import qualified Data.Set as Set -import Control.Monad.Reader -import Control.Monad.State -import Data.Sequence (ViewL(..), viewl) -import qualified Data.Sequence as Seq (null) #if !(MIN_VERSION_base(4,8,0)) import Data.Traversable (traverse) #endif - +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P import Text.Pandoc.Error -import Control.Monad.Except +import Text.Pandoc.Logging -readDocxWithWarnings :: ReaderOptions - -> B.ByteString - -> Either PandocError (Pandoc, MediaBag, [String]) -readDocxWithWarnings opts bytes +readDocx :: PandocMonad m + => ReaderOptions + -> B.ByteString + -> m Pandoc +readDocx opts bytes | Right archive <- toArchiveOrFail bytes , Right (docx, parserWarnings) <- archiveToDocxWithWarnings archive = do - (meta, blks, mediaBag, warnings) <- docxToOutput opts docx - return (Pandoc meta blks, mediaBag, parserWarnings ++ warnings) -readDocxWithWarnings _ _ = - Left (ParseFailure "couldn't parse docx file") - -readDocx :: ReaderOptions - -> B.ByteString - -> Either PandocError (Pandoc, MediaBag) -readDocx opts bytes = do - (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes - return (pandoc, mediaBag) + mapM_ (P.report . DocxParserWarning) parserWarnings + (meta, blks) <- docxToOutput opts docx + return $ Pandoc meta blks +readDocx _ _ = + throwError $ PandocSomeError "couldn't parse docx file" data DState = DState { docxAnchorMap :: M.Map String String - , docxMediaBag :: MediaBag - , docxDropCap :: Inlines - , docxWarnings :: [String] + , docxAnchorSet :: Set.Set String + , docxImmedPrevAnchor :: Maybe String + , docxMediaBag :: MediaBag + , docxDropCap :: Inlines + , docxWarnings :: [String] + -- keep track of (numId, lvl) values for + -- restarting + , docxListState :: M.Map (String, String) Integer + , docxPrevPara :: Inlines } instance Default DState where def = DState { docxAnchorMap = M.empty + , docxAnchorSet = mempty + , docxImmedPrevAnchor = Nothing , docxMediaBag = mempty , docxDropCap = mempty , docxWarnings = [] + , docxListState = M.empty + , docxPrevPara = mempty } -data DEnv = DEnv { docxOptions :: ReaderOptions - , docxInHeaderBlock :: Bool } +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxInHeaderBlock :: Bool + } instance Default DEnv where def = DEnv def False -type DocxContext = ExceptT PandocError (ReaderT DEnv (State DState)) - -evalDocxContext :: DocxContext a -> DEnv -> DState -> Either PandocError a -evalDocxContext ctx env st = flip evalState st . flip runReaderT env . runExceptT $ ctx +type DocxContext m = ReaderT DEnv (StateT DState m) -addDocxWarning :: String -> DocxContext () -addDocxWarning msg = do - warnings <- gets docxWarnings - modify $ \s -> s {docxWarnings = msg : warnings} +evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a +evalDocxContext ctx env st = flip evalStateT st $flip runReaderT env ctx -- This is empty, but we put it in for future-proofing. spansToKeep :: [String] @@ -162,7 +167,7 @@ metaStyles = M.fromList [ ("Title", "title") , ("Abstract", "abstract")] sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) -sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) +sepBodyParts = span (\bp -> isMetaPar bp || isEmptyPar bp) isMetaPar :: BodyPart -> Bool isMetaPar (Paragraph pPr _) = @@ -174,28 +179,28 @@ isEmptyPar (Paragraph _ parParts) = all isEmptyParPart parParts where isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems - isEmptyParPart _ = False + isEmptyParPart _ = False isEmptyElem (TextRun s) = trim s == "" isEmptyElem _ = True isEmptyPar _ = False -bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue) +bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String MetaValue) bodyPartsToMeta' [] = return M.empty bodyPartsToMeta' (bp : bps) | (Paragraph pPr parParts) <- bp - , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) + , (c : _)<- (pStyle pPr) `intersect` (M.keys metaStyles) , (Just metaField) <- M.lookup c metaStyles = do inlines <- smushInlines <$> mapM parPartToInlines parParts remaining <- bodyPartsToMeta' bps let f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] - f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) + f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks (Para ils : blks) f m (MetaList mv) = MetaList (m : mv) f m n = MetaList [m, n] return $ M.insertWith f metaField (MetaInlines (toList inlines)) remaining bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps -bodyPartsToMeta :: [BodyPart] -> DocxContext Meta +bodyPartsToMeta :: PandocMonad m => [BodyPart] -> DocxContext m Meta bodyPartsToMeta bps = do mp <- bodyPartsToMeta' bps let mp' = @@ -208,7 +213,7 @@ fixAuthors :: MetaValue -> MetaValue fixAuthors (MetaBlocks blks) = MetaList $ map g $ filter f blks where f (Para _) = True - f _ = False + f _ = False g (Para ils) = MetaInlines ils g _ = MetaInlines [] fixAuthors mv = mv @@ -220,106 +225,122 @@ codeDivs :: [String] codeDivs = ["SourceCode"] runElemToInlines :: RunElem -> Inlines -runElemToInlines (TextRun s) = text s -runElemToInlines (LnBrk) = linebreak -runElemToInlines (Tab) = space -runElemToInlines (SoftHyphen) = text "\xad" -runElemToInlines (NoBreakHyphen) = text "\x2011" +runElemToInlines (TextRun s) = text s +runElemToInlines LnBrk = linebreak +runElemToInlines Tab = space +runElemToInlines SoftHyphen = text "\xad" +runElemToInlines NoBreakHyphen = text "\x2011" runElemToString :: RunElem -> String -runElemToString (TextRun s) = s -runElemToString (LnBrk) = ['\n'] -runElemToString (Tab) = ['\t'] -runElemToString (SoftHyphen) = ['\xad'] -runElemToString (NoBreakHyphen) = ['\x2011'] +runElemToString (TextRun s) = s +runElemToString LnBrk = ['\n'] +runElemToString Tab = ['\t'] +runElemToString SoftHyphen = ['\xad'] +runElemToString NoBreakHyphen = ['\x2011'] runToString :: Run -> String runToString (Run _ runElems) = concatMap runElemToString runElems -runToString _ = "" +runToString _ = "" parPartToString :: ParPart -> String -parPartToString (PlainRun run) = runToString run +parPartToString (PlainRun run) = runToString run parPartToString (InternalHyperLink _ runs) = concatMap runToString runs parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs -parPartToString _ = "" +parPartToString _ = "" blacklistedCharStyles :: [String] blacklistedCharStyles = ["Hyperlink"] -resolveDependentRunStyle :: RunStyle -> RunStyle +resolveDependentRunStyle :: PandocMonad m => RunStyle -> DocxContext m RunStyle resolveDependentRunStyle rPr | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles = - rPr - | Just (_, cs) <- rStyle rPr = - let rPr' = resolveDependentRunStyle cs - in - RunStyle { isBold = case isBold rPr of - Just bool -> Just bool - Nothing -> isBold rPr' - , isItalic = case isItalic rPr of - Just bool -> Just bool - Nothing -> isItalic rPr' - , isSmallCaps = case isSmallCaps rPr of - Just bool -> Just bool - Nothing -> isSmallCaps rPr' - , isStrike = case isStrike rPr of - Just bool -> Just bool - Nothing -> isStrike rPr' - , rVertAlign = case rVertAlign rPr of - Just valign -> Just valign - Nothing -> rVertAlign rPr' - , rUnderline = case rUnderline rPr of - Just ulstyle -> Just ulstyle - Nothing -> rUnderline rPr' - , rStyle = rStyle rPr } - | otherwise = rPr - -runStyleToTransform :: RunStyle -> (Inlines -> Inlines) + return rPr + | Just (_, cs) <- rStyle rPr = do + opts <- asks docxOptions + if isEnabled Ext_styles opts + then return rPr + else do rPr' <- resolveDependentRunStyle cs + return $ + RunStyle { isBold = case isBold rPr of + Just bool -> Just bool + Nothing -> isBold rPr' + , isItalic = case isItalic rPr of + Just bool -> Just bool + Nothing -> isItalic rPr' + , isSmallCaps = case isSmallCaps rPr of + Just bool -> Just bool + Nothing -> isSmallCaps rPr' + , isStrike = case isStrike rPr of + Just bool -> Just bool + Nothing -> isStrike rPr' + , rVertAlign = case rVertAlign rPr of + Just valign -> Just valign + Nothing -> rVertAlign rPr' + , rUnderline = case rUnderline rPr of + Just ulstyle -> Just ulstyle + Nothing -> rUnderline rPr' + , rStyle = rStyle rPr } + | otherwise = return rPr + +runStyleToTransform :: PandocMonad m => RunStyle -> DocxContext m (Inlines -> Inlines) runStyleToTransform rPr | Just (s, _) <- rStyle rPr - , s `elem` spansToKeep = - let rPr' = rPr{rStyle = Nothing} - in - (spanWith ("", [s], [])) . (runStyleToTransform rPr') - | Just True <- isItalic rPr = - emph . (runStyleToTransform rPr {isItalic = Nothing}) - | Just True <- isBold rPr = - strong . (runStyleToTransform rPr {isBold = Nothing}) - | Just True <- isSmallCaps rPr = - smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing}) - | Just True <- isStrike rPr = - strikeout . (runStyleToTransform rPr {isStrike = Nothing}) - | Just SupScrpt <- rVertAlign rPr = - superscript . (runStyleToTransform rPr {rVertAlign = Nothing}) - | Just SubScrpt <- rVertAlign rPr = - subscript . (runStyleToTransform rPr {rVertAlign = Nothing}) - | Just "single" <- rUnderline rPr = - emph . (runStyleToTransform rPr {rUnderline = Nothing}) - | otherwise = id - -runToInlines :: Run -> DocxContext Inlines + , s `elem` spansToKeep = do + transform <- runStyleToTransform rPr{rStyle = Nothing} + return $ spanWith ("", [s], []) . transform + | Just (s, _) <- rStyle rPr = do + opts <- asks docxOptions + let extraInfo = if isEnabled Ext_styles opts + then spanWith ("", [], [("custom-style", s)]) + else id + transform <- runStyleToTransform rPr{rStyle = Nothing} + return $ extraInfo . transform + | Just True <- isItalic rPr = do + transform <- runStyleToTransform rPr{isItalic = Nothing} + return $ emph . transform + | Just True <- isBold rPr = do + transform <- runStyleToTransform rPr{isBold = Nothing} + return $ strong . transform + | Just True <- isSmallCaps rPr = do + transform <- runStyleToTransform rPr{isSmallCaps = Nothing} + return $ smallcaps . transform + | Just True <- isStrike rPr = do + transform <- runStyleToTransform rPr{isStrike = Nothing} + return $ strikeout . transform + | Just SupScrpt <- rVertAlign rPr = do + transform <- runStyleToTransform rPr{rVertAlign = Nothing} + return $ superscript . transform + | Just SubScrpt <- rVertAlign rPr = do + transform <- runStyleToTransform rPr{rVertAlign = Nothing} + return $ subscript . transform + | Just "single" <- rUnderline rPr = do + transform <- runStyleToTransform rPr{rUnderline = Nothing} + return $ underlineSpan . transform + | otherwise = return id + +runToInlines :: PandocMonad m => Run -> DocxContext m Inlines runToInlines (Run rs runElems) | Just (s, _) <- rStyle rs - , s `elem` codeStyles = - let rPr = resolveDependentRunStyle rs - codeString = code $ concatMap runElemToString runElems - in - return $ case rVertAlign rPr of - Just SupScrpt -> superscript codeString - Just SubScrpt -> subscript codeString - _ -> codeString + , s `elem` codeStyles = do + rPr <- resolveDependentRunStyle rs + let codeString = code $ concatMap runElemToString runElems + return $ case rVertAlign rPr of + Just SupScrpt -> superscript codeString + Just SubScrpt -> subscript codeString + _ -> codeString | otherwise = do - let ils = smushInlines (map runElemToInlines runElems) - return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils + rPr <- resolveDependentRunStyle rs + let ils = smushInlines (map runElemToInlines runElems) + transform <- runStyleToTransform rPr + return $ transform ils runToInlines (Footnote bps) = do - blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> mapM bodyPartToBlocks bps return $ note blksList runToInlines (Endnote bps) = do - blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> mapM bodyPartToBlocks bps return $ note blksList runToInlines (InlineDrawing fp title alt bs ext) = do - mediaBag <- gets docxMediaBag - modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + (lift . lift) $ P.insertMedia fp Nothing bs return $ imageWith (extentToAttr ext) fp title $ text alt runToInlines InlineChart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" @@ -330,20 +351,39 @@ extentToAttr (Just (w, h)) = showDim d = show (d / 914400) ++ "in" extentToAttr _ = nullAttr -blocksToInlinesWarn :: String -> Blocks -> DocxContext Inlines +blocksToInlinesWarn :: PandocMonad m => String -> Blocks -> DocxContext m Inlines blocksToInlinesWarn cmtId blks = do let blkList = toList blks notParaOrPlain :: Block -> Bool - notParaOrPlain (Para _) = False + notParaOrPlain (Para _) = False notParaOrPlain (Plain _) = False - notParaOrPlain _ = True - when (not $ null $ filter notParaOrPlain blkList) - (addDocxWarning $ "Docx comment " ++ cmtId ++ " will not retain formatting") - return $ fromList $ blocksToInlines blkList - -parPartToInlines :: ParPart -> DocxContext Inlines -parPartToInlines (PlainRun r) = runToInlines r -parPartToInlines (Insertion _ author date runs) = do + notParaOrPlain _ = True + unless ( not (any notParaOrPlain blkList)) $ + lift $ P.report $ DocxParserWarning $ + "Docx comment " ++ cmtId ++ " will not retain formatting" + return $ blocksToInlines' blkList + +-- The majority of work in this function is done in the primed +-- subfunction `partPartToInlines'`. We make this wrapper so that we +-- don't have to modify `docxImmedPrevAnchor` state after every function. +parPartToInlines :: PandocMonad m => ParPart -> DocxContext m Inlines +parPartToInlines parPart = + case parPart of + (BookMark _ anchor) | notElem anchor dummyAnchors -> do + inHdrBool <- asks docxInHeaderBlock + ils <- parPartToInlines' parPart + immedPrevAnchor <- gets docxImmedPrevAnchor + unless (isJust immedPrevAnchor || inHdrBool) + (modify $ \s -> s{ docxImmedPrevAnchor = Just anchor}) + return ils + _ -> do + ils <- parPartToInlines' parPart + modify $ \s -> s{ docxImmedPrevAnchor = Nothing} + return ils + +parPartToInlines' :: PandocMonad m => ParPart -> DocxContext m Inlines +parPartToInlines' (PlainRun r) = runToInlines r +parPartToInlines' (ChangedRuns (TrackedChange Insertion (ChangeInfo _ author date)) runs) = do opts <- asks docxOptions case readerTrackChanges opts of AcceptChanges -> smushInlines <$> mapM runToInlines runs @@ -352,7 +392,7 @@ parPartToInlines (Insertion _ author date runs) = do ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["insertion"], [("author", author), ("date", date)]) return $ spanWith attr ils -parPartToInlines (Deletion _ author date runs) = do +parPartToInlines' (ChangedRuns (TrackedChange Deletion (ChangeInfo _ author date)) runs) = do opts <- asks docxOptions case readerTrackChanges opts of AcceptChanges -> return mempty @@ -361,7 +401,7 @@ parPartToInlines (Deletion _ author date runs) = do ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["deletion"], [("author", author), ("date", date)]) return $ spanWith attr ils -parPartToInlines (CommentStart cmtId author date bodyParts) = do +parPartToInlines' (CommentStart cmtId author date bodyParts) = do opts <- asks docxOptions case readerTrackChanges opts of AllChanges -> do @@ -370,16 +410,16 @@ parPartToInlines (CommentStart cmtId author date bodyParts) = do let attr = ("", ["comment-start"], [("id", cmtId), ("author", author), ("date", date)]) return $ spanWith attr ils _ -> return mempty -parPartToInlines (CommentEnd cmtId) = do +parPartToInlines' (CommentEnd cmtId) = do opts <- asks docxOptions case readerTrackChanges opts of AllChanges -> do let attr = ("", ["comment-end"], [("id", cmtId)]) return $ spanWith attr mempty _ -> return mempty -parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = +parPartToInlines' (BookMark _ anchor) | anchor `elem` dummyAnchors = return mempty -parPartToInlines (BookMark _ anchor) = +parPartToInlines' (BookMark _ anchor) = -- We record these, so we can make sure not to overwrite -- user-defined anchor links with header auto ids. do @@ -395,27 +435,40 @@ parPartToInlines (BookMark _ anchor) = -- of rewriting user-defined anchor links. However, since these -- are not defined in pandoc, it seems like a necessary evil to -- avoid an extra pass. - let newAnchor = - if not inHdrBool && anchor `elem` (M.elems anchorMap) - then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) - else anchor - unless inHdrBool - (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) - return $ spanWith (newAnchor, ["anchor"], []) mempty -parPartToInlines (Drawing fp title alt bs ext) = do - mediaBag <- gets docxMediaBag - modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + immedPrevAnchor <- gets docxImmedPrevAnchor + case immedPrevAnchor of + Just prevAnchor -> do + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor prevAnchor anchorMap}) + return mempty + Nothing -> do + let newAnchor = + if not inHdrBool && anchor `elem` M.elems anchorMap + then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) + else anchor + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) + return $ spanWith (newAnchor, ["anchor"], []) mempty +parPartToInlines' (Drawing fp title alt bs ext) = do + (lift . lift) $ P.insertMedia fp Nothing bs return $ imageWith (extentToAttr ext) fp title $ text alt -parPartToInlines Chart = do +parPartToInlines' Chart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" -parPartToInlines (InternalHyperLink anchor runs) = do +parPartToInlines' (InternalHyperLink anchor runs) = do ils <- smushInlines <$> mapM runToInlines runs return $ link ('#' : anchor) "" ils -parPartToInlines (ExternalHyperLink target runs) = do +parPartToInlines' (ExternalHyperLink target runs) = do ils <- smushInlines <$> mapM runToInlines runs return $ link target "" ils -parPartToInlines (PlainOMath exps) = do +parPartToInlines' (PlainOMath exps) = return $ math $ writeTeX exps +parPartToInlines' (SmartTag runs) = + smushInlines <$> mapM runToInlines runs +parPartToInlines' (Field info runs) = + case info of + HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs + UnknownField -> smushInlines <$> mapM runToInlines runs +parPartToInlines' NullParPart = return mempty isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (_, classes, kvs) _) = @@ -426,10 +479,10 @@ isAnchorSpan _ = False dummyAnchors :: [String] dummyAnchors = ["_GoBack"] -makeHeaderAnchor :: Blocks -> DocxContext Blocks +makeHeaderAnchor :: PandocMonad m => Blocks -> DocxContext m Blocks makeHeaderAnchor bs = traverse makeHeaderAnchor' bs -makeHeaderAnchor' :: Block -> DocxContext Block +makeHeaderAnchor' :: PandocMonad m => Block -> DocxContext m Block -- If there is an anchor already there (an anchor span in the header, -- to be exact), we rename and associate the new id with the old one. makeHeaderAnchor' (Header n (ident, classes, kvs) ils) @@ -458,126 +511,184 @@ makeHeaderAnchor' blk = return blk -- Rewrite a standalone paragraph block as a plain singleParaToPlain :: Blocks -> Blocks singleParaToPlain blks - | (Para (ils) :< seeq) <- viewl $ unMany blks + | (Para ils :< seeq) <- viewl $ unMany blks , Seq.null seeq = singleton $ Plain ils singleParaToPlain blks = blks -cellToBlocks :: Cell -> DocxContext Blocks +cellToBlocks :: PandocMonad m => Cell -> DocxContext m Blocks cellToBlocks (Cell bps) = do blks <- smushBlocks <$> mapM bodyPartToBlocks bps return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks -rowToBlocksList :: Row -> DocxContext [Blocks] +rowToBlocksList :: PandocMonad m => Row -> DocxContext m [Blocks] rowToBlocksList (Row cells) = do blksList <- mapM cellToBlocks cells return $ map singleParaToPlain blksList -trimLineBreaks :: [Inline] -> [Inline] -trimLineBreaks [] = [] -trimLineBreaks (LineBreak : ils) = trimLineBreaks ils -trimLineBreaks ils - | (LineBreak : ils') <- reverse ils = trimLineBreaks (reverse ils') -trimLineBreaks ils = ils +-- like trimInlines, but also take out linebreaks +trimSps :: Inlines -> Inlines +trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils + where isSp Space = True + isSp SoftBreak = True + isSp LineBreak = True + isSp _ = False -parStyleToTransform :: ParagraphStyle -> (Blocks -> Blocks) +parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) parStyleToTransform pPr | (c:cs) <- pStyle pPr - , c `elem` divsToKeep = - let pPr' = pPr { pStyle = cs } - in - (divWith ("", [c], [])) . (parStyleToTransform pPr') + , c `elem` divsToKeep = do + let pPr' = pPr { pStyle = cs } + transform <- parStyleToTransform pPr' + return $ divWith ("", [c], []) . transform | (c:cs) <- pStyle pPr, - c `elem` listParagraphDivs = + c `elem` listParagraphDivs = do let pPr' = pPr { pStyle = cs, indentation = Nothing} - in - (divWith ("", [c], [])) . (parStyleToTransform pPr') - | (_:cs) <- pStyle pPr - , Just True <- pBlockQuote pPr = - let pPr' = pPr { pStyle = cs } - in - blockQuote . (parStyleToTransform pPr') - | (_:cs) <- pStyle pPr = + transform <- parStyleToTransform pPr' + return $ divWith ("", [c], []) . transform + | (c:cs) <- pStyle pPr + , Just True <- pBlockQuote pPr = do + opts <- asks docxOptions + let pPr' = pPr { pStyle = cs } + transform <- parStyleToTransform pPr' + let extraInfo = if isEnabled Ext_styles opts + then divWith ("", [], [("custom-style", c)]) + else id + return $ extraInfo . blockQuote . transform + | (c:cs) <- pStyle pPr = do + opts <- asks docxOptions let pPr' = pPr { pStyle = cs} - in - parStyleToTransform pPr' + transform <- parStyleToTransform pPr' + let extraInfo = if isEnabled Ext_styles opts + then divWith ("", [], [("custom-style", c)]) + else id + return $ extraInfo . transform | null (pStyle pPr) , Just left <- indentation pPr >>= leftParIndent - , Just hang <- indentation pPr >>= hangingParIndent = + , Just hang <- indentation pPr >>= hangingParIndent = do let pPr' = pPr { indentation = Nothing } - in - case (left - hang) > 0 of - True -> blockQuote . (parStyleToTransform pPr') - False -> parStyleToTransform pPr' + transform <- parStyleToTransform pPr' + return $ case (left - hang) > 0 of + True -> blockQuote . transform + False -> transform | null (pStyle pPr), - Just left <- indentation pPr >>= leftParIndent = + Just left <- indentation pPr >>= leftParIndent = do let pPr' = pPr { indentation = Nothing } - in - case left > 0 of - True -> blockQuote . (parStyleToTransform pPr') - False -> parStyleToTransform pPr' -parStyleToTransform _ = id + transform <- parStyleToTransform pPr' + return $ case left > 0 of + True -> blockQuote . transform + False -> transform +parStyleToTransform _ = return id -bodyPartToBlocks :: BodyPart -> DocxContext Blocks +bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) - | not $ null $ codeDivs `intersect` (pStyle pPr) = - return - $ parStyleToTransform pPr - $ codeBlock - $ concatMap parPartToString parparts + | not $ null $ codeDivs `intersect` (pStyle pPr) = do + transform <- parStyleToTransform pPr + return $ + transform $ + codeBlock $ + concatMap parPartToString parparts | Just (style, n) <- pHeading pPr = do - ils <- local (\s-> s{docxInHeaderBlock=True}) $ + ils <-local (\s-> s{docxInHeaderBlock=True}) (smushInlines <$> mapM parPartToInlines parparts) makeHeaderAnchor $ headerWith ("", delete style (pStyle pPr), []) n ils | otherwise = do - ils <- smushInlines <$> mapM parPartToInlines parparts >>= - (return . fromList . trimLineBreaks . normalizeSpaces . toList) + ils <- (trimSps . smushInlines) <$> mapM parPartToInlines parparts + prevParaIls <- gets docxPrevPara dropIls <- gets docxDropCap let ils' = dropIls <> ils if dropCap pPr then do modify $ \s -> s { docxDropCap = ils' } return mempty else do modify $ \s -> s { docxDropCap = mempty } - return $ case isNull ils' of - True -> mempty - _ -> parStyleToTransform pPr $ para ils' + let ils'' = prevParaIls <> + (if isNull prevParaIls then mempty else space) <> + ils' + opts <- asks docxOptions + case () of + + _ | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) -> + return mempty + _ | Just (TrackedChange Insertion _) <- pChange pPr + , AcceptChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = mempty} + transform <- parStyleToTransform pPr + return $ transform $ para ils'' + _ | Just (TrackedChange Insertion _) <- pChange pPr + , RejectChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = ils''} + return mempty + _ | Just (TrackedChange Insertion cInfo) <- pChange pPr + , AllChanges <- readerTrackChanges opts + , ChangeInfo _ cAuthor cDate <- cInfo -> do + let attr = ("", ["paragraph-insertion"], [("author", cAuthor), ("date", cDate)]) + insertMark = spanWith attr mempty + transform <- parStyleToTransform pPr + return $ transform $ + para $ ils'' <> insertMark + _ | Just (TrackedChange Deletion _) <- pChange pPr + , AcceptChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = ils''} + return mempty + _ | Just (TrackedChange Deletion _) <- pChange pPr + , RejectChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = mempty} + transform <- parStyleToTransform pPr + return $ transform $ para ils'' + _ | Just (TrackedChange Deletion cInfo) <- pChange pPr + , AllChanges <- readerTrackChanges opts + , ChangeInfo _ cAuthor cDate <- cInfo -> do + let attr = ("", ["paragraph-deletion"], [("author", cAuthor), ("date", cDate)]) + insertMark = spanWith attr mempty + transform <- parStyleToTransform pPr + return $ transform $ + para $ ils'' <> insertMark + _ | otherwise -> do + modify $ \s -> s {docxPrevPara = mempty} + transform <- parStyleToTransform pPr + return $ transform $ para ils'' bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do - let - kvs = case levelInfo of - (_, fmt, txt, Just start) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - , ("start", (show start)) - ] - - (_, fmt, txt, Nothing) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - ] + -- We check whether this current numId has previously been used, + -- since Docx expects us to pick up where we left off. + listState <- gets docxListState + let startFromState = M.lookup (numId, lvl) listState + (_, fmt,txt, startFromLevelInfo) = levelInfo + start = case startFromState of + Just n -> n + 1 + Nothing -> fromMaybe 1 startFromLevelInfo + kvs = [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", show start) + ] + modify $ \st -> st{ docxListState = M.insert (numId, lvl) start listState} blks <- bodyPartToBlocks (Paragraph pPr parparts) return $ divWith ("", ["list-item"], kvs) blks -bodyPartToBlocks (ListItem pPr _ _ _ parparts) = - let pPr' = pPr {pStyle = "ListParagraph": (pStyle pPr)} +bodyPartToBlocks (ListItem pPr _ _ _ parparts) = + let pPr' = pPr {pStyle = "ListParagraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap _ look (r:rs)) = do +bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do let caption = text cap (hdr, rows) = case firstRowFormatting look of True | null rs -> (Nothing, [r]) | otherwise -> (Just r, rs) False -> (Nothing, r:rs) - cells <- mapM rowToBlocksList rows + cells <- mapM rowToBlocksList rows - let width = case cells of - r':_ -> length r' - -- shouldn't happen - [] -> 0 + let width = maybe 0 maximum $ nonEmpty $ map rowLength parts + -- Data.List.NonEmpty is not available with ghc 7.10 so we roll out + -- our own, see + -- https://github.com/jgm/pandoc/pull/4361#issuecomment-365416155 + nonEmpty [] = Nothing + nonEmpty l = Just l + rowLength :: Row -> Int + rowLength (Row c) = length c hdrCells <- case hdr of Just r' -> rowToBlocksList r' @@ -592,36 +703,54 @@ bodyPartToBlocks (Tbl cap _ look (r:rs)) = do widths = replicate width 0 :: [Double] return $ table caption (zip alignments widths) hdrCells cells -bodyPartToBlocks (OMathPara e) = do +bodyPartToBlocks (OMathPara e) = return $ para $ displayMath (writeTeX e) -- replace targets with generated anchors. -rewriteLink' :: Inline -> DocxContext Inline +rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline rewriteLink' l@(Link attr ils ('#':target, title)) = do anchorMap <- gets docxAnchorMap - return $ case M.lookup target anchorMap of - Just newTarget -> (Link attr ils ('#':newTarget, title)) - Nothing -> l + case M.lookup target anchorMap of + Just newTarget -> do + modify $ \s -> s{docxAnchorSet = Set.insert newTarget (docxAnchorSet s)} + return $ Link attr ils ('#':newTarget, title) + Nothing -> do + modify $ \s -> s{docxAnchorSet = Set.insert target (docxAnchorSet s)} + return l rewriteLink' il = return il -rewriteLinks :: [Block] -> DocxContext [Block] +rewriteLinks :: PandocMonad m => [Block] -> DocxContext m [Block] rewriteLinks = mapM (walkM rewriteLink') -bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag, [String]) +removeOrphanAnchors'' :: PandocMonad m => Inline -> DocxContext m [Inline] +removeOrphanAnchors'' s@(Span (ident, classes, _) ils) + | "anchor" `elem` classes = do + anchorSet <- gets docxAnchorSet + return $ if ident `Set.member` anchorSet + then [s] + else ils +removeOrphanAnchors'' il = return [il] + +removeOrphanAnchors' :: PandocMonad m => [Inline] -> DocxContext m [Inline] +removeOrphanAnchors' ils = liftM concat $ mapM removeOrphanAnchors'' ils + +removeOrphanAnchors :: PandocMonad m => [Block] -> DocxContext m [Block] +removeOrphanAnchors = mapM (walkM removeOrphanAnchors') + +bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks - mediaBag <- gets docxMediaBag - warnings <- gets docxWarnings - return $ (meta, - blks', - mediaBag, - warnings) - -docxToOutput :: ReaderOptions -> Docx -> Either PandocError (Meta, [Block], MediaBag, [String]) + blks'' <- removeOrphanAnchors blks' + return (meta, blks'') + +docxToOutput :: PandocMonad m + => ReaderOptions + -> Docx + -> m (Meta, [Block]) docxToOutput opts (Docx (Document _ body)) = let dEnv = def { docxOptions = opts} in evalDocxContext (bodyToOutput body) dEnv def diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs index 39e0df825..003265e6e 100644 --- a/src/Text/Pandoc/Readers/Docx/Combine.hs +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -1,15 +1,16 @@ -{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, - PatternGuards #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE TypeSynonymInstances #-} module Text.Pandoc.Readers.Docx.Combine ( smushInlines , smushBlocks ) where -import Text.Pandoc.Builder import Data.List -import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr, (><), (|>)) +import Data.Sequence (ViewL (..), ViewR (..), viewl, viewr, (><), (|>)) import qualified Data.Sequence as Seq (null) +import Text.Pandoc.Builder data Modifier a = Modifier (a -> a) | AttrModifier (Attr -> a -> a) Attr @@ -35,16 +36,16 @@ spaceOutInlines ils = right = case viewr contents of (_ :> Space) -> space _ -> mempty in - (left, (stackInlines fs $ trimInlines . Many $ contents), right) + (left, stackInlines fs $ trimInlines . Many $ contents, right) stackInlines :: [Modifier Inlines] -> Inlines -> Inlines stackInlines [] ms = ms stackInlines (NullModifier : fs) ms = stackInlines fs ms -stackInlines ((Modifier f) : fs) ms = +stackInlines (Modifier f : fs) ms = if isEmpty ms then stackInlines fs ms else f $ stackInlines fs ms -stackInlines ((AttrModifier f attr) : fs) ms = f attr $ stackInlines fs ms +stackInlines (AttrModifier f attr : fs) ms = f attr $ stackInlines fs ms unstackInlines :: Inlines -> ([Modifier Inlines], Inlines) unstackInlines ms = case ilModifier ms of @@ -56,15 +57,15 @@ unstackInlines ms = case ilModifier ms of ilModifier :: Inlines -> Modifier Inlines ilModifier ils = case viewl (unMany ils) of (x :< xs) | Seq.null xs -> case x of - (Emph _) -> Modifier emph - (Strong _) -> Modifier strong - (SmallCaps _) -> Modifier smallcaps - (Strikeout _) -> Modifier strikeout - (Superscript _) -> Modifier superscript - (Subscript _) -> Modifier subscript + (Emph _) -> Modifier emph + (Strong _) -> Modifier strong + (SmallCaps _) -> Modifier smallcaps + (Strikeout _) -> Modifier strikeout + (Superscript _) -> Modifier superscript + (Subscript _) -> Modifier subscript (Link attr _ tgt) -> Modifier $ linkWith attr (fst tgt) (snd tgt) - (Span attr _) -> AttrModifier spanWith attr - _ -> NullModifier + (Span attr _) -> AttrModifier spanWith attr + _ -> NullModifier _ -> NullModifier ilInnards :: Inlines -> Inlines @@ -78,25 +79,25 @@ ilInnards ils = case viewl (unMany ils) of (Subscript lst) -> fromList lst (Link _ lst _) -> fromList lst (Span _ lst) -> fromList lst - _ -> ils + _ -> ils _ -> ils inlinesL :: Inlines -> (Inlines, Inlines) inlinesL ils = case viewl $ unMany ils of (s :< sq) -> (singleton s, Many sq) - _ -> (mempty, ils) + _ -> (mempty, ils) inlinesR :: Inlines -> (Inlines, Inlines) inlinesR ils = case viewr $ unMany ils of (sq :> s) -> (Many sq, singleton s) - _ -> (ils, mempty) + _ -> (ils, mempty) combineInlines :: Inlines -> Inlines -> Inlines combineInlines x y = let (xs', x') = inlinesR x (y', ys') = inlinesL y in - xs' <> (combineSingletonInlines x' y') <> ys' + xs' <> combineSingletonInlines x' y' <> ys' combineSingletonInlines :: Inlines -> Inlines -> Inlines combineSingletonInlines x y = @@ -113,10 +114,10 @@ combineSingletonInlines x y = stackInlines (x_rem_attr ++ y_rem_attr) mempty | isEmpty xs -> let (sp, y') = spaceOutInlinesL y in - (stackInlines x_rem_attr mempty) <> sp <> y' + stackInlines x_rem_attr mempty <> sp <> y' | isEmpty ys -> let (x', sp) = spaceOutInlinesR x in - x' <> sp <> (stackInlines y_rem_attr mempty) + x' <> sp <> stackInlines y_rem_attr mempty | otherwise -> let (x', xsp) = spaceOutInlinesR x (ysp, y') = spaceOutInlinesL y @@ -129,15 +130,15 @@ combineSingletonInlines x y = combineBlocks :: Blocks -> Blocks -> Blocks combineBlocks bs cs - | bs' :> (BlockQuote bs'') <- viewr (unMany bs) - , (BlockQuote cs'') :< cs' <- viewl (unMany cs) = - Many $ (bs' |> (BlockQuote (bs'' <> cs''))) >< cs' + | bs' :> BlockQuote bs'' <- viewr (unMany bs) + , BlockQuote cs'' :< cs' <- viewl (unMany cs) = + Many $ (bs' |> BlockQuote (bs'' <> cs'')) >< cs' combineBlocks bs cs = bs <> cs instance (Monoid a, Eq a) => Eq (Modifier a) where - (Modifier f) == (Modifier g) = (f mempty == g mempty) - (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty) - (NullModifier) == (NullModifier) = True + (Modifier f) == (Modifier g) = f mempty == g mempty + (AttrModifier f attr) == (AttrModifier g attr') = f attr mempty == g attr' mempty + NullModifier == NullModifier = True _ == _ = False isEmpty :: (Monoid a, Eq a) => a -> Bool diff --git a/src/Text/Pandoc/Readers/Docx/Fields.hs b/src/Text/Pandoc/Readers/Docx/Fields.hs new file mode 100644 index 000000000..6eeb55d2f --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fields.hs @@ -0,0 +1,89 @@ +{- +Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fields + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +For parsing Field definitions in instText tags, as described in +ECMA-376-1:2016, §17.16.5 -} + +module Text.Pandoc.Readers.Docx.Fields ( FieldInfo(..) + , parseFieldInfo + ) where + +import Text.Parsec +import Text.Parsec.String (Parser) + +type URL = String + +data FieldInfo = HyperlinkField URL + | UnknownField + deriving (Show) + +parseFieldInfo :: String -> Either ParseError FieldInfo +parseFieldInfo = parse fieldInfo "" + +fieldInfo :: Parser FieldInfo +fieldInfo = + try (HyperlinkField <$> hyperlink) + <|> + return UnknownField + +escapedQuote :: Parser String +escapedQuote = string "\\\"" + +inQuotes :: Parser String +inQuotes = + (try escapedQuote) <|> (anyChar >>= (\c -> return [c])) + +quotedString :: Parser String +quotedString = do + char '"' + concat <$> manyTill inQuotes (try (char '"')) + +unquotedString :: Parser String +unquotedString = manyTill anyChar (try $ lookAhead space *> return () <|> eof) + +fieldArgument :: Parser String +fieldArgument = quotedString <|> unquotedString + +-- there are other switches, but this is the only one I've seen in the wild so far, so it's the first one I'll implement. See §17.16.5.25 +hyperlinkSwitch :: Parser (String, String) +hyperlinkSwitch = do + sw <- string "\\l" + spaces + farg <- fieldArgument + return (sw, farg) + +hyperlink :: Parser URL +hyperlink = do + many space + string "HYPERLINK" + spaces + farg <- fieldArgument + switches <- spaces *> many hyperlinkSwitch + let url = case switches of + ("\\l", s) : _ -> farg ++ ('#': s) + _ -> farg + return url diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs index 395a53907..c0f05094a 100644 --- a/src/Text/Pandoc/Readers/Docx/Lists.hs +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> +Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Docx.Lists - Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal License : GNU GPL, version 2 or above Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> @@ -33,38 +33,33 @@ module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets , listParagraphDivs ) where -import Text.Pandoc.JSON -import Text.Pandoc.Generic (bottomUp) -import Text.Pandoc.Shared (trim) -import Control.Monad import Data.List import Data.Maybe +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.JSON +import Text.Pandoc.Shared (trim) isListItem :: Block -> Bool isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True -isListItem _ = False +isListItem _ = False getLevel :: Block -> Maybe Integer -getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs -getLevel _ = Nothing +getLevel (Div (_, _, kvs) _) = read <$> lookup "level" kvs +getLevel _ = Nothing getLevelN :: Block -> Integer -getLevelN b = case getLevel b of - Just n -> n - Nothing -> -1 +getLevelN b = fromMaybe (-1) (getLevel b) getNumId :: Block -> Maybe Integer -getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs -getNumId _ = Nothing +getNumId (Div (_, _, kvs) _) = read <$> lookup "num-id" kvs +getNumId _ = Nothing getNumIdN :: Block -> Integer -getNumIdN b = case getNumId b of - Just n -> n - Nothing -> -1 +getNumIdN b = fromMaybe (-1) (getNumId b) getText :: Block -> Maybe String getText (Div (_, _, kvs) _) = lookup "text" kvs -getText _ = Nothing +getText _ = Nothing data ListType = Itemized | Enumerated ListAttributes @@ -109,27 +104,27 @@ listParagraphDivs = ["ListParagraph"] handleListParagraphs :: [Block] -> [Block] handleListParagraphs [] = [] handleListParagraphs ( - (Div attr1@(_, classes1, _) blks1) : - (Div (ident2, classes2, kvs2) blks2) : + Div attr1@(_, classes1, _) blks1 : + Div (ident2, classes2, kvs2) blks2 : blks ) | "list-item" `elem` classes1 && - not ("list-item" `elem` classes2) && + notElem "list-item" classes2 && (not . null) (listParagraphDivs `intersect` classes2) = -- We don't want to keep this indent. let newDiv2 = - (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) + Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2 in - handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) -handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) + handleListParagraphs (Div attr1 (blks1 ++ [newDiv2]) : blks) +handleListParagraphs (blk:blks) = blk : handleListParagraphs blks separateBlocks' :: Block -> [[Block]] -> [[Block]] -separateBlocks' blk ([] : []) = [[blk]] -separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] -separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] +separateBlocks' blk [[]] = [[blk]] +separateBlocks' b@(BulletList _) acc = init acc ++ [last acc ++ [b]] +separateBlocks' b@(OrderedList _ _) acc = init acc ++ [last acc ++ [b]] -- The following is for the invisible bullet lists. This is how -- pandoc-generated ooxml does multiparagraph item lists. -separateBlocks' b acc | liftM trim (getText b) == Just "" = - (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b acc | fmap trim (getText b) == Just "" = + init acc ++ [last acc ++ [b]] separateBlocks' b acc = acc ++ [[b]] separateBlocks :: [Block] -> [[Block]] @@ -138,63 +133,60 @@ separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) flatToBullets' :: Integer -> [Block] -> [Block] flatToBullets' _ [] = [] flatToBullets' num xs@(b : elems) - | getLevelN b == num = b : (flatToBullets' num elems) + | getLevelN b == num = b : flatToBullets' num elems | otherwise = let bNumId = getNumIdN b bLevel = getLevelN b (children, remaining) = span (\b' -> - ((getLevelN b') > bLevel || - ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) + getLevelN b' > bLevel || + (getLevelN b' == bLevel && getNumIdN b' == bNumId)) xs in case getListType b of Just (Enumerated attr) -> - (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : - (flatToBullets' num remaining) + OrderedList attr (separateBlocks $ flatToBullets' bLevel children) : + flatToBullets' num remaining _ -> - (BulletList (separateBlocks $ flatToBullets' bLevel children)) : - (flatToBullets' num remaining) + BulletList (separateBlocks $ flatToBullets' bLevel children) : + flatToBullets' num remaining flatToBullets :: [Block] -> [Block] flatToBullets elems = flatToBullets' (-1) elems singleItemHeaderToHeader :: Block -> Block -singleItemHeaderToHeader (OrderedList _ [[h@(Header _ _ _)]]) = h -singleItemHeaderToHeader blk = blk +singleItemHeaderToHeader (OrderedList _ [[h@Header{}]]) = h +singleItemHeaderToHeader blk = blk blocksToBullets :: [Block] -> [Block] blocksToBullets blks = map singleItemHeaderToHeader $ - bottomUp removeListDivs $ - flatToBullets $ (handleListParagraphs blks) + bottomUp removeListDivs $flatToBullets (handleListParagraphs blks) plainParaInlines :: Block -> [Inline] plainParaInlines (Plain ils) = ils -plainParaInlines (Para ils) = ils -plainParaInlines _ = [] +plainParaInlines (Para ils) = ils +plainParaInlines _ = [] blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] blocksToDefinitions' [] acc [] = reverse acc blocksToDefinitions' defAcc acc [] = - reverse $ (DefinitionList (reverse defAcc)) : acc + reverse $ DefinitionList (reverse defAcc) : acc blocksToDefinitions' defAcc acc - ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) + (Div (_, classes1, _) blks1 : Div (ident2, classes2, kvs2) blks2 : blks) | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) - pair = case remainingAttr2 == ("", [], []) of - True -> (concatMap plainParaInlines blks1, [blks2]) - False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) + pair = if remainingAttr2 == ("", [], []) then (concatMap plainParaInlines blks1, [blks2]) else (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) in blocksToDefinitions' (pair : defAcc) acc blks blocksToDefinitions' defAcc acc - ((Div (ident2, classes2, kvs2) blks2) : blks) + (Div (ident2, classes2, kvs2) blks2 : blks) | (not . null) defAcc && "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) defItems2 = case remainingAttr2 == ("", [], []) of - True -> blks2 + True -> blks2 False -> [Div remainingAttr2 blks2] ((defTerm, defItems):defs) = defAcc defAcc' = case null defItems of @@ -205,18 +197,18 @@ blocksToDefinitions' defAcc acc blocksToDefinitions' [] acc (b:blks) = blocksToDefinitions' [] (b:acc) blks blocksToDefinitions' defAcc acc (b:blks) = - blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks + blocksToDefinitions' [] (b : DefinitionList (reverse defAcc) : acc) blks removeListDivs' :: Block -> [Block] removeListDivs' (Div (ident, classes, kvs) blks) | "list-item" `elem` classes = case delete "list-item" classes of - [] -> blks - classes' -> [Div (ident, classes', kvs) $ blks] + [] -> blks + classes' -> [Div (ident, classes', kvs) blks] removeListDivs' (Div (ident, classes, kvs) blks) | not $ null $ listParagraphDivs `intersect` classes = case classes \\ listParagraphDivs of - [] -> blks + [] -> blks classes' -> [Div (ident, classes', kvs) blks] removeListDivs' blk = [blk] diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index deb2caccf..1f7f07e36 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,7 +1,9 @@ -{-# LANGUAGE PatternGuards, ViewPatterns, FlexibleInstances #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE ViewPatterns #-} {- -Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> +Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +22,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Docx.Parse - Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal License : GNU GPL, version 2 or above Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> @@ -49,28 +51,34 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ParagraphStyle(..) , Row(..) , Cell(..) + , TrackedChange(..) + , ChangeType(..) + , ChangeInfo(..) + , FieldInfo(..) , archiveToDocx , archiveToDocxWithWarnings ) where import Codec.Archive.Zip -import Text.XML.Light -import Data.Maybe -import Data.List -import System.FilePath +import Control.Applicative ((<|>)) +import Control.Monad.Except +import Control.Monad.Reader +import Control.Monad.State.Strict import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B -import qualified Text.Pandoc.UTF8 as UTF8 -import Control.Monad.Reader -import Control.Monad.State -import Control.Applicative ((<|>)) +import Data.Char (chr, ord, readLitChar) +import Data.List import qualified Data.Map as M -import Control.Monad.Except -import Text.Pandoc.Shared (safeRead, filteredFilesFromArchive) -import Text.TeXMath.Readers.OMML (readOMML) -import Text.TeXMath.Unicode.Fonts (getUnicode, stringToFont, Font(..)) -import Text.TeXMath (Exp) +import Data.Maybe +import System.FilePath import Text.Pandoc.Readers.Docx.Util -import Data.Char (readLitChar, ord, chr, isDigit) +import Text.Pandoc.Readers.Docx.Fields +import Text.Pandoc.Shared (filteredFilesFromArchive, safeRead) +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.TeXMath (Exp) +import Text.TeXMath.Readers.OMML (readOMML) +import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, stringToFont) +import Text.XML.Light +import qualified Text.XML.Light.Cursor as XMLC data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -84,10 +92,19 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes } deriving Show -data ReaderState = ReaderState { stateWarnings :: [String] } +data ReaderState = ReaderState { stateWarnings :: [String] + , stateFldCharState :: FldCharState + } deriving Show -data DocxError = DocxError | WrongElem +data FldCharState = FldCharOpen + | FldCharFieldInfo FieldInfo + | FldCharContent FieldInfo [Run] + | FldCharClosed + deriving (Show) + +data DocxError = DocxError + | WrongElem deriving Show type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState)) @@ -97,7 +114,7 @@ runD dx re rs = runState (runReaderT (runExceptT dx) re) rs maybeToD :: Maybe a -> D a maybeToD (Just a) = return a -maybeToD Nothing = throwError DocxError +maybeToD Nothing = throwError DocxError eitherToD :: Either a b -> D b eitherToD (Right b) = return b @@ -115,6 +132,36 @@ mapD f xs = in concatMapM handler xs +unwrapSDT :: NameSpaces -> Content -> [Content] +unwrapSDT ns (Elem element) + | isElem ns "w" "sdt" element + , Just sdtContent <- findChildByName ns "w" "sdtContent" element + = concatMap (unwrapSDT ns) $ map Elem $ elChildren sdtContent +unwrapSDT _ content = [content] + +unwrapSDTchild :: NameSpaces -> Content -> Content +unwrapSDTchild ns (Elem element) = + Elem $ element { elContent = concatMap (unwrapSDT ns) (elContent element) } +unwrapSDTchild _ content = content + +walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor +walkDocument' ns cur = + let modifiedCur = XMLC.modifyContent (unwrapSDTchild ns) cur + in + case XMLC.nextDF modifiedCur of + Just cur' -> walkDocument' ns cur' + Nothing -> XMLC.root modifiedCur + +walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument ns element = + let cur = XMLC.fromContent (Elem element) + cur' = walkDocument' ns cur + in + case XMLC.toTree cur' of + Elem element' -> Just element' + _ -> Nothing + + data Docx = Docx Document deriving Show @@ -160,17 +207,27 @@ data Notes = Notes NameSpaces data Comments = Comments NameSpaces (M.Map String Element) deriving Show -data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer - , rightParIndent :: Maybe Integer +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer , hangingParIndent :: Maybe Integer} deriving Show -data ParagraphStyle = ParagraphStyle { pStyle :: [String] +data ChangeType = Insertion | Deletion + deriving Show + +data ChangeInfo = ChangeInfo ChangeId Author ChangeDate + deriving Show + +data TrackedChange = TrackedChange ChangeType ChangeInfo + deriving Show + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] , indentation :: Maybe ParIndentation , dropCap :: Bool , pHeading :: Maybe (String, Int) , pNumInfo :: Maybe (String, String) , pBlockQuote :: Maybe Bool + , pChange :: Maybe TrackedChange } deriving Show @@ -181,6 +238,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] , pHeading = Nothing , pNumInfo = Nothing , pBlockQuote = Nothing + , pChange = Nothing } @@ -208,8 +266,7 @@ data Cell = Cell [BodyPart] type Extent = Maybe (Double, Double) data ParPart = PlainRun Run - | Insertion ChangeId Author ChangeDate [Run] - | Deletion ChangeId Author ChangeDate [Run] + | ChangedRuns TrackedChange [Run] | CommentStart CommentId Author CommentDate [BodyPart] | CommentEnd CommentId | BookMark BookMarkId Anchor @@ -218,6 +275,10 @@ data ParPart = PlainRun Run | Drawing FilePath String String B.ByteString Extent -- title, alt | Chart -- placeholder for now | PlainOMath [Exp] + | SmartTag [Run] + | Field FieldInfo [Run] + | NullParPart -- when we need to return nothing, but + -- not because of an error. deriving Show data Run = Run RunStyle [RunElem] @@ -233,19 +294,19 @@ data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen data VertAlign = BaseLn | SupScrpt | SubScrpt deriving Show -data RunStyle = RunStyle { isBold :: Maybe Bool - , isItalic :: Maybe Bool +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool , isSmallCaps :: Maybe Bool - , isStrike :: Maybe Bool - , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String - , rStyle :: Maybe CharStyle} + , isStrike :: Maybe Bool + , rVertAlign :: Maybe VertAlign + , rUnderline :: Maybe String + , rStyle :: Maybe CharStyle} deriving Show -data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) +data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) , isBlockQuote :: Maybe Bool - , numInfo :: Maybe (String, String) - , psStyle :: Maybe ParStyle} + , numInfo :: Maybe (String, String) + , psStyle :: Maybe ParStyle} deriving Show defaultRunStyle :: RunStyle @@ -281,7 +342,9 @@ archiveToDocxWithWarnings archive = do (styles, parstyles) = archiveToStyles archive rEnv = ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument - rState = ReaderState { stateWarnings = [] } + rState = ReaderState { stateWarnings = [] + , stateFldCharState = FldCharClosed + } (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState case eitherDoc of Right doc -> Right (Docx doc, stateWarnings st) @@ -294,14 +357,14 @@ archiveToDocument zf = do entry <- maybeToD $ findEntryByPath "word/document.xml" zf docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry let namespaces = elemToNameSpaces docElem - bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem - body <- elemToBody namespaces bodyElem + bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem + let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) + body <- elemToBody namespaces bodyElem' return $ Document namespaces body elemToBody :: NameSpaces -> Element -> D Body elemToBody ns element | isElem ns "w" "body" element = - mapD (elemToBodyPart ns) (elChildren element) >>= - (\bps -> return $ Body bps) + fmap Body (mapD (elemToBodyPart ns) (elChildren element)) elemToBody _ _ = throwError WrongElem archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) @@ -322,15 +385,15 @@ archiveToStyles zf = isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle | isElem ns "w" "style" element - , Just styleType <- findAttr (elemName ns "w" "type") element + , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle - , Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>= - findAttr (elemName ns "w" "val") - , Just ps <- parentStyle = (basedOnVal == getStyleId ps) + , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= + findAttrByName ns "w" "val" + , Just ps <- parentStyle = basedOnVal == getStyleId ps | isElem ns "w" "style" element - , Just styleType <- findAttr (elemName ns "w" "type") element + , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle - , Nothing <- findChild (elemName ns "w" "basedOn") element + , Nothing <- findChildByName ns "w" "basedOn" element , Nothing <- parentStyle = True | otherwise = False @@ -343,8 +406,8 @@ instance ElemToStyle CharStyle where cStyleType _ = "character" elemToStyle ns element parentStyle | isElem ns "w" "style" element - , Just "character" <- findAttr (elemName ns "w" "type") element - , Just styleId <- findAttr (elemName ns "w" "styleId") element = + , Just "character" <- findAttrByName ns "w" "type" element + , Just styleId <- findAttrByName ns "w" "styleId" element = Just (styleId, elemToRunStyle ns element parentStyle) | otherwise = Nothing getStyleId s = fst s @@ -353,8 +416,8 @@ instance ElemToStyle ParStyle where cStyleType _ = "paragraph" elemToStyle ns element parentStyle | isElem ns "w" "style" element - , Just "paragraph" <- findAttr (elemName ns "w" "type") element - , Just styleId <- findAttr (elemName ns "w" "styleId") element = + , Just "paragraph" <- findAttrByName ns "w" "type" element + , Just styleId <- findAttrByName ns "w" "styleId" element = Just (styleId, elemToParStyleData ns element parentStyle) | otherwise = Nothing getStyleId s = fst s @@ -368,10 +431,10 @@ getStyleChildren ns element parentStyle buildBasedOnList :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] buildBasedOnList ns element rootStyle = - case (getStyleChildren ns element rootStyle) of + case getStyleChildren ns element rootStyle of [] -> [] stys -> stys ++ - (concatMap (\s -> buildBasedOnList ns element (Just s)) stys) + concatMap (buildBasedOnList ns element . Just) stys archiveToNotes :: Archive -> Notes archiveToNotes zf = @@ -380,14 +443,14 @@ archiveToNotes zf = enElem = findEntryByPath "word/endnotes.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) fn_namespaces = case fnElem of - Just e -> elemToNameSpaces e + Just e -> elemToNameSpaces e Nothing -> [] en_namespaces = case enElem of - Just e -> elemToNameSpaces e + Just e -> elemToNameSpaces e Nothing -> [] ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= (elemToNotes ns "footnote") - en = enElem >>= (elemToNotes ns "endnote") + fn = fnElem >>= elemToNotes ns "footnote" + en = enElem >>= elemToNotes ns "endnote" in Notes ns fn en @@ -396,19 +459,19 @@ archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) cmts_namespaces = case cmtsElem of - Just e -> elemToNameSpaces e + Just e -> elemToNameSpaces e Nothing -> [] - cmts = (elemToComments cmts_namespaces) <$> cmtsElem + cmts = elemToComments cmts_namespaces <$> cmtsElem in case cmts of - Just c -> Comments cmts_namespaces c + Just c -> Comments cmts_namespaces c Nothing -> Comments cmts_namespaces M.empty filePathToRelType :: FilePath -> Maybe DocumentLocation -filePathToRelType "word/_rels/document.xml.rels" = Just InDocument +filePathToRelType "word/_rels/document.xml.rels" = Just InDocument filePathToRelType "word/_rels/footnotes.xml.rels" = Just InFootnote -filePathToRelType "word/_rels/endnotes.xml.rels" = Just InEndnote -filePathToRelType _ = Nothing +filePathToRelType "word/_rels/endnotes.xml.rels" = Just InEndnote +filePathToRelType _ = Nothing relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship relElemToRelationship relType element | qName (elName element) == "Relationship" = @@ -439,24 +502,23 @@ lookupLevel :: String -> String -> Numbering -> Maybe Level lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs - lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls - return lvl + lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | isElem ns "w" "num" element = do - numId <- findAttr (elemName ns "w" "numId") element - absNumId <- findChild (elemName ns "w" "abstractNumId") element - >>= findAttr (elemName ns "w" "val") + numId <- findAttrByName ns "w" "numId" element + absNumId <- findChildByName ns "w" "abstractNumId" element + >>= findAttrByName ns "w" "val" return $ Numb numId absNumId numElemToNum _ _ = Nothing absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb absNumElemToAbsNum ns element | isElem ns "w" "abstractNum" element = do - absNumId <- findAttr (elemName ns "w" "abstractNumId") element - let levelElems = findChildren (elemName ns "w" "lvl") element + absNumId <- findAttrByName ns "w" "abstractNumId" element + let levelElems = findChildrenByName ns "w" "lvl" element levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels absNumElemToAbsNum _ _ = Nothing @@ -464,26 +526,26 @@ absNumElemToAbsNum _ _ = Nothing levelElemToLevel :: NameSpaces -> Element -> Maybe Level levelElemToLevel ns element | isElem ns "w" "lvl" element = do - ilvl <- findAttr (elemName ns "w" "ilvl") element - fmt <- findChild (elemName ns "w" "numFmt") element - >>= findAttr (elemName ns "w" "val") - txt <- findChild (elemName ns "w" "lvlText") element - >>= findAttr (elemName ns "w" "val") - let start = findChild (elemName ns "w" "start") element - >>= findAttr (elemName ns "w" "val") + ilvl <- findAttrByName ns "w" "ilvl" element + fmt <- findChildByName ns "w" "numFmt" element + >>= findAttrByName ns "w" "val" + txt <- findChildByName ns "w" "lvlText" element + >>= findAttrByName ns "w" "val" + let start = findChildByName ns "w" "start" element + >>= findAttrByName ns "w" "val" >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) return (ilvl, fmt, txt, start) levelElemToLevel _ _ = Nothing archiveToNumbering' :: Archive -> Maybe Numbering -archiveToNumbering' zf = do +archiveToNumbering' zf = case findEntryByPath "word/numbering.xml" zf of Nothing -> Just $ Numbering [] [] [] Just entry -> do numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry let namespaces = elemToNameSpaces numberingElem - numElems = findChildren (elemName namespaces "w" "num") numberingElem - absNumElems = findChildren (elemName namespaces "w" "abstractNum") numberingElem + numElems = findChildrenByName namespaces "w" "num" numberingElem + absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem nums = mapMaybe (numElemToNum namespaces) numElems absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems return $ Numbering namespaces nums absNums @@ -496,22 +558,23 @@ elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element) elemToNotes ns notetype element | isElem ns "w" (notetype ++ "s") element = let pairs = mapMaybe - (\e -> findAttr (elemName ns "w" "id") e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) - (findChildren (elemName ns "w" notetype) element) + (findChildrenByName ns "w" notetype element) in - Just $ M.fromList $ pairs + Just $ + M.fromList pairs elemToNotes _ _ _ = Nothing elemToComments :: NameSpaces -> Element -> M.Map String Element elemToComments ns element | isElem ns "w" "comments" element = let pairs = mapMaybe - (\e -> findAttr (elemName ns "w" "id") e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) - (findChildren (elemName ns "w" "comment") element) + (findChildrenByName ns "w" "comment" element) in - M.fromList $ pairs + M.fromList pairs elemToComments _ _ = M.empty @@ -520,16 +583,16 @@ elemToComments _ _ = M.empty elemToTblGrid :: NameSpaces -> Element -> D TblGrid elemToTblGrid ns element | isElem ns "w" "tblGrid" element = - let cols = findChildren (elemName ns "w" "gridCol") element + let cols = findChildrenByName ns "w" "gridCol" element in - mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger)) + mapD (\e -> maybeToD (findAttrByName ns "w" "val" e >>= stringToInteger)) cols elemToTblGrid _ _ = throwError WrongElem elemToTblLook :: NameSpaces -> Element -> D TblLook elemToTblLook ns element | isElem ns "w" "tblLook" element = - let firstRow = findAttr (elemName ns "w" "firstRow") element - val = findAttr (elemName ns "w" "val") element + let firstRow = findAttrByName ns "w" "firstRow" element + val = findAttrByName ns "w" "val" element firstRowFmt = case firstRow of Just "1" -> True @@ -538,13 +601,13 @@ elemToTblLook ns element | isElem ns "w" "tblLook" element = Just bitMask -> testBitMask bitMask 0x020 Nothing -> False in - return $ TblLook{firstRowFormatting = firstRowFmt} + return TblLook{firstRowFormatting = firstRowFmt} elemToTblLook _ _ = throwError WrongElem elemToRow :: NameSpaces -> Element -> D Row elemToRow ns element | isElem ns "w" "tr" element = do - let cellElems = findChildren (elemName ns "w" "tc") element + let cellElems = findChildrenByName ns "w" "tc" element cells <- mapD (elemToCell ns) cellElems return $ Row cells elemToRow _ _ = throwError WrongElem @@ -558,15 +621,15 @@ elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation elemToParIndentation ns element | isElem ns "w" "ind" element = - Just $ ParIndentation { + Just ParIndentation { leftParIndent = - findAttr (elemName ns "w" "left") element >>= + findAttrByName ns "w" "left" element >>= stringToInteger , rightParIndent = - findAttr (elemName ns "w" "right") element >>= + findAttrByName ns "w" "right" element >>= stringToInteger , hangingParIndent = - findAttr (elemName ns "w" "hanging") element >>= + findAttrByName ns "w" "hanging" element >>= stringToInteger} elemToParIndentation _ _ = Nothing @@ -574,7 +637,7 @@ testBitMask :: String -> Int -> Bool testBitMask bitMaskS n = case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of [] -> False - ((n', _) : _) -> ((n' .|. n) /= 0) + ((n', _) : _) -> (n' .|. n) /= 0 stringToInteger :: String -> Maybe Integer stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) @@ -582,7 +645,7 @@ stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element - , (c:_) <- findChildren (elemName ns "m" "oMathPara") element = + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do expsLst <- eitherToD $ readOMML $ showElement c return $ OMathPara expsLst @@ -610,17 +673,17 @@ elemToBodyPart ns element _ -> return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do - let caption' = findChild (elemName ns "w" "tblPr") element - >>= findChild (elemName ns "w" "tblCaption") - >>= findAttr (elemName ns "w" "val") + let caption' = findChildByName ns "w" "tblPr" element + >>= findChildByName ns "w" "tblCaption" + >>= findAttrByName ns "w" "val" caption = (fromMaybe "" caption') - grid' = case findChild (elemName ns "w" "tblGrid") element of - Just g -> elemToTblGrid ns g + grid' = case findChildByName ns "w" "tblGrid" element of + Just g -> elemToTblGrid ns g Nothing -> return [] - tblLook' = case findChild (elemName ns "w" "tblPr") element >>= - findChild (elemName ns "w" "tblLook") + tblLook' = case findChildByName ns "w" "tblPr" element >>= + findChildByName ns "w" "tblLook" of - Just l -> elemToTblLook ns l + Just l -> elemToTblLook ns l Nothing -> return defaultTblLook grid <- grid' @@ -649,26 +712,22 @@ expandDrawingId s = do getTitleAndAlt :: NameSpaces -> Element -> (String, String) getTitleAndAlt ns element = - let mbDocPr = findChild (elemName ns "wp" "inline") element >>= - findChild (elemName ns "wp" "docPr") - title = case mbDocPr >>= findAttr (elemName ns "" "title") of - Just title' -> title' - Nothing -> "" - alt = case mbDocPr >>= findAttr (elemName ns "" "descr") of - Just alt' -> alt' - Nothing -> "" + let mbDocPr = findChildByName ns "wp" "inline" element >>= + findChildByName ns "wp" "docPr" + title = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "title") + alt = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "descr") in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart elemToParPart ns element | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element + , Just drawingElem <- findChildByName ns "w" "drawing" element , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem = let (title, alt) = getTitleAndAlt ns drawingElem a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttr (elemName ns "r" "embed") + >>= findAttrByName ns "r" "embed" in case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) @@ -676,9 +735,9 @@ elemToParPart ns element -- The below is an attempt to deal with images in deprecated vml format. elemToParPart ns element | isElem ns "w" "r" element - , Just _ <- findChild (elemName ns "w" "pict") element = + , Just _ <- findChildByName ns "w" "pict" element = let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttr (elemName ns "r" "id") + >>= findAttrByName ns "r" "id" in case drawing of -- Todo: check out title and attr for deprecated format. @@ -687,86 +746,148 @@ elemToParPart ns element -- Chart elemToParPart ns element | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element + , Just drawingElem <- findChildByName ns "w" "drawing" element , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem - = return Chart + = return Chart +{- +The next one is a bit complicated. fldChar fields work by first +having a <w:fldChar fldCharType="begin"> in a run, then a run with +<w:instrText>, then a <w:fldChar fldCharType="separate"> run, then the +content runs, and finally a <w:fldChar fldCharType="end"> run. For +example (omissions and my comments in brackets): + + <w:r> + [...] + <w:fldChar w:fldCharType="begin"/> + </w:r> + <w:r> + [...] + <w:instrText xml:space="preserve"> HYPERLINK [hyperlink url] </w:instrText> + </w:r> + <w:r> + [...] + <w:fldChar w:fldCharType="separate"/> + </w:r> + <w:r w:rsidRPr=[...]> + [...] + <w:t>Foundations of Analysis, 2nd Edition</w:t> + </w:r> + <w:r> + [...] + <w:fldChar w:fldCharType="end"/> + </w:r> + +So we do this in a number of steps. If we encounter the fldchar begin +tag, we start open a fldchar state variable (see state above). We add +the instrtext to it as FieldInfo. Then we close that and start adding +the runs when we get to separate. Then when we get to end, we produce +the Field type with approriate FieldInfo and Runs. +-} elemToParPart ns element - | isElem ns "w" "r" element = - elemToRun ns element >>= (\r -> return $ PlainRun r) + | isElem ns "w" "r" element + , Just fldChar <- findChildByName ns "w" "fldChar" element + , Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharClosed | fldCharType == "begin" -> do + modify $ \st -> st {stateFldCharState = FldCharOpen} + return NullParPart + FldCharFieldInfo info | fldCharType == "separate" -> do + modify $ \st -> st {stateFldCharState = FldCharContent info []} + return NullParPart + FldCharContent info runs | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = FldCharClosed} + return $ Field info $ reverse runs + _ -> throwError WrongElem elemToParPart ns element - | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element - , Just cId <- findAttr (elemName ns "w" "id") element - , Just cAuthor <- findAttr (elemName ns "w" "author") element - , Just cDate <- findAttr (elemName ns "w" "date") element = do - runs <- mapD (elemToRun ns) (elChildren element) - return $ Insertion cId cAuthor cDate runs + | isElem ns "w" "r" element + , Just instrText <- findChildByName ns "w" "instrText" element = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharOpen -> do + info <- eitherToD $ parseFieldInfo $ strContent instrText + modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} + return NullParPart + _ -> return NullParPart elemToParPart ns element - | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element - , Just cId <- findAttr (elemName ns "w" "id") element - , Just cAuthor <- findAttr (elemName ns "w" "author") element - , Just cDate <- findAttr (elemName ns "w" "date") element = do + | isElem ns "w" "r" element = do + run <- elemToRun ns element + -- we check to see if we have an open FldChar in state that we're + -- recording. + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharContent info runs -> do + modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)} + return NullParPart + _ -> return $ PlainRun run +elemToParPart ns element + | Just change <- getTrackedChange ns element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ ChangedRuns change runs +elemToParPart ns element + | isElem ns "w" "smartTag" element = do runs <- mapD (elemToRun ns) (elChildren element) - return $ Deletion cId cAuthor cDate runs + return $ SmartTag runs elemToParPart ns element | isElem ns "w" "bookmarkStart" element - , Just bmId <- findAttr (elemName ns "w" "id") element - , Just bmName <- findAttr (elemName ns "w" "name") element = + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttr (elemName ns "r" "id") element = do + , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of - Just target -> do - case findAttr (elemName ns "w" "anchor") element of + Just target -> + case findAttrByName ns "w" "anchor" element of Just anchor -> return $ ExternalHyperLink (target ++ '#':anchor) runs Nothing -> return $ ExternalHyperLink target runs Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttr (elemName ns "w" "anchor") element = do + , Just anchor <- findAttrByName ns "w" "anchor" element = do runs <- mapD (elemToRun ns) (elChildren element) return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "w" "commentRangeStart" element - , Just cmtId <- findAttr (elemName ns "w" "id") element = do + , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem - Nothing -> throwError WrongElem + Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "commentRangeEnd" element - , Just cmtId <- findAttr (elemName ns "w" "id") element = + , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId elemToParPart ns element | isElem ns "m" "oMath" element = - (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath) + fmap PlainOMath (eitherToD $ readOMML $ showElement element) elemToParPart _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element | isElem ns "w" "comment" element - , Just cmtId <- findAttr (elemName ns "w" "id") element - , Just cmtAuthor <- findAttr (elemName ns "w" "author") element - , Just cmtDate <- findAttr (elemName ns "w" "date") element = do + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , Just cmtDate <- findAttrByName ns "w" "date" element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps elemToCommentStart _ _ = throwError WrongElem lookupFootnote :: String -> Notes -> Maybe Element -lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) +lookupFootnote s (Notes _ fns _) = fns >>= M.lookup s lookupEndnote :: String -> Notes -> Maybe Element -lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s) +lookupEndnote s (Notes _ _ ens) = ens >>= M.lookup s elemToExtent :: Element -> Extent elemToExtent drawingElem = case (getDim "cx", getDim "cy") of (Just w, Just h) -> Just (w, h) - _ -> Nothing + _ -> Nothing where wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem @@ -794,7 +915,7 @@ childElemToRun ns element = return InlineChart childElemToRun ns element | isElem ns "w" "footnoteReference" element - , Just fnId <- findAttr (elemName ns "w" "id") element = do + , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -802,7 +923,7 @@ childElemToRun ns element Nothing -> return $ Footnote [] childElemToRun ns element | isElem ns "w" "endnoteReference" element - , Just enId <- findAttr (elemName ns "w" "id") element = do + , Just enId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -813,8 +934,8 @@ childElemToRun _ _ = throwError WrongElem elemToRun :: NameSpaces -> Element -> D Run elemToRun ns element | isElem ns "w" "r" element - , Just altCont <- findChild (elemName ns "mc" "AlternateContent") element = - do let choices = findChildren (elemName ns "mc" "Choice") altCont + , Just altCont <- findChildByName ns "mc" "AlternateContent" element = + do let choices = findChildrenByName ns "mc" "Choice" altCont choiceChildren = map head $ filter (not . null) $ map elChildren choices outputs <- mapD (childElemToRun ns) choiceChildren case outputs of @@ -822,15 +943,15 @@ elemToRun ns element [] -> throwError WrongElem elemToRun ns element | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + , Just drawingElem <- findChildByName ns "w" "drawing" element = childElemToRun ns drawingElem elemToRun ns element | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "footnoteReference") element = + , Just ref <- findChildByName ns "w" "footnoteReference" element = childElemToRun ns ref elemToRun ns element | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "endnoteReference") element = + , Just ref <- findChildByName ns "w" "endnoteReference" element = childElemToRun ns ref elemToRun ns element | isElem ns "w" "r" element = do @@ -854,22 +975,37 @@ getParStyleField field stylemap styles = Just y getParStyleField _ _ _ = Nothing +getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange +getTrackedChange ns element + | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , Just cDate <- findAttrByName ns "w" "date" element = + Just $ TrackedChange Insertion (ChangeInfo cId cAuthor cDate) +getTrackedChange ns element + | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , Just cDate <- findAttrByName ns "w" "date" element = + Just $ TrackedChange Deletion (ChangeInfo cId cAuthor cDate) +getTrackedChange _ _ = Nothing + elemToParagraphStyle :: NameSpaces -> Element -> ParStyleMap -> ParagraphStyle elemToParagraphStyle ns element sty - | Just pPr <- findChild (elemName ns "w" "pPr") element = + | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (findAttr (elemName ns "w" "val")) - (findChildren (elemName ns "w" "pStyle") pPr) + (findAttrByName ns "w" "val") + (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle {pStyle = style , indentation = - findChild (elemName ns "w" "ind") pPr >>= + findChildByName ns "w" "ind" pPr >>= elemToParIndentation ns , dropCap = case - findChild (elemName ns "w" "framePr") pPr >>= - findAttr (elemName ns "w" "dropCap") + findChildByName ns "w" "framePr" pPr >>= + findAttrByName ns "w" "dropCap" of Just "none" -> False Just _ -> True @@ -877,13 +1013,20 @@ elemToParagraphStyle ns element sty , pHeading = getParStyleField headingLev sty style , pNumInfo = getParStyleField numInfo sty style , pBlockQuote = getParStyleField isBlockQuote sty style + , pChange = findChildByName ns "w" "rPr" pPr >>= + filterChild (\e -> isElem ns "w" "ins" e || + isElem ns "w" "moveTo" e || + isElem ns "w" "del" e || + isElem ns "w" "moveFrom" e + ) >>= + getTrackedChange ns } elemToParagraphStyle _ _ _ = defaultParagraphStyle checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag | Just t <- findChild tag rPr - , Just val <- findAttr (elemName ns "w" "val") t = + , Just val <- findAttrByName ns "w" "val" t = Just $ case val of "true" -> True "false" -> False @@ -897,11 +1040,11 @@ checkOnOff _ _ _ = Nothing elemToRunStyleD :: NameSpaces -> Element -> D RunStyle elemToRunStyleD ns element - | Just rPr <- findChild (elemName ns "w" "rPr") element = do + | Just rPr <- findChildByName ns "w" "rPr" element = do charStyles <- asks envCharStyles let parentSty = case - findChild (elemName ns "w" "rStyle") rPr >>= - findAttr (elemName ns "w" "val") + findChildByName ns "w" "rStyle" rPr >>= + findAttrByName ns "w" "val" of Just styName | Just style <- M.lookup styName charStyles -> Just (styName, style) @@ -911,7 +1054,7 @@ elemToRunStyleD _ _ = return defaultRunStyle elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle elemToRunStyle ns element parentStyle - | Just rPr <- findChild (elemName ns "w" "rPr") element = + | Just rPr <- findChildByName ns "w" "rPr" element = RunStyle { isBold = checkOnOff ns rPr (elemName ns "w" "b") @@ -919,32 +1062,31 @@ elemToRunStyle ns element parentStyle , isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps") , isStrike = checkOnOff ns rPr (elemName ns "w" "strike") , rVertAlign = - findChild (elemName ns "w" "vertAlign") rPr >>= - findAttr (elemName ns "w" "val") >>= + findChildByName ns "w" "vertAlign" rPr >>= + findAttrByName ns "w" "val" >>= \v -> Just $ case v of "superscript" -> SupScrpt "subscript" -> SubScrpt _ -> BaseLn , rUnderline = - findChild (elemName ns "w" "u") rPr >>= - findAttr (elemName ns "w" "val") + findChildByName ns "w" "u" rPr >>= + findAttrByName ns "w" "val" , rStyle = parentStyle } elemToRunStyle _ _ _ = defaultRunStyle -isNumericNotNull :: String -> Bool -isNumericNotNull str = (str /= []) && (all isDigit str) - getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) getHeaderLevel ns element - | Just styleId <- findAttr (elemName ns "w" "styleId") element + | Just styleId <- findAttrByName ns "w" "styleId" element , Just index <- stripPrefix "Heading" styleId - , isNumericNotNull index = Just (styleId, read index) - | Just styleId <- findAttr (elemName ns "w" "styleId") element - , Just index <- findChild (elemName ns "w" "name") element >>= - findAttr (elemName ns "w" "val") >>= + , Just n <- stringToInteger index + , n > 0 = Just (styleId, fromInteger n) + | Just styleId <- findAttrByName ns "w" "styleId" element + , Just index <- findChildByName ns "w" "name" element >>= + findAttrByName ns "w" "val" >>= stripPrefix "heading " - , isNumericNotNull index = Just (styleId, read index) + , Just n <- stringToInteger index + , n > 0 = Just (styleId, fromInteger n) getHeaderLevel _ _ = Nothing blockQuoteStyleIds :: [String] @@ -955,23 +1097,23 @@ blockQuoteStyleNames = ["Quote", "Block Text"] getBlockQuote :: NameSpaces -> Element -> Maybe Bool getBlockQuote ns element - | Just styleId <- findAttr (elemName ns "w" "styleId") element + | Just styleId <- findAttrByName ns "w" "styleId" element , styleId `elem` blockQuoteStyleIds = Just True - | Just styleName <- findChild (elemName ns "w" "name") element >>= - findAttr (elemName ns "w" "val") + | Just styleName <- findChildByName ns "w" "name" element >>= + findAttrByName ns "w" "val" , styleName `elem` blockQuoteStyleNames = Just True getBlockQuote _ _ = Nothing getNumInfo :: NameSpaces -> Element -> Maybe (String, String) getNumInfo ns element = do - let numPr = findChild (elemName ns "w" "pPr") element >>= - findChild (elemName ns "w" "numPr") + let numPr = findChildByName ns "w" "pPr" element >>= + findChildByName ns "w" "numPr" lvl = fromMaybe "0" (numPr >>= - findChild (elemName ns "w" "ilvl") >>= - findAttr (elemName ns "w" "val")) + findChildByName ns "w" "ilvl" >>= + findAttrByName ns "w" "val") numId <- numPr >>= - findChild (elemName ns "w" "numId") >>= - findAttr (elemName ns "w" "val") + findChildByName ns "w" "numId" >>= + findAttrByName ns "w" "val" return (numId, lvl) @@ -1015,10 +1157,10 @@ getSymChar ns element let [(char, _)] = readLitChar ("\\x" ++ s) in TextRun . maybe "" (:[]) $ getUnicode font char where - getCodepoint = findAttr (elemName ns "w" "char") element - getFont = stringToFont =<< findAttr (elemName ns "w" "font") element + getCodepoint = findAttrByName ns "w" "char" element + getFont = stringToFont =<< findAttrByName ns "w" "font" element lowerFromPrivate ('F':xs) = '0':xs - lowerFromPrivate xs = xs + lowerFromPrivate xs = xs getSymChar _ _ = TextRun "" elemToRunElems :: NameSpaces -> Element -> D [RunElem] @@ -1029,11 +1171,9 @@ elemToRunElems ns element let font = do fontElem <- findElement (qualName "rFonts") element stringToFont =<< - (foldr (<|>) Nothing $ - map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"]) + foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem setFont :: Maybe Font -> ReaderEnv -> ReaderEnv setFont f s = s{envFont = f} - diff --git a/src/Text/Pandoc/Readers/Docx/StyleMap.hs b/src/Text/Pandoc/Readers/Docx/StyleMap.hs index 00906cf07..b32a73770 100644 --- a/src/Text/Pandoc/Readers/Docx/StyleMap.hs +++ b/src/Text/Pandoc/Readers/Docx/StyleMap.hs @@ -7,11 +7,11 @@ module Text.Pandoc.Readers.Docx.StyleMap ( StyleMaps(..) , hasStyleName ) where -import Text.XML.Light -import Text.Pandoc.Readers.Docx.Util -import Control.Monad.State -import Data.Char (toLower) -import qualified Data.Map as M +import Control.Monad.State.Strict +import Data.Char (toLower) +import qualified Data.Map as M +import Text.Pandoc.Readers.Docx.Util +import Text.XML.Light newtype ParaStyleMap = ParaStyleMap ( M.Map String String ) newtype CharStyleMap = CharStyleMap ( M.Map String String ) @@ -30,7 +30,7 @@ instance StyleMap CharStyleMap where insert :: (StyleMap a) => Maybe String -> Maybe String -> a -> a insert (Just k) (Just v) m = alterMap (M.insert k v) m -insert _ _ m = m +insert _ _ m = m getStyleId :: (StyleMap a) => String -> a -> String getStyleId s = M.findWithDefault (filter (/=' ') s) (map toLower s) . getMap diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index 33d69ccf3..d9d65bc07 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -3,10 +3,13 @@ module Text.Pandoc.Readers.Docx.Util ( , elemName , isElem , elemToNameSpaces + , findChildByName + , findChildrenByName + , findAttrByName ) where -import Text.XML.Light import Data.Maybe (mapMaybe) +import Text.XML.Light type NameSpaces = [(String, String)] @@ -15,7 +18,7 @@ elemToNameSpaces = mapMaybe attrToNSPair . elAttribs attrToNSPair :: Attr -> Maybe (String, String) attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) -attrToNSPair _ = Nothing +attrToNSPair _ = Nothing elemName :: NameSpaces -> String -> String -> QName elemName ns prefix name = @@ -23,5 +26,21 @@ elemName ns prefix name = isElem :: NameSpaces -> String -> String -> Element -> Bool isElem ns prefix name element = - qName (elName element) == name && - qURI (elName element) == lookup prefix ns + let ns' = ns ++ elemToNameSpaces element + in qName (elName element) == name && + qURI (elName element) == lookup prefix ns' + +findChildByName :: NameSpaces -> String -> String -> Element -> Maybe Element +findChildByName ns pref name el = + let ns' = ns ++ elemToNameSpaces el + in findChild (elemName ns' pref name) el + +findChildrenByName :: NameSpaces -> String -> String -> Element -> [Element] +findChildrenByName ns pref name el = + let ns' = ns ++ elemToNameSpaces el + in findChildren (elemName ns' pref name) el + +findAttrByName :: NameSpaces -> String -> String -> Element -> Maybe String +findAttrByName ns pref name el = + let ns' = ns ++ elemToNameSpaces el + in findAttr (elemName ns' pref name) el diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 4c31bf1ae..3b13bbe13 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -1,55 +1,54 @@ -{-# LANGUAGE - ViewPatterns - , StandaloneDeriving - , TupleSections - , FlexibleContexts #-} +{-# LANGUAGE FlexibleContexts #-} + +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE ViewPatterns #-} module Text.Pandoc.Readers.EPUB (readEPUB) where -import Text.XML.Light +import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry, + toArchiveOrFail) +import Control.DeepSeq (NFData, deepseq) +import Control.Monad (guard, liftM) +import Control.Monad.Except (throwError) +import qualified Data.ByteString.Lazy as BL (ByteString) +import Data.List (isInfixOf, isPrefixOf) +import qualified Data.Map as M (Map, elems, fromList, lookup) +import Data.Maybe (fromMaybe, mapMaybe) +import Data.Monoid ((<>)) +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TL +import Network.URI (unEscapeString) +import System.FilePath (dropFileName, dropFileName, normalise, splitFileName, + takeFileName, (</>)) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad, insertMedia) import Text.Pandoc.Definition hiding (Attr) -import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Error -import Text.Pandoc.Walk (walk, query) -import Text.Pandoc.Options ( ReaderOptions(..), readerTrace) -import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField) -import Network.URI (unEscapeString) -import Text.Pandoc.MediaBag (MediaBag, insertMedia) -import Control.Monad.Except (MonadError, throwError, runExcept, Except) +import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension) import Text.Pandoc.MIME (MimeType) -import qualified Text.Pandoc.Builder as B -import Codec.Archive.Zip ( Archive (..), toArchiveOrFail, fromEntry - , findEntryByPath, Entry) -import qualified Data.ByteString.Lazy as BL (ByteString) -import System.FilePath ( takeFileName, (</>), dropFileName, normalise - , dropFileName - , splitFileName ) +import Text.Pandoc.Options (ReaderOptions (..)) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI) import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) -import Control.Monad (guard, liftM, when) -import Data.List (isPrefixOf, isInfixOf) -import Data.Maybe (mapMaybe, fromMaybe) -import qualified Data.Map as M (Map, lookup, fromList, elems) -import Data.Monoid ((<>)) -import Control.DeepSeq (deepseq, NFData) - -import Debug.Trace (trace) +import Text.Pandoc.Walk (query, walk) +import Text.XML.Light type Items = M.Map String (FilePath, MimeType) -readEPUB :: ReaderOptions -> BL.ByteString -> Either PandocError (Pandoc, MediaBag) +readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc readEPUB opts bytes = case toArchiveOrFail bytes of - Right archive -> runEPUB $ archiveToEPUB opts $ archive - Left _ -> Left $ ParseFailure "Couldn't extract ePub file" + Right archive -> archiveToEPUB opts archive + Left _ -> throwError $ PandocParseError "Couldn't extract ePub file" -runEPUB :: Except PandocError a -> Either PandocError a -runEPUB = runExcept +-- runEPUB :: Except PandocError a -> Either PandocError a +-- runEPUB = runExcept -- Note that internal reference are aggresively normalised so that all ids -- are of the form "filename#id" -- -archiveToEPUB :: (MonadError PandocError m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) +archiveToEPUB :: (PandocMonad m) => ReaderOptions -> Archive -> m Pandoc archiveToEPUB os archive = do -- root is path to folder with manifest file in (root, content) <- getManifest archive @@ -62,40 +61,36 @@ archiveToEPUB os archive = do Pandoc _ bs <- foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine)) `liftM` parseSpineElem root b) mempty spine - let ast = coverDoc <> (Pandoc meta bs) - let mediaBag = fetchImages (M.elems items) root archive ast - return $ (ast, mediaBag) + let ast = coverDoc <> Pandoc meta bs + fetchImages (M.elems items) root archive ast + return ast where - os' = os {readerParseRaw = True} - parseSpineElem :: MonadError PandocError m => FilePath -> (FilePath, MimeType) -> m Pandoc + os' = os {readerExtensions = enableExtension Ext_raw_html (readerExtensions os)} + parseSpineElem :: PandocMonad m => FilePath -> (FilePath, MimeType) -> m Pandoc parseSpineElem (normalise -> r) (normalise -> path, mime) = do - when (readerTrace os) (traceM path) doc <- mimeToReader mime r path let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty return $ docSpan <> doc - mimeToReader :: MonadError PandocError m => MimeType -> FilePath -> FilePath -> m Pandoc + mimeToReader :: PandocMonad m => MimeType -> FilePath -> FilePath -> m Pandoc mimeToReader "application/xhtml+xml" (unEscapeString -> root) (unEscapeString -> path) = do fname <- findEntryByPathE (root </> path) archive - html <- either throwError return . - readHtml os' . - UTF8.toStringLazy $ - fromEntry fname + html <- readHtml os' . TL.toStrict . TL.decodeUtf8 $ fromEntry fname return $ fixInternalReferences path html mimeToReader s _ (unEscapeString -> path) | s `elem` imageMimes = return $ imageToPandoc path - | otherwise = return $ mempty + | otherwise = return mempty -- paths should be absolute when this function is called -- renameImages should do this -fetchImages :: [(FilePath, MimeType)] +fetchImages :: PandocMonad m + => [(FilePath, MimeType)] -> FilePath -- ^ Root -> Archive -> Pandoc - -> MediaBag + -> m () fetchImages mimes root arc (query iq -> links) = - foldr (uncurry3 insertMedia) mempty - (mapMaybe getEntry links) + mapM_ (uncurry3 insertMedia) (mapMaybe getEntry links) where getEntry link = let abslink = normalise (root </> link) in @@ -104,7 +99,7 @@ fetchImages mimes root arc (query iq -> links) = iq :: Inline -> [FilePath] iq (Image _ _ (url, _)) = [url] -iq _ = [] +iq _ = [] -- Remove relative paths renameImages :: FilePath -> Inline -> Inline @@ -121,13 +116,13 @@ imageMimes = ["image/gif", "image/jpeg", "image/png"] type CoverImage = FilePath -parseManifest :: (MonadError PandocError m) => Element -> m (Maybe CoverImage, Items) +parseManifest :: (PandocMonad m) => Element -> m (Maybe CoverImage, Items) parseManifest content = do manifest <- findElementE (dfName "manifest") content let items = findChildren (dfName "item") manifest r <- mapM parseItem items let cover = findAttr (emptyName "href") =<< filterChild findCover manifest - return (cover, (M.fromList r)) + return (cover, M.fromList r) where findCover e = maybe False (isInfixOf "cover-image") (findAttr (emptyName "properties") e) @@ -137,18 +132,18 @@ parseManifest content = do mime <- findAttrE (emptyName "media-type") e return (uid, (href, mime)) -parseSpine :: MonadError PandocError m => Items -> Element -> m [(FilePath, MimeType)] +parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)] parseSpine is e = do spine <- findElementE (dfName "spine") e let itemRefs = findChildren (dfName "itemref") spine - mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + mapM (mkE "parseSpine" . flip M.lookup is) $ mapMaybe parseItemRef itemRefs where parseItemRef ref = do let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) guard linear findAttr (emptyName "idref") ref -parseMeta :: MonadError PandocError m => Element -> m Meta +parseMeta :: PandocMonad m => Element -> m Meta parseMeta content = do meta <- findElementE (dfName "metadata") content let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True @@ -164,29 +159,29 @@ parseMetaItem e@(stripNamespace . elName -> field) meta = renameMeta :: String -> String renameMeta "creator" = "author" -renameMeta s = s +renameMeta s = s -getManifest :: MonadError PandocError m => Archive -> m (String, Element) +getManifest :: PandocMonad m => Archive -> m (String, Element) getManifest archive = do metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry let namespaces = mapMaybe attrToNSPair (elAttribs docElem) ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) - as <- liftM ((map attrToPair) . elAttribs) + as <- fmap (map attrToPair . elAttribs) (findElementE (QName "rootfile" (Just ns) Nothing) docElem) manifestFile <- mkE "Root not found" (lookup "full-path" as) let rootdir = dropFileName manifestFile --mime <- lookup "media-type" as manifest <- findEntryByPathE manifestFile archive - liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + fmap ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) -- Fixup fixInternalReferences :: FilePath -> Pandoc -> Pandoc fixInternalReferences pathToFile = - (walk $ renameImages root) - . (walk $ fixBlockIRs filename) - . (walk $ fixInlineIRs filename) + walk (renameImages root) + . walk (fixBlockIRs filename) + . walk (fixInlineIRs filename) where (root, escapeURI -> filename) = splitFileName pathToFile @@ -221,7 +216,7 @@ fixAttrs :: FilePath -> B.Attr -> B.Attr fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) addHash :: String -> String -> String -addHash _ "" = "" +addHash _ "" = "" addHash s ident = takeFileName s ++ "#" ++ ident removeEPUBAttrs :: [(String, String)] -> [(String, String)] @@ -242,9 +237,6 @@ foldM' f z (x:xs) = do uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d uncurry3 f (a, b, c) = f a b c -traceM :: Monad m => String -> m () -traceM = flip trace (return ()) - -- Utility stripNamespace :: QName -> String @@ -252,7 +244,7 @@ stripNamespace (QName v _ _) = v attrToNSPair :: Attr -> Maybe (String, String) attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) -attrToNSPair _ = Nothing +attrToNSPair _ = Nothing attrToPair :: Attr -> (String, String) attrToPair (Attr (QName name _ _) val) = (name, val) @@ -268,18 +260,18 @@ emptyName s = QName s Nothing Nothing -- Convert Maybe interface to Either -findAttrE :: MonadError PandocError m => QName -> Element -> m String +findAttrE :: PandocMonad m => QName -> Element -> m String findAttrE q e = mkE "findAttr" $ findAttr q e -findEntryByPathE :: MonadError PandocError m => FilePath -> Archive -> m Entry +findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry findEntryByPathE (normalise -> path) a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a -parseXMLDocE :: MonadError PandocError m => String -> m Element +parseXMLDocE :: PandocMonad m => String -> m Element parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc -findElementE :: MonadError PandocError m => QName -> Element -> m Element +findElementE :: PandocMonad m => QName -> Element -> m Element findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x -mkE :: MonadError PandocError m => String -> Maybe a -> m a -mkE s = maybe (throwError . ParseFailure $ s) return +mkE :: PandocMonad m => String -> Maybe a -> m a +mkE s = maybe (throwError . PandocParseError $ s) return diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index abe5f66ce..0e79f9ec3 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,7 +1,10 @@ -{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses, -ViewPatterns#-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} {- -Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +23,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2015 John MacFarlane + Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,66 +37,79 @@ module Text.Pandoc.Readers.HTML ( readHtml , htmlInBalanced , isInlineTag , isBlockTag + , NamedTag(..) , isTextTag , isCommentTag ) where +import Control.Applicative ((<|>)) +import Control.Arrow (first) +import Control.Monad (guard, mplus, msum, mzero, unless, void) +import Control.Monad.Except (throwError) +import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT) +import Data.Char (isAlphaNum, isDigit, isLetter) +import Data.Default (Default (..), def) +import Data.Foldable (for_) +import Data.List (isPrefixOf) +import Data.List.Split (wordsBy, splitWhen) +import qualified Data.Map as M +import Data.Maybe (fromMaybe, isJust, isNothing) +import Data.Monoid (First (..), (<>)) +import qualified Data.Set as Set +import Data.Text (Text) +import qualified Data.Text as T +import Network.URI (URI, nonStrictRelativeTo, parseURIReference) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match -import Text.Pandoc.Definition +import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) -import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField - , escapeURI, safeRead, mapLeft ) -import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) - , Extension (Ext_epub_html_exts, - Ext_native_divs, Ext_native_spans)) +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) +import Text.Pandoc.Definition +import Text.Pandoc.Error +import Text.Pandoc.Logging +import Text.Pandoc.Options ( + Extension (Ext_epub_html_exts, Ext_empty_paragraphs, Ext_native_divs, + Ext_native_spans, Ext_raw_html, Ext_line_blocks), + ReaderOptions (readerExtensions, readerStripComments), + extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, + extractSpaces, safeRead, underlineSpan) import Text.Pandoc.Walk -import qualified Data.Map as M -import Data.Maybe ( fromMaybe, isJust) -import Data.List ( intercalate, isInfixOf, isPrefixOf ) -import Data.Char ( isDigit ) -import Control.Monad ( guard, when, mzero, void, unless ) -import Control.Arrow ((***)) -import Control.Applicative ( (<|>) ) -import Data.Monoid (First (..)) -import Text.Printf (printf) -import Debug.Trace (trace) -import Text.TeXMath (readMathML, writeTeX) -import Data.Default (Default (..), def) -import Control.Monad.Reader (Reader,ask, asks, local, runReader) -import Network.URI (URI, parseURIReference, nonStrictRelativeTo) -import Text.Pandoc.Error -import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) -import Data.Monoid ((<>)) import Text.Parsec.Error -import qualified Data.Set as Set +import Text.TeXMath (readMathML, writeTeX) -- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assumes @'\n'@ line endings) - -> Either PandocError Pandoc -readHtml opts inp = - mapLeft (ParseFailure . getError) . flip runReader def $ - runParserT parseDoc - (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty) - "source" tags - where tags = stripPrefixes . canonicalizeTags $ - parseTagsOptions parseOptions{ optTagPosition = True } inp - parseDoc = do - blocks <- (fixPlains False) . mconcat <$> manyTill block eof - meta <- stateMeta . parserState <$> getState - bs' <- replaceNotes (B.toList blocks) - return $ Pandoc meta bs' - getError (errorMessages -> ms) = case ms of - [] -> "" - (m:_) -> messageString m - -replaceNotes :: [Block] -> TagParser [Block] +readHtml :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> m Pandoc +readHtml opts inp = do + let tags = stripPrefixes . canonicalizeTags $ + parseTagsOptions parseOptions{ optTagPosition = True } + (crFilter inp) + parseDoc = do + blocks <- fixPlains False . mconcat <$> manyTill block eof + meta <- stateMeta . parserState <$> getState + bs' <- replaceNotes (B.toList blocks) + reportLogMessages + return $ Pandoc meta bs' + getError (errorMessages -> ms) = case ms of + [] -> "" + (m:_) -> messageString m + result <- flip runReaderT def $ + runParserT parseDoc + (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty []) + "source" tags + case result of + Right doc -> return doc + Left err -> throwError $ PandocParseError $ getError err + +replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes = walkM replaceNotes' -replaceNotes' :: Inline -> TagParser Inline +replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes where getNotes = noteTable <$> getState @@ -105,38 +121,46 @@ data HTMLState = noteTable :: [(String, Blocks)], baseHref :: Maybe URI, identifiers :: Set.Set String, - headerMap :: M.Map Inlines String + headerMap :: M.Map Inlines String, + logMessages :: [LogMessage] } data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext - , inChapter :: Bool -- ^ Set if in chapter section - , inPlain :: Bool -- ^ Set if in pPlain + , inChapter :: Bool -- ^ Set if in chapter section + , inPlain :: Bool -- ^ Set if in pPlain } -setInChapter :: HTMLParser s a -> HTMLParser s a +setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInChapter = local (\s -> s {inChapter = True}) -setInPlain :: HTMLParser s a -> HTMLParser s a +setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) -type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) +type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) -type TagParser = HTMLParser [Tag String] +type TagParser m = HTMLParser m [Tag Text] -pBody :: TagParser Blocks +pHtml :: PandocMonad m => TagParser m Blocks +pHtml = try $ do + (TagOpen "html" attr) <- lookAhead pAnyTag + for_ (lookup "lang" attr) $ + updateState . B.setMeta "lang" . B.text . T.unpack + pInTags "html" block + +pBody :: PandocMonad m => TagParser m Blocks pBody = pInTags "body" block -pHead :: TagParser Blocks +pHead :: PandocMonad m => TagParser m Blocks pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag) where pTitle = pInTags "title" inline >>= setTitle . trimInlines - setTitle t = mempty <$ (updateState $ B.setMeta "title" t) + setTitle t = mempty <$ updateState (B.setMeta "title" t) pMetaTag = do - mt <- pSatisfy (~== TagOpen "meta" []) - let name = fromAttrib "name" mt + mt <- pSatisfy (matchTagOpen "meta" []) + let name = T.unpack $ fromAttrib "name" mt if null name then return mempty else do - let content = fromAttrib "content" mt + let content = T.unpack $ fromAttrib "content" mt updateState $ \s -> let ps = parserState s in s{ parserState = ps{ @@ -144,15 +168,13 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag (stateMeta ps) } } return mempty pBaseTag = do - bt <- pSatisfy (~== TagOpen "base" []) + bt <- pSatisfy (matchTagOpen "base" []) updateState $ \st -> st{ baseHref = - parseURIReference $ fromAttrib "href" bt } + parseURIReference $ T.unpack $ fromAttrib "href" bt } return mempty -block :: TagParser Blocks +block :: PandocMonad m => TagParser m Blocks block = do - tr <- getOption readerTrace - pos <- getPosition res <- choice [ eSection , eSwitch B.para block @@ -166,94 +188,107 @@ block = do , pList , pHrule , pTable + , pHtml , pHead , pBody + , pLineBlock , pDiv , pPlain + , pFigure , pRawHtmlBlock ] - when tr $ trace (printf "line %d: %s" (sourceLine pos) - (take 60 $ show $ B.toList res)) (return ()) + trace (take 60 $ show $ B.toList res) return res -namespaces :: [(String, TagParser Inlines)] +namespaces :: PandocMonad m => [(String, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] mathMLNamespace :: String mathMLNamespace = "http://www.w3.org/1998/Math/MathML" -eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a +eSwitch :: (PandocMonad m, Monoid a) + => (Inlines -> a) + -> TagParser m a + -> TagParser m a eSwitch constructor parser = try $ do guardEnabled Ext_epub_html_exts - pSatisfy (~== TagOpen "switch" []) + pSatisfy (matchTagOpen "switch" []) cases <- getFirst . mconcat <$> manyTill (First <$> (eCase <* skipMany pBlank) ) - (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + (lookAhead $ try $ pSatisfy (matchTagOpen "default" [])) skipMany pBlank fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) skipMany pBlank - pSatisfy (~== TagClose "switch") + pSatisfy (matchTagClose "switch") return $ maybe fallback constructor cases -eCase :: TagParser (Maybe Inlines) +eCase :: PandocMonad m => TagParser m (Maybe Inlines) eCase = do skipMany pBlank - TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) - case (flip lookup namespaces) =<< lookup "required-namespace" attr of - Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) - Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "case" []) + let attr = toStringAttr attr' + case flip lookup namespaces =<< lookup "required-namespace" attr of + Just p -> Just <$> pInTags "case" (skipMany pBlank *> p <* skipMany pBlank) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case")) -eFootnote :: TagParser () +eFootnote :: PandocMonad m => TagParser m () eFootnote = try $ do let notes = ["footnote", "rearnote"] guardEnabled Ext_epub_html_exts - (TagOpen tag attr) <- lookAhead $ pAnyTag - guard (maybe False (flip elem notes) (lookup "type" attr)) + (TagOpen tag attr') <- lookAhead pAnyTag + let attr = toStringAttr attr' + guard $ maybe False (`elem` notes) (lookup "type" attr) let ident = fromMaybe "" (lookup "id" attr) content <- pInTags tag block addNote ident content -addNote :: String -> Blocks -> TagParser () -addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) +addNote :: PandocMonad m => String -> Blocks -> TagParser m () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s}) -eNoteref :: TagParser Inlines +eNoteref :: PandocMonad m => TagParser m Inlines eNoteref = try $ do guardEnabled Ext_epub_html_exts - TagOpen tag attr <- lookAhead $ pAnyTag - guard (maybe False (== "noteref") (lookup "type" attr)) + TagOpen tag attr' <- lookAhead pAnyTag + let attr = toStringAttr attr' + guard $ lookup "type" attr == Just "noteref" let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) guard (not (null ident)) pInTags tag block return $ B.rawInline "noteref" ident -- Strip TOC if there is one, better to generate again -eTOC :: TagParser () +eTOC :: PandocMonad m => TagParser m () eTOC = try $ do guardEnabled Ext_epub_html_exts - (TagOpen tag attr) <- lookAhead $ pAnyTag - guard (maybe False (== "toc") (lookup "type" attr)) + (TagOpen tag attr) <- lookAhead pAnyTag + guard $ lookup "type" attr == Just "toc" void (pInTags tag block) -pList :: TagParser Blocks +pList :: PandocMonad m => TagParser m Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList -pBulletList :: TagParser Blocks +pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do - pSatisfy (~== TagOpen "ul" []) + pSatisfy (matchTagOpen "ul" []) let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ul")) + not (matchTagClose "ul" t)) -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pListItem nonItem) (pCloses "ul") return $ B.bulletList $ map (fixPlains True) items -pListItem :: TagParser a -> TagParser Blocks +pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks pListItem nonItem = do - TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) - let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) - (liDiv <>) <$> pInTags "li" block <* skipMany nonItem + TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" []) + let attr = toStringAttr attr' + let addId ident bs = case B.toList bs of + (Plain ils:xs) -> B.fromList (Plain + [Span (ident, [], []) ils] : xs) + _ -> B.divWith (ident, [], []) bs + maybe id addId (lookup "id" attr) <$> + pInTags "li" block <* skipMany nonItem parseListStyleType :: String -> ListNumberStyle parseListStyleType "lower-roman" = LowerRoman @@ -271,9 +306,10 @@ parseTypeAttr "A" = UpperAlpha parseTypeAttr "1" = Decimal parseTypeAttr _ = DefaultStyle -pOrderedList :: TagParser Blocks +pOrderedList :: PandocMonad m => TagParser m Blocks pOrderedList = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) + TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" []) + let attribs = toStringAttr attribs' let (start, style) = (sta', sty') where sta = fromMaybe "1" $ lookup "start" attribs @@ -295,23 +331,23 @@ pOrderedList = try $ do ] let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ol")) + not (matchTagClose "ol" t)) -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pListItem nonItem) (pCloses "ol") return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items -pDefinitionList :: TagParser Blocks +pDefinitionList :: PandocMonad m => TagParser m Blocks pDefinitionList = try $ do - pSatisfy (~== TagOpen "dl" []) + pSatisfy (matchTagOpen "dl" []) items <- manyTill pDefListItem (pCloses "dl") return $ B.definitionList items -pDefListItem :: TagParser (Inlines, [Blocks]) +pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks]) pDefListItem = try $ do - let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && - not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) + let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) && + not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t)) terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem @@ -322,133 +358,165 @@ fixPlains :: Bool -> Blocks -> Blocks fixPlains inList bs = if any isParaish bs' then B.fromList $ map plainToPara bs' else bs - where isParaish (Para _) = True - isParaish (CodeBlock _ _) = True - isParaish (Header _ _ _) = True - isParaish (BlockQuote _) = True - isParaish (BulletList _) = not inList - isParaish (OrderedList _ _) = not inList - isParaish (DefinitionList _) = not inList - isParaish _ = False + where isParaish Para{} = True + isParaish CodeBlock{} = True + isParaish Header{} = True + isParaish BlockQuote{} = True + isParaish BulletList{} = not inList + isParaish OrderedList{} = not inList + isParaish DefinitionList{} = not inList + isParaish _ = False plainToPara (Plain xs) = Para xs - plainToPara x = x + plainToPara x = x bs' = B.toList bs -pRawTag :: TagParser String +pRawTag :: PandocMonad m => TagParser m Text pRawTag = do tag <- pAnyTag let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag - then return [] + then return mempty else return $ renderTags' [tag] -pDiv :: TagParser Blocks +pLineBlock :: PandocMonad m => TagParser m Blocks +pLineBlock = try $ do + guardEnabled Ext_line_blocks + _ <- pSatisfy $ tagOpen (=="div") (== [("class","line-block")]) + ils <- trimInlines . mconcat <$> manyTill inline (pSatisfy (tagClose (=="div"))) + let lns = map B.fromList $ + splitWhen (== LineBreak) $ filter (/= SoftBreak) $ + B.toList ils + return $ B.lineBlock lns + +pDiv :: PandocMonad m => TagParser m Blocks pDiv = try $ do guardEnabled Ext_native_divs - let isDivLike "div" = True + let isDivLike "div" = True isDivLike "section" = True - isDivLike _ = False - TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) + isDivLike "main" = True + isDivLike _ = False + TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) + let attr = toStringAttr attr' contents <- pInTags tag block let (ident, classes, kvs) = mkAttr attr let classes' = if tag == "section" then "section":classes else classes - return $ B.divWith (ident, classes', kvs) contents + kvs' = if tag == "main" && isNothing (lookup "role" kvs) + then ("role", "main"):kvs + else kvs + return $ B.divWith (ident, classes', kvs') contents -pRawHtmlBlock :: TagParser Blocks +pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do - raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag - parseRaw <- getOption readerParseRaw - if parseRaw && not (null raw) + raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag) + exts <- getOption readerExtensions + if extensionEnabled Ext_raw_html exts && not (null raw) then return $ B.rawBlock "html" raw - else return mempty + else ignore raw + +ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a +ignore raw = do + pos <- getPosition + -- raw can be null for tags like <!DOCTYPE>; see paRawTag + -- in this case we don't want a warning: + unless (null raw) $ + logMessage $ SkippedContent raw pos + return mempty -pHtmlBlock :: String -> TagParser String +pHtmlBlock :: PandocMonad m => Text -> TagParser m Text pHtmlBlock t = try $ do - open <- pSatisfy (~== TagOpen t []) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) - return $ renderTags' $ [open] ++ contents ++ [TagClose t] + open <- pSatisfy (matchTagOpen t []) + contents <- manyTill pAnyTag (pSatisfy (matchTagClose t)) + return $ renderTags' $ [open] <> contents <> [TagClose t] -- Sets chapter context -eSection :: TagParser Blocks +eSection :: PandocMonad m => TagParser m Blocks eSection = try $ do - let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as) let sectTag = tagOpen (`elem` sectioningContent) matchChapter TagOpen tag _ <- lookAhead $ pSatisfy sectTag setInChapter (pInTags tag block) -headerLevel :: String -> TagParser Int -headerLevel tagtype = do - let level = read (drop 1 tagtype) - (try $ do - guardEnabled Ext_epub_html_exts - asks inChapter >>= guard - return (level - 1)) - <|> - return level - -eTitlePage :: TagParser () +headerLevel :: PandocMonad m => Text -> TagParser m Int +headerLevel tagtype = + case safeRead (T.unpack (T.drop 1 tagtype)) of + Just level -> +-- try (do +-- guardEnabled Ext_epub_html_exts +-- asks inChapter >>= guard +-- return (level - 1)) +-- <|> + return level + Nothing -> fail "Could not retrieve header level" + +eTitlePage :: PandocMonad m => TagParser m () eTitlePage = try $ do - let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as) + let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as) let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section") isTitlePage TagOpen tag _ <- lookAhead $ pSatisfy groupTag () <$ pInTags tag block -pHeader :: TagParser Blocks +pHeader :: PandocMonad m => TagParser m Blocks pHeader = try $ do - TagOpen tagtype attr <- pSatisfy $ + TagOpen tagtype attr' <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) - let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] + let attr = toStringAttr attr' + let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) + [("class","title")] level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] - attr' <- registerHeader (ident, classes, keyvals) contents + attr'' <- registerHeader (ident, classes, keyvals) contents return $ if bodyTitle then mempty -- skip a representation of the title in the body - else B.headerWith attr' level contents + else B.headerWith attr'' level contents -pHrule :: TagParser Blocks +pHrule :: PandocMonad m => TagParser m Blocks pHrule = do pSelfClosing (=="hr") (const True) return B.horizontalRule -pTable :: TagParser Blocks +pTable :: PandocMonad m => TagParser m Blocks pTable = try $ do - TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) + TagOpen _ _ <- pSatisfy (matchTagOpen "table" []) skipMany pBlank caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank widths' <- (mconcat <$> many1 pColgroup) <|> many pCol let pTh = option [] $ pInTags "tr" (pCell "th") - pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th") - pTBody = do pOptInTag "tbody" $ many1 pTr + pTr = try $ skipMany pBlank >> + pInTags "tr" (pCell "td" <|> pCell "th") + pTBody = pOptInTag "tbody" $ many1 pTr head'' <- pOptInTag "thead" pTh - head' <- pOptInTag "tbody" $ do - if null head'' - then pTh - else return head'' + head' <- map snd <$> + pOptInTag "tbody" + (if null head'' then pTh else return head'') rowsLs <- many pTBody rows' <- pOptInTag "tfoot" $ many pTr - TagClose _ <- pSatisfy (~== TagClose "table") - let rows'' = (concat rowsLs) ++ rows' + TagClose _ <- pSatisfy (matchTagClose "table") + let rows'' = concat rowsLs <> rows' + let rows''' = map (map snd) rows'' + -- let rows''' = map (map snd) rows'' -- fail on empty table - guard $ not $ null head' && null rows'' + guard $ not $ null head' && null rows''' let isSinglePlain x = case B.toList x of [] -> True [Plain _] -> True _ -> False - let isSimple = all isSinglePlain $ concat (head':rows'') - let cols = length $ if null head' then head rows'' else head' + let isSimple = all isSinglePlain $ concat (head':rows''') + let cols = length $ if null head' then head rows''' else head' -- add empty cells to short rows let addEmpties r = case cols - length r of - n | n > 0 -> r ++ replicate n mempty + n | n > 0 -> r <> replicate n mempty | otherwise -> r - let rows = map addEmpties rows'' - let aligns = replicate cols AlignDefault + let rows = map addEmpties rows''' + let aligns = case rows'' of + (cs:_) -> map fst cs + _ -> replicate cols AlignDefault let widths = if null widths' then if isSimple then replicate cols 0 @@ -456,80 +524,120 @@ pTable = try $ do else widths' return $ B.table caption (zip aligns widths) head' rows -pCol :: TagParser Double +pCol :: PandocMonad m => TagParser m Double pCol = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) + TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) + let attribs = toStringAttr attribs' skipMany pBlank - optional $ pSatisfy (~== TagClose "col") + optional $ pSatisfy (matchTagClose "col") skipMany pBlank - return $ case lookup "width" attribs of + let width = case lookup "width" attribs of Nothing -> case lookup "style" attribs of Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs -> - fromMaybe 0.0 $ safeRead ('0':'.':filter - (`notElem` " \t\r\n%'\";") xs) + fromMaybe 0.0 $ safeRead (filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) _ -> 0.0 Just x | not (null x) && last x == '%' -> - fromMaybe 0.0 $ safeRead ('0':'.':init x) + fromMaybe 0.0 $ safeRead (init x) _ -> 0.0 + if width > 0.0 + then return $ width / 100.0 + else return 0.0 -pColgroup :: TagParser [Double] +pColgroup :: PandocMonad m => TagParser m [Double] pColgroup = try $ do - pSatisfy (~== TagOpen "colgroup" []) + pSatisfy (matchTagOpen "colgroup" []) skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank -noColOrRowSpans :: Tag String -> Bool +noColOrRowSpans :: Tag Text -> Bool noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" where isNullOrOne x = case fromAttrib x t of "" -> True "1" -> True _ -> False -pCell :: String -> TagParser [Blocks] +pCell :: PandocMonad m => Text -> TagParser m [(Alignment, Blocks)] pCell celltype = try $ do skipMany pBlank + tag <- lookAhead $ + pSatisfy (\t -> t ~== TagOpen celltype [] && noColOrRowSpans t) + let extractAlign' [] = "" + extractAlign' ("text-align":x:_) = x + extractAlign' (_:xs) = extractAlign' xs + let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':']) + let align = case maybeFromAttrib "align" tag `mplus` + (extractAlign <$> maybeFromAttrib "style" tag) of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault res <- pInTags' celltype noColOrRowSpans block skipMany pBlank - return [res] + return [(align, res)] -pBlockQuote :: TagParser Blocks +pBlockQuote :: PandocMonad m => TagParser m Blocks pBlockQuote = do contents <- pInTags "blockquote" block return $ B.blockQuote $ fixPlains False contents -pPlain :: TagParser Blocks +pPlain :: PandocMonad m => TagParser m Blocks pPlain = do contents <- setInPlain $ trimInlines . mconcat <$> many1 inline if B.isNull contents then return mempty else return $ B.plain contents -pPara :: TagParser Blocks +pPara :: PandocMonad m => TagParser m Blocks pPara = do contents <- trimInlines <$> pInTags "p" inline - return $ B.para contents + (do guardDisabled Ext_empty_paragraphs + guard (B.isNull contents) + return mempty) + <|> return (B.para contents) + +pFigure :: PandocMonad m => TagParser m Blocks +pFigure = try $ do + TagOpen _ _ <- pSatisfy (matchTagOpen "figure" []) + skipMany pBlank + let pImg = (\x -> (Just x, Nothing)) <$> + (pOptInTag "p" pImage <* skipMany pBlank) + pCapt = (\x -> (Nothing, Just x)) <$> do + bs <- pInTags "figcaption" block + return $ blocksToInlines' $ B.toList bs + pSkip = (Nothing, Nothing) <$ pSatisfy (not . matchTagClose "figure") + res <- many (pImg <|> pCapt <|> pSkip) + let mbimg = msum $ map fst res + let mbcap = msum $ map snd res + TagClose _ <- pSatisfy (matchTagClose "figure") + let caption = fromMaybe mempty mbcap + case B.toList <$> mbimg of + Just [Image attr _ (url, tit)] -> + return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption + _ -> mzero -pCodeBlock :: TagParser Blocks +pCodeBlock :: PandocMonad m => TagParser m Blocks pCodeBlock = try $ do - TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) + TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) + let attr = toStringAttr attr' contents <- manyTill pAnyTag (pCloses "pre" <|> eof) let rawText = concatMap tagToString contents -- drop leading newline if any let result' = case rawText of - '\n':xs -> xs - _ -> rawText + '\n':xs -> xs + _ -> rawText -- drop trailing newline if any let result = case reverse result' of - '\n':_ -> init result' - _ -> result' + '\n':_ -> init result' + _ -> result' return $ B.codeBlockWith (mkAttr attr) result -tagToString :: Tag String -> String -tagToString (TagText s) = s +tagToString :: Tag Text -> String +tagToString (TagText s) = T.unpack s tagToString (TagOpen "br" _) = "\n" -tagToString _ = "" +tagToString _ = "" -inline :: TagParser Inlines +inline :: PandocMonad m => TagParser m Inlines inline = choice [ eNoteref , eSwitch id inline @@ -540,6 +648,7 @@ inline = choice , pSuperscript , pSubscript , pStrikeout + , pUnderline , pLineBreak , pLink , pImage @@ -549,30 +658,31 @@ inline = choice , pRawHtmlInline ] -pLocation :: TagParser () +pLocation :: PandocMonad m => TagParser m () pLocation = do (TagPosition r c) <- pSat isTagPosition setPosition $ newPos "input" r c -pSat :: (Tag String -> Bool) -> TagParser (Tag String) +pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSat f = do pos <- getPosition token show (const pos) (\x -> if f x then Just x else Nothing) -pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String) +pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSatisfy f = try $ optional pLocation >> pSat f -pAnyTag :: TagParser (Tag String) +pAnyTag :: PandocMonad m => TagParser m (Tag Text) pAnyTag = pSatisfy (const True) -pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool) - -> TagParser (Tag String) +pSelfClosing :: PandocMonad m + => (Text -> Bool) -> ([Attribute Text] -> Bool) + -> TagParser m (Tag Text) pSelfClosing f g = do open <- pSatisfy (tagOpen f g) optional $ pSatisfy (tagClose f) return open -pQ :: TagParser Inlines +pQ :: PandocMonad m => TagParser m Inlines pQ = do context <- asks quoteContext let quoteType = case context of @@ -587,45 +697,50 @@ pQ = do withQuoteContext innerQuoteContext $ pInlinesInTags "q" constructor -pEmph :: TagParser Inlines +pEmph :: PandocMonad m => TagParser m Inlines pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph -pStrong :: TagParser Inlines +pStrong :: PandocMonad m => TagParser m Inlines pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong -pSuperscript :: TagParser Inlines +pSuperscript :: PandocMonad m => TagParser m Inlines pSuperscript = pInlinesInTags "sup" B.superscript -pSubscript :: TagParser Inlines +pSubscript :: PandocMonad m => TagParser m Inlines pSubscript = pInlinesInTags "sub" B.subscript -pStrikeout :: TagParser Inlines -pStrikeout = do +pStrikeout :: PandocMonad m => TagParser m Inlines +pStrikeout = pInlinesInTags "s" B.strikeout <|> pInlinesInTags "strike" B.strikeout <|> pInlinesInTags "del" B.strikeout <|> - try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) + try (do pSatisfy (matchTagOpen "span" [("class","strikeout")]) contents <- mconcat <$> manyTill inline (pCloses "span") return $ B.strikeout contents) -pLineBreak :: TagParser Inlines +pUnderline :: PandocMonad m => TagParser m Inlines +pUnderline = pInlinesInTags "u" underlineSpan <|> pInlinesInTags "ins" underlineSpan + +pLineBreak :: PandocMonad m => TagParser m Inlines pLineBreak = do pSelfClosing (=="br") (const True) return B.linebreak -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: String -> Tag String -> Maybe String -maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs +maybeFromAttrib :: String -> Tag Text -> Maybe String +maybeFromAttrib name (TagOpen _ attrs) = + T.unpack <$> lookup (T.pack name) attrs maybeFromAttrib _ _ = Nothing -pLink :: TagParser Inlines +pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) - let title = fromAttrib "title" tag + let title = T.unpack $ fromAttrib "title" tag -- take id from id attribute if present, otherwise name - let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag - let cls = words $ fromAttrib "class" tag + let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $ + maybeFromAttrib "id" tag + let cls = words $ T.unpack $ fromAttrib "class" tag lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") -- check for href; if href, then a link, otherwise a span case maybeFromAttrib "href" tag of @@ -639,128 +754,149 @@ pLink = try $ do _ -> url' return $ B.linkWith (uid, cls, []) (escapeURI url) title lab -pImage :: TagParser Inlines +pImage :: PandocMonad m => TagParser m Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") mbBaseHref <- baseHref <$> getState - let url' = fromAttrib "src" tag + let url' = T.unpack $ fromAttrib "src" tag let url = case (parseURIReference url', mbBaseHref) of (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) _ -> url' - let title = fromAttrib "title" tag - let alt = fromAttrib "alt" tag - let uid = fromAttrib "id" tag - let cls = words $ fromAttrib "class" tag + let title = T.unpack $ fromAttrib "title" tag + let alt = T.unpack $ fromAttrib "alt" tag + let uid = T.unpack $ fromAttrib "id" tag + let cls = words $ T.unpack $ fromAttrib "class" tag let getAtt k = case fromAttrib k tag of "" -> [] - v -> [(k, v)] - let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"] + v -> [(T.unpack k, T.unpack v)] + let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"] return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) -pCode :: TagParser Inlines +pCode :: PandocMonad m => TagParser m Inlines pCode = try $ do - (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + let attr = toStringAttr attr' result <- manyTill pAnyTag (pCloses open) - return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result + return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $ + innerText result -pSpan :: TagParser Inlines +pSpan :: PandocMonad m => TagParser m Inlines pSpan = try $ do guardEnabled Ext_native_spans - TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + let attr = toStringAttr attr' contents <- pInTags "span" inline - let isSmallCaps = fontVariant == "small-caps" + let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes where styleAttr = fromMaybe "" $ lookup "style" attr fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr + classes = fromMaybe [] $ + words <$> lookup "class" attr let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr) return $ tag contents -pRawHtmlInline :: TagParser Inlines +pRawHtmlInline :: PandocMonad m => TagParser m Inlines pRawHtmlInline = do inplain <- asks inPlain result <- pSatisfy (tagComment (const True)) <|> if inplain then pSatisfy (not . isBlockTag) else pSatisfy isInlineTag - parseRaw <- getOption readerParseRaw - if parseRaw - then return $ B.rawInline "html" $ renderTags' [result] - else return mempty + exts <- getOption readerExtensions + let raw = T.unpack $ renderTags' [result] + if extensionEnabled Ext_raw_html exts + then return $ B.rawInline "html" raw + else ignore raw mathMLToTeXMath :: String -> Either String String mathMLToTeXMath s = writeTeX <$> readMathML s -pMath :: Bool -> TagParser Inlines +toStringAttr :: [(Text, Text)] -> [(String, String)] +toStringAttr = map go + where go (x,y) = (T.unpack x, T.unpack y) + +pMath :: PandocMonad m => Bool -> TagParser m Inlines pMath inCase = try $ do - open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True) -- we'll assume math tags are MathML unless specially marked -- otherwise... + let attr = toStringAttr attr' unless inCase $ guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) - case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of + contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math")) + case mathMLToTeXMath (T.unpack $ renderTags $ + [open] <> contents <> [TagClose "math"]) of Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $ - innerText contents + T.unpack $ innerText contents Right [] -> return mempty Right x -> return $ case lookup "display" attr of Just "block" -> B.displayMath x _ -> B.math x -pInlinesInTags :: String -> (Inlines -> Inlines) - -> TagParser Inlines +pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines) + -> TagParser m Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: (Monoid a) => String -> TagParser a -> TagParser a +pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a pInTags tagtype parser = pInTags' tagtype (const True) parser -pInTags' :: (Monoid a) => String -> (Tag String -> Bool) -> TagParser a - -> TagParser a +pInTags' :: (PandocMonad m, Monoid a) + => Text + -> (Tag Text -> Bool) + -> TagParser m a + -> TagParser m a pInTags' tagtype tagtest parser = try $ do pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) mconcat <$> manyTill parser (pCloses tagtype <|> eof) -- parses p, preceeded by an optional opening tag -- and followed by an optional closing tags -pOptInTag :: String -> TagParser a -> TagParser a +pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a pOptInTag tagtype p = try $ do skipMany pBlank - optional $ pSatisfy (~== TagOpen tagtype []) + optional $ pSatisfy (matchTagOpen tagtype []) skipMany pBlank x <- p skipMany pBlank - optional $ pSatisfy (~== TagClose tagtype) + optional $ pSatisfy (matchTagClose tagtype) skipMany pBlank return x -pCloses :: String -> TagParser () +pCloses :: PandocMonad m => Text -> TagParser m () pCloses tagtype = try $ do t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag case t of - (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagClose t') | t' == tagtype -> void pAnyTag (TagOpen t' _) | t' `closes` tagtype -> return () (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "dd" -> return () (TagClose "table") | tagtype == "td" -> return () (TagClose "table") | tagtype == "tr" -> return () + (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags + -> return () -- see #3794 _ -> mzero -pTagText :: TagParser Inlines +pTagText :: PandocMonad m => TagParser m Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState qu <- ask - case flip runReader qu $ runParserT (many pTagContents) st "text" str of - Left _ -> fail $ "Could not parse `" ++ str ++ "'" + parsed <- lift $ lift $ + flip runReaderT qu $ runParserT (many pTagContents) st "text" str + case parsed of + Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'" Right result -> return $ mconcat result -pBlank :: TagParser () +pBlank :: PandocMonad m => TagParser m () pBlank = try $ do (TagText str) <- pSatisfy isTagText - guard $ all isSpace str + guard $ T.all isSpace str -type InlinesParser = HTMLParser String +type InlinesParser m = HTMLParser m Text -pTagContents :: InlinesParser Inlines +pTagContents :: PandocMonad m => InlinesParser m Inlines pTagContents = B.displayMath <$> mathDisplay <|> B.math <$> mathInline @@ -770,7 +906,7 @@ pTagContents = <|> pSymbol <|> pBad -pStr :: InlinesParser Inlines +pStr :: PandocMonad m => InlinesParser m Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) @@ -778,24 +914,24 @@ pStr = do return $ B.str result isSpecial :: Char -> Bool -isSpecial '"' = True -isSpecial '\'' = True -isSpecial '.' = True -isSpecial '-' = True -isSpecial '$' = True +isSpecial '"' = True +isSpecial '\'' = True +isSpecial '.' = True +isSpecial '-' = True +isSpecial '$' = True isSpecial '\8216' = True isSpecial '\8217' = True isSpecial '\8220' = True isSpecial '\8221' = True -isSpecial _ = False +isSpecial _ = False -pSymbol :: InlinesParser Inlines +pSymbol :: PandocMonad m => InlinesParser m Inlines pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: InlinesParser Inlines +pBad :: PandocMonad m => InlinesParser m Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -829,7 +965,7 @@ pBad = do _ -> '?' return $ B.str [c'] -pSpace :: InlinesParser Inlines +pSpace :: PandocMonad m => InlinesParser m Inlines pSpace = many1 (satisfy isSpace) >>= \xs -> if '\n' `elem` xs then return B.softbreak @@ -839,86 +975,96 @@ pSpace = many1 (satisfy isSpace) >>= \xs -> -- Constants -- -eitherBlockOrInline :: [String] -eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed", - "del", "ins", - "progress", "map", "area", "noscript", "script", - "object", "svg", "video", "source"] - -{- -inlineHtmlTags :: [[Char]] -inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", - "br", "cite", "code", "dfn", "em", "font", "i", "img", - "input", "kbd", "label", "q", "s", "samp", "select", - "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] --} - -blockHtmlTags :: [String] -blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", - "blockquote", "body", "button", "canvas", - "caption", "center", "col", "colgroup", "dd", "dir", "div", - "dl", "dt", "fieldset", "figcaption", "figure", - "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "header", "hgroup", "hr", "html", - "isindex", "menu", "noframes", "ol", "output", "p", "pre", - "section", "table", "tbody", "textarea", - "thead", "tfoot", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style"] +eitherBlockOrInline :: Set.Set Text +eitherBlockOrInline = Set.fromList + ["audio", "applet", "button", "iframe", "embed", + "del", "ins", "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] + +blockHtmlTags :: Set.Set Text +blockHtmlTags = Set.fromList + ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "canvas", + "caption", "center", "col", "colgroup", "dd", "details", + "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "main", "menu", "meta", "noframes", "ol", "output", "p", "pre", + "section", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr", "script", "style"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. -blockDocBookTags :: [String] -blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", - "orderedlist", "segmentedlist", "simplelist", - "variablelist", "caution", "important", "note", "tip", - "warning", "address", "literallayout", "programlisting", - "programlistingco", "screen", "screenco", "screenshot", - "synopsis", "example", "informalexample", "figure", - "informalfigure", "table", "informaltable", "para", - "simpara", "formalpara", "equation", "informalequation", - "figure", "screenshot", "mediaobject", "qandaset", - "procedure", "task", "cmdsynopsis", "funcsynopsis", - "classsynopsis", "blockquote", "epigraph", "msgset", - "sidebar", "title"] - -epubTags :: [String] -epubTags = ["case", "switch", "default"] - -blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags - -isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen isInlineTagName (const True) t || - tagClose isInlineTagName t || - tagComment (const True) t - where isInlineTagName x = x `notElem` blockTags - -isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen isBlockTagName (const True) t || - tagClose isBlockTagName t || - tagComment (const True) t - where isBlockTagName ('?':_) = True - isBlockTagName ('!':_) = True - isBlockTagName x = x `elem` blockTags - || x `elem` eitherBlockOrInline - -isTextTag :: Tag String -> Bool +blockDocBookTags :: Set.Set Text +blockDocBookTags = Set.fromList + ["calloutlist", "bibliolist", "glosslist", "itemizedlist", + "orderedlist", "segmentedlist", "simplelist", + "variablelist", "caution", "important", "note", "tip", + "warning", "address", "literallayout", "programlisting", + "programlistingco", "screen", "screenco", "screenshot", + "synopsis", "example", "informalexample", "figure", + "informalfigure", "table", "informaltable", "para", + "simpara", "formalpara", "equation", "informalequation", + "figure", "screenshot", "mediaobject", "qandaset", + "procedure", "task", "cmdsynopsis", "funcsynopsis", + "classsynopsis", "blockquote", "epigraph", "msgset", + "sidebar", "title"] + +epubTags :: Set.Set Text +epubTags = Set.fromList ["case", "switch", "default"] + +blockTags :: Set.Set Text +blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags] + +class NamedTag a where + getTagName :: a -> Maybe Text + +instance NamedTag (Tag Text) where + getTagName (TagOpen t _) = Just t + getTagName (TagClose t) = Just t + getTagName _ = Nothing + +instance NamedTag (Tag String) where + getTagName (TagOpen t _) = Just (T.pack t) + getTagName (TagClose t) = Just (T.pack t) + getTagName _ = Nothing + +isInlineTag :: NamedTag (Tag a) => Tag a -> Bool +isInlineTag t = isInlineTagName || isCommentTag t + where isInlineTagName = case getTagName t of + Just x -> x + `Set.notMember` blockTags + Nothing -> False + +isBlockTag :: NamedTag (Tag a) => Tag a -> Bool +isBlockTag t = isBlockTagName || isTagComment t + where isBlockTagName = + case getTagName t of + Just x + | "?" `T.isPrefixOf` x -> True + | "!" `T.isPrefixOf` x -> True + | otherwise -> x `Set.member` blockTags + || x `Set.member` eitherBlockOrInline + Nothing -> False + +isTextTag :: Tag a -> Bool isTextTag = tagText (const True) -isCommentTag :: Tag String -> Bool +isCommentTag :: Tag a -> Bool isCommentTag = tagComment (const True) -- taken from HXT and extended -- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags -closes :: String -> String -> Bool +closes :: Text -> Text -> Bool _ `closes` "body" = False _ `closes` "html" = False "body" `closes` "head" = True "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True +"td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True @@ -926,12 +1072,12 @@ _ `closes` "html" = False "optgroup" `closes` "optgroup" = True "optgroup" `closes` "option" = True "option" `closes` "option" = True --- http://www.w3.org/TR/html-markup/p.html +-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section", + "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section", "table", "ul"] = True -"meta" `closes` "meta" = True +_ `closes` "meta" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True @@ -942,17 +1088,18 @@ t `closes` "select" | t /= "option" = True "tfoot" `closes` t | t `elem` ["thead","colgroup"] = True "tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True t `closes` t2 | - t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] && - t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" + t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] && + t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main" t1 `closes` t2 | - t1 `elem` blockTags && - t2 `notElem` (blockTags ++ eitherBlockOrInline) = True + t1 `Set.member` blockTags && + t2 `Set.notMember` blockTags && + t2 `Set.notMember` eitherBlockOrInline = True _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Monad m) +htmlInBalanced :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT String st m String htmlInBalanced f = try $ do @@ -973,8 +1120,11 @@ htmlInBalanced f = try $ do let cs = ec - sc lscontents <- unlines <$> count ls anyLine cscontents <- count cs anyChar - (_,closetag) <- htmlTag (~== TagClose tn) - return (lscontents ++ cscontents ++ closetag) + closetag <- do + x <- many (satisfy (/='>')) + char '>' + return (x <> ">") + return (lscontents <> cscontents <> closetag) _ -> mzero _ -> mzero @@ -992,64 +1142,99 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts go n (t:ts') = (t :) <$> go n ts' go _ [] = mzero -hasTagWarning :: [Tag String] -> Bool +hasTagWarning :: [Tag a] -> Bool hasTagWarning (TagWarning _:_) = True -hasTagWarning _ = False +hasTagWarning _ = False -- | Matches a tag meeting a certain condition. -htmlTag :: Monad m +htmlTag :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do lookAhead (char '<') + startpos <- getPosition inp <- getInput - let (next : _) = canonicalizeTags $ parseTagsOptions - parseOptions{ optTagWarning = False } inp - guard $ f next + let ts = canonicalizeTags $ parseTagsOptions + parseOptions{ optTagWarning = False + , optTagPosition = True } + (inp ++ " ") -- add space to ensure that + -- we get a TagPosition after the tag + (next, ln, col) <- case ts of + (TagPosition{} : next : TagPosition ln col : _) + | f next -> return (next, ln, col) + _ -> mzero + + -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> + -- should NOT be parsed as an HTML tag, see #2277, + -- so we exclude . even though it's a valid character + -- in XML element names + let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_' + let isName s = case s of + [] -> False + ('?':_) -> True -- processing instruction + (c:cs) -> isLetter c && all isNameChar cs + + let endpos = if ln == 1 + then setSourceColumn startpos + (sourceColumn startpos + (col - 1)) + else setSourceColumn (setSourceLine startpos + (sourceLine startpos + (ln - 1))) + col + let endAngle = try $ + do char '>' + pos <- getPosition + guard $ pos >= endpos + let handleTag tagname = do - -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> - -- should NOT be parsed as an HTML tag, see #2277 - guard $ not ('.' `elem` tagname) + -- basic sanity check, since the parser is very forgiving + -- and finds tags in stuff like x<y) + guard $ isName tagname + guard $ not $ null tagname -- <https://example.org> should NOT be a tag either. -- tagsoup will parse it as TagOpen "https:" [("example.org","")] - guard $ not (null tagname) guard $ last tagname /= ':' - rendered <- manyTill anyChar (char '>') - return (next, rendered ++ ">") + char '<' + rendered <- manyTill anyChar endAngle + return (next, "<" ++ rendered ++ ">") case next of TagComment s | "<!--" `isPrefixOf` inp -> do - count (length s + 4) anyChar - skipMany (satisfy (/='>')) - char '>' - return (next, "<!--" ++ s ++ "-->") + string "<!--" + count (length s) anyChar + string "-->" + stripComments <- getOption readerStripComments + if stripComments + then return (next, "") + else return (next, "<!--" <> s <> "-->") | otherwise -> fail "bogus comment mode, HTML5 parse error" - TagOpen tagname _attr -> handleTag tagname - TagClose tagname -> handleTag tagname + TagOpen tagname attr -> do + guard $ all (isName . fst) attr + handleTag tagname + TagClose tagname -> + handleTag tagname _ -> mzero mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes + attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr -- Strip namespace prefixes -stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes :: [Tag Text] -> [Tag Text] stripPrefixes = map stripPrefix -stripPrefix :: Tag String -> Tag String +stripPrefix :: Tag Text -> Tag Text stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) + TagOpen (stripPrefix' s) (map (first stripPrefix') as) stripPrefix (TagClose s) = TagClose (stripPrefix' s) stripPrefix x = x -stripPrefix' :: String -> String +stripPrefix' :: Text -> Text stripPrefix' s = - case span (/= ':') s of - (_, "") -> s - (_, (_:ts)) -> ts + if T.null t then s else T.drop 1 t + where (_, t) = T.span (/= ':') s isSpace :: Char -> Bool isSpace ' ' = True @@ -1068,9 +1253,13 @@ instance HasHeaderMap HTMLState where extractHeaderMap = headerMap updateHeaderMap f s = s{ headerMap = f (headerMap s) } +instance HasLogMessages HTMLState where + addLogMessage m s = s{ logMessages = m : logMessages s } + getLogMessages = reverse . logMessages + -- This signature should be more general -- MonadReader HTMLLocal m => HasQuoteContext st m -instance HasQuoteContext st (Reader HTMLLocal) where +instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where getQuoteContext = asks quoteContext withQuoteContext q = local (\s -> s{quoteContext = q}) @@ -1088,19 +1277,32 @@ instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} getLastStrPos = getLastStrPos . parserState +-- For now we need a special verison here; the one in Shared has String type +renderTags' :: [Tag Text] -> Text +renderTags' = renderTagsOptions + renderOptions{ optMinimize = matchTags ["hr", "br", "img", + "meta", "link"] + , optRawTag = matchTags ["script", "style"] } + where matchTags tags = flip elem tags . T.toLower + -- EPUB Specific -- -- -sectioningContent :: [String] +sectioningContent :: [Text] sectioningContent = ["article", "aside", "nav", "section"] -groupingContent :: [String] +groupingContent :: [Text] groupingContent = ["p", "hr", "pre", "blockquote", "ol" , "ul", "li", "dl", "dt", "dt", "dd" , "figure", "figcaption", "div", "main"] +matchTagClose :: Text -> (Tag Text -> Bool) +matchTagClose t = (~== TagClose t) + +matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) +matchTagOpen t as = (~== TagOpen t as) {- @@ -1108,7 +1310,7 @@ types :: [(String, ([String], Int))] types = -- Document divisions map (\s -> (s, (["section", "body"], 0))) ["volume", "part", "chapter", "division"] - ++ -- Document section and components + <> -- Document section and components [ ("abstract", ([], 0))] -} diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index 12953bb72..e98c79ed8 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -14,33 +14,40 @@ module Text.Pandoc.Readers.Haddock ( readHaddock ) where -import Text.Pandoc.Builder (Blocks, Inlines) -import qualified Text.Pandoc.Builder as B -import Data.Monoid ((<>)) -import Text.Pandoc.Shared (trim, splitBy) +import Control.Monad.Except (throwError) import Data.List (intersperse, stripPrefix) import Data.Maybe (fromMaybe) -import Text.Pandoc.Definition -import Text.Pandoc.Options +import Data.Monoid ((<>)) +import Data.Text (Text, unpack) import Documentation.Haddock.Parser import Documentation.Haddock.Types -import Debug.Trace (trace) - +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition import Text.Pandoc.Error +import Text.Pandoc.Options +import Text.Pandoc.Shared (crFilter, splitBy, trim) + -- | Parse Haddock markup and return a 'Pandoc' document. -readHaddock :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse - -> Either PandocError Pandoc -readHaddock opts = +readHaddock :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readHaddock opts s = case readHaddockEither opts (unpack (crFilter s)) of + Right result -> return result + Left e -> throwError e + +readHaddockEither :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse + -> Either PandocError Pandoc +readHaddockEither _opts = #if MIN_VERSION_haddock_library(1,2,0) - Right . B.doc . docHToBlocks . trace' . _doc . parseParas + Right . B.doc . docHToBlocks . _doc . parseParas #else - Right . B.doc . docHToBlocks . trace' . parseParas + Right . B.doc . docHToBlocks . parseParas #endif - where trace' x = if readerTrace opts - then trace (show x) x - else x docHToBlocks :: DocH String Identifier -> Blocks docHToBlocks d' = @@ -90,7 +97,7 @@ docHToBlocks d' = isPlain (Plain _) = True isPlain _ = False extractContents (Plain xs) = xs - extractContents _ = [] + extractContents _ = [] docHToInlines :: Bool -> DocH String Identifier -> Inlines docHToInlines isCode d' = @@ -135,7 +142,7 @@ makeExample prompt expression result = <> B.space <> B.codeWith ([], ["haskell","expr"], []) (trim expression) <> B.linebreak - <> (mconcat $ intersperse B.linebreak $ map coder result') + <> mconcat (intersperse B.linebreak $ map coder result') where -- 1. drop trailing whitespace from the prompt, remember the prefix prefix = takeWhile (`elem` " \t") prompt diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs new file mode 100644 index 000000000..8158a4511 --- /dev/null +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -0,0 +1,496 @@ +{-# LANGUAGE ExplicitForAll, TupleSections #-} +module Text.Pandoc.Readers.JATS ( readJATS ) where +import Control.Monad.State.Strict +import Data.Char (isDigit, isSpace, toUpper) +import Data.Default +import Data.Generics +import Data.List (intersperse) +import qualified Data.Map as Map +import Data.Maybe (maybeToList, fromMaybe) +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Builder +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Options +import Text.Pandoc.Shared (underlineSpan, crFilter, safeRead) +import Text.TeXMath (readMathML, writeTeX) +import Text.XML.Light +import qualified Data.Set as S (fromList, member) +import Data.Set ((\\)) + +type JATS m = StateT JATSState m + +data JATSState = JATSState{ jatsSectionLevel :: Int + , jatsQuoteType :: QuoteType + , jatsMeta :: Meta + , jatsBook :: Bool + , jatsFigureTitle :: Inlines + , jatsContent :: [Content] + } deriving Show + +instance Default JATSState where + def = JATSState{ jatsSectionLevel = 0 + , jatsQuoteType = DoubleQuote + , jatsMeta = mempty + , jatsBook = False + , jatsFigureTitle = mempty + , jatsContent = [] } + + +readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readJATS _ inp = do + let tree = normalizeTree . parseXML + $ T.unpack $ crFilter inp + (bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree + return $ Pandoc (jatsMeta st') (toList . mconcat $ bs) + +-- normalize input, consolidating adjacent Text and CRef elements +normalizeTree :: [Content] -> [Content] +normalizeTree = everywhere (mkT go) + where go :: [Content] -> [Content] + go (Text (CData CDataRaw _ _):xs) = xs + go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = + Text (CData CDataText (s1 ++ s2) z):xs + go (Text (CData CDataText s1 z):CRef r:xs) = + Text (CData CDataText (s1 ++ convertEntity r) z):xs + go (CRef r:Text (CData CDataText s1 z):xs) = + Text (CData CDataText (convertEntity r ++ s1) z):xs + go (CRef r1:CRef r2:xs) = + Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + go xs = xs + +convertEntity :: String -> String +convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) + +-- convenience function to get an attribute value, defaulting to "" +attrValue :: String -> Element -> String +attrValue attr = + fromMaybe "" . maybeAttrValue attr + +maybeAttrValue :: String -> Element -> Maybe String +maybeAttrValue attr elt = + lookupAttrBy (\x -> qName x == attr) (elAttribs elt) + +-- convenience function +named :: String -> Element -> Bool +named s e = qName (elName e) == s + +-- + +addMeta :: PandocMonad m => ToMetaValue a => String -> a -> JATS m () +addMeta field val = modify (setMeta field val) + +instance HasMeta JATSState where + setMeta field v s = s {jatsMeta = setMeta field v (jatsMeta s)} + deleteMeta field s = s {jatsMeta = deleteMeta field (jatsMeta s)} + +isBlockElement :: Content -> Bool +isBlockElement (Elem e) = qName (elName e) `S.member` blocktags + where blocktags = S.fromList (paragraphLevel ++ lists ++ mathML ++ other) \\ S.fromList inlinetags + paragraphLevel = ["address", "array", "boxed-text", "chem-struct-wrap", + "code", "fig", "fig-group", "graphic", "media", "preformat", + "supplementary-material", "table-wrap", "table-wrap-group", + "alternatives", "disp-formula", "disp-formula-group"] + lists = ["def-list", "list"] + mathML = ["tex-math", "mml:math"] + other = ["p", "related-article", "related-object", "ack", "disp-quote", + "speech", "statement", "verse-group", "x"] + inlinetags = ["email", "ext-link", "uri", "inline-supplementary-material", + "related-article", "related-object", "hr", "bold", "fixed-case", + "italic", "monospace", "overline", "overline-start", "overline-end", + "roman", "sans-serif", "sc", "strike", "underline", "underline-start", + "underline-end", "ruby", "alternatives", "inline-graphic", "private-char", + "chem-struct", "inline-formula", "tex-math", "mml:math", "abbrev", + "milestone-end", "milestone-start", "named-content", "styled-content", + "fn", "target", "xref", "sub", "sup", "x", "address", "array", + "boxed-text", "chem-struct-wrap", "code", "fig", "fig-group", "graphic", + "media", "preformat", "supplementary-material", "table-wrap", + "table-wrap-group", "disp-formula", "disp-formula-group", + "citation-alternatives", "element-citation", "mixed-citation", + "nlm-citation", "award-id", "funding-source", "open-access", + "def-list", "list", "ack", "disp-quote", "speech", "statement", + "verse-group"] +isBlockElement _ = False + +-- Trim leading and trailing newline characters +trimNl :: String -> String +trimNl = reverse . go . reverse . go + where go ('\n':xs) = xs + go xs = xs + +-- function that is used by both graphic (in parseBlock) +-- and inline-graphic (in parseInline) +getGraphic :: PandocMonad m => Element -> JATS m Inlines +getGraphic e = do + let atVal a = attrValue a e + attr = (atVal "id", words $ atVal "role", []) + imageUrl = atVal "href" + captionOrLabel = case filterChild (\x -> named "caption" x + || named "label" x) e of + Nothing -> return mempty + Just z -> mconcat <$> + mapM parseInline (elContent z) + figTitle <- gets jatsFigureTitle + let (caption, title) = if isNull figTitle + then (captionOrLabel, atVal "title") + else (return figTitle, "fig:") + fmap (imageWith attr imageUrl title) caption + +getBlocks :: PandocMonad m => Element -> JATS m Blocks +getBlocks e = mconcat <$> + mapM parseBlock (elContent e) + + +parseBlock :: PandocMonad m => Content -> JATS m Blocks +parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE +parseBlock (Text (CData _ s _)) = if all isSpace s + then return mempty + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ map toUpper x +parseBlock (Elem e) = + case qName (elName e) of + "p" -> parseMixed para (elContent e) + "code" -> codeBlockWithLang + "preformat" -> codeBlockWithLang + "disp-quote" -> parseBlockquote + "list" -> case attrValue "list-type" e of + "bullet" -> bulletList <$> listitems + listType -> do + let start = fromMaybe 1 $ + (strContent <$> (filterElement (named "list-item") e + >>= filterElement (named "lable"))) + >>= safeRead + orderedListWith (start, parseListStyleType listType, DefaultDelim) + <$> listitems + "def-list" -> definitionList <$> deflistitems + "sec" -> gets jatsSectionLevel >>= sect . (+1) + "graphic" -> para <$> getGraphic e + "journal-meta" -> parseMetadata e + "article-meta" -> parseMetadata e + "custom-meta" -> parseMetadata e + "title" -> return mempty -- processed by header + "table" -> parseTable + "fig" -> divWith (attrValue "id" e, ["fig"], []) <$> getBlocks e + "table-wrap" -> divWith (attrValue "id" e, ["table-wrap"], []) <$> getBlocks e + "caption" -> divWith (attrValue "id" e, ["caption"], []) <$> sect 6 + "ref-list" -> parseRefList e + "?xml" -> return mempty + _ -> getBlocks e + where parseMixed container conts = do + let (ils,rest) = break isBlockElement conts + ils' <- (trimInlines . mconcat) <$> mapM parseInline ils + let p = if ils' == mempty then mempty else container ils' + case rest of + [] -> return p + (r:rs) -> do + b <- parseBlock r + x <- parseMixed container rs + return $ p <> b <> x + codeBlockWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + x -> [x] + return $ codeBlockWith (attrValue "id" e, classes', []) + $ trimNl $ strContentRecursive e + parseBlockquote = do + attrib <- case filterChild (named "attribution") e of + Nothing -> return mempty + Just z -> (para . (str "— " <>) . mconcat) + <$> + mapM parseInline (elContent z) + contents <- getBlocks e + return $ blockQuote (contents <> attrib) + parseListStyleType "roman-lower" = LowerRoman + parseListStyleType "roman-upper" = UpperRoman + parseListStyleType "alpha-lower" = LowerAlpha + parseListStyleType "alpha-upper" = UpperAlpha + parseListStyleType _ = DefaultStyle + listitems = mapM getBlocks $ filterChildren (named "list-item") e + deflistitems = mapM parseVarListEntry $ filterChildren + (named "def-item") e + parseVarListEntry e' = do + let terms = filterChildren (named "term") e' + let items = filterChildren (named "def") e' + terms' <- mapM getInlines terms + items' <- mapM getBlocks items + return (mconcat $ intersperse (str "; ") terms', items') + parseTable = do + let isCaption x = named "title" x || named "caption" x + caption <- case filterChild isCaption e of + Just t -> getInlines t + Nothing -> return mempty + let e' = fromMaybe e $ filterChild (named "tgroup") e + let isColspec x = named "colspec" x || named "col" x + let colspecs = case filterChild (named "colgroup") e' of + Just c -> filterChildren isColspec c + _ -> filterChildren isColspec e' + let isRow x = named "row" x || named "tr" x + headrows <- case filterChild (named "thead") e' of + Just h -> case filterChild isRow h of + Just x -> parseRow x + Nothing -> return [] + Nothing -> return [] + bodyrows <- case filterChild (named "tbody") e' of + Just b -> mapM parseRow + $ filterChildren isRow b + Nothing -> mapM parseRow + $ filterChildren isRow e' + let toAlignment c = case findAttr (unqual "align") c of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault + let toWidth c = case findAttr (unqual "colwidth") c of + Just w -> fromMaybe 0 + $ safeRead $ '0': filter (\x -> + isDigit x || x == '.') w + Nothing -> 0 :: Double + let numrows = case bodyrows of + [] -> 0 + xs -> maximum $ map length xs + let aligns = case colspecs of + [] -> replicate numrows AlignDefault + cs -> map toAlignment cs + let widths = case colspecs of + [] -> replicate numrows 0 + cs -> let ws = map toWidth cs + tot = sum ws + in if all (> 0) ws + then map (/ tot) ws + else replicate numrows 0 + let headrows' = if null headrows + then replicate numrows mempty + else headrows + return $ table caption (zip aligns widths) + headrows' bodyrows + isEntry x = named "entry" x || named "td" x || named "th" x + parseRow = mapM (parseMixed plain . elContent) . filterChildren isEntry + sect n = do isbook <- gets jatsBook + let n' = if isbook || n == 0 then n + 1 else n + headerText <- case filterChild (named "title") e `mplus` + (filterChild (named "info") e >>= + filterChild (named "title")) of + Just t -> getInlines t + Nothing -> return mempty + oldN <- gets jatsSectionLevel + modify $ \st -> st{ jatsSectionLevel = n } + b <- getBlocks e + let ident = attrValue "id" e + modify $ \st -> st{ jatsSectionLevel = oldN } + return $ headerWith (ident,[],[]) n' headerText <> b + +getInlines :: PandocMonad m => Element -> JATS m Inlines +getInlines e' = (trimInlines . mconcat) <$> + mapM parseInline (elContent e') + +parseMetadata :: PandocMonad m => Element -> JATS m Blocks +parseMetadata e = do + getTitle e + getAuthors e + getAffiliations e + return mempty + +getTitle :: PandocMonad m => Element -> JATS m () +getTitle e = do + tit <- case filterElement (named "article-title") e of + Just s -> getInlines s + Nothing -> return mempty + subtit <- case filterElement (named "subtitle") e of + Just s -> (text ": " <>) <$> + getInlines s + Nothing -> return mempty + when (tit /= mempty) $ addMeta "title" tit + when (subtit /= mempty) $ addMeta "subtitle" subtit + +getAuthors :: PandocMonad m => Element -> JATS m () +getAuthors e = do + authors <- mapM getContrib $ filterElements + (\x -> named "contrib" x && + attrValue "contrib-type" x == "author") e + authorNotes <- mapM getInlines $ filterElements (named "author-notes") e + let authors' = case (reverse authors, authorNotes) of + ([], _) -> [] + (_, []) -> authors + (a:as, ns) -> reverse as ++ [a <> mconcat ns] + unless (null authors) $ addMeta "author" authors' + +getAffiliations :: PandocMonad m => Element -> JATS m () +getAffiliations x = do + affs <- mapM getInlines $ filterChildren (named "aff") x + unless (null affs) $ addMeta "institute" affs + +getContrib :: PandocMonad m => Element -> JATS m Inlines +getContrib x = do + given <- maybe (return mempty) getInlines + $ filterElement (named "given-names") x + family <- maybe (return mempty) getInlines + $ filterElement (named "surname") x + if given == mempty && family == mempty + then return mempty + else if given == mempty || family == mempty + then return $ given <> family + else return $ given <> space <> family + +parseRefList :: PandocMonad m => Element -> JATS m Blocks +parseRefList e = do + refs <- mapM parseRef $ filterChildren (named "ref") e + addMeta "references" refs + return mempty + +parseRef :: PandocMonad m + => Element -> JATS m (Map.Map String MetaValue) +parseRef e = do + let refId = text $ attrValue "id" e + let getInlineText n = maybe (return mempty) getInlines . filterChild (named n) + case filterChild (named "element-citation") e of + Just c -> do + let refType = text $ + case attrValue "publication-type" c of + "journal" -> "article-journal" + x -> x + (refTitle, refContainerTitle) <- do + t <- getInlineText "article-title" c + ct <- getInlineText "source" c + if t == mempty + then return (ct, mempty) + else return (t, ct) + refLabel <- getInlineText "label" c + refYear <- getInlineText "year" c + refVolume <- getInlineText "volume" c + refFirstPage <- getInlineText "fpage" c + refLastPage <- getInlineText "lpage" c + refPublisher <- getInlineText "publisher-name" c + refPublisherPlace <- getInlineText "publisher-loc" c + let refPages = refFirstPage <> (if refLastPage == mempty + then mempty + else text "\x2013" <> refLastPage) + let personGroups' = filterChildren (named "person-group") c + let getName nm = do + given <- maybe (return mempty) getInlines + $ filterChild (named "given-names") nm + family <- maybe (return mempty) getInlines + $ filterChild (named "surname") nm + return $ toMetaValue $ Map.fromList [ + ("given", given) + , ("family", family) + ] + personGroups <- mapM (\pg -> + do names <- mapM getName + (filterChildren (named "name") pg) + return (attrValue "person-group-type" pg, + toMetaValue names)) + personGroups' + return $ Map.fromList $ + [ ("id", toMetaValue refId) + , ("type", toMetaValue refType) + , ("title", toMetaValue refTitle) + , ("container-title", toMetaValue refContainerTitle) + , ("publisher", toMetaValue refPublisher) + , ("publisher-place", toMetaValue refPublisherPlace) + , ("title", toMetaValue refTitle) + , ("issued", toMetaValue + $ Map.fromList [ + ("year", refYear) + ]) + , ("volume", toMetaValue refVolume) + , ("page", toMetaValue refPages) + , ("citation-label", toMetaValue refLabel) + ] ++ personGroups + Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty + -- TODO handle mixed-citation + +strContentRecursive :: Element -> String +strContentRecursive = strContent . + (\e' -> e'{ elContent = map elementToStr $ elContent e' }) + +elementToStr :: Content -> Content +elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing +elementToStr x = x + +parseInline :: PandocMonad m => Content -> JATS m Inlines +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = + return $ maybe (text $ map toUpper ref) text $ lookupEntity ref +parseInline (Elem e) = + case qName (elName e) of + "italic" -> emph <$> innerInlines + "bold" -> strong <$> innerInlines + "strike" -> strikeout <$> innerInlines + "sub" -> subscript <$> innerInlines + "sup" -> superscript <$> innerInlines + "underline" -> underlineSpan <$> innerInlines + "break" -> return linebreak + "sc" -> smallcaps <$> innerInlines + + "code" -> codeWithLang + "monospace" -> codeWithLang + + "inline-graphic" -> getGraphic e + "disp-quote" -> do + qt <- gets jatsQuoteType + let qt' = if qt == SingleQuote then DoubleQuote else SingleQuote + modify $ \st -> st{ jatsQuoteType = qt' } + contents <- innerInlines + modify $ \st -> st{ jatsQuoteType = qt } + return $ if qt == SingleQuote + then singleQuoted contents + else doubleQuoted contents + + "xref" -> do + ils <- innerInlines + let rid = attrValue "rid" e + let refType = ("ref-type",) <$> maybeAttrValue "ref-type" e + let attr = (attrValue "id" e, [], maybeToList refType) + return $ if refType == Just ("ref-type","bibr") + then cite [Citation{ + citationId = rid + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationNoteNum = 0 + , citationHash = 0}] ils + else linkWith attr ('#' : rid) "" ils + "ext-link" -> do + ils <- innerInlines + let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e + let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + Just h -> h + _ -> '#' : attrValue "rid" e + let ils' = if ils == mempty then str href else ils + let attr = (attrValue "id" e, [], []) + return $ linkWith attr href title ils' + + "disp-formula" -> formula displayMath + "inline-formula" -> formula math + "math" | qPrefix (elName e) == Just "mml" -> return . math $ mathML e + "tex-math" -> return . math $ strContent e + + "email" -> return $ link ("mailto:" ++ strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e + "fn" -> (note . mconcat) <$> + mapM parseBlock (elContent e) + _ -> innerInlines + where innerInlines = (trimInlines . mconcat) <$> + mapM parseInline (elContent e) + mathML x = + case readMathML . showElement $ everywhere (mkT removePrefix) x of + Left _ -> mempty + Right m -> writeTeX m + formula constructor = do + let whereToLook = fromMaybe e $ filterElement (named "alternatives") e + texMaths = map strContent $ + filterChildren (named "tex-math") whereToLook + mathMLs = map mathML $ + filterChildren isMathML whereToLook + return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs + + isMathML x = qName (elName x) == "math" && + qPrefix (elName x) == Just "mml" + removePrefix elname = elname { qPrefix = Nothing } + codeWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + l -> [l] + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index edcf35e51..57d2803ba 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,6 +1,10 @@ -{-# LANGUAGE ScopedTypeVariables, OverloadedStrings #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE ScopedTypeVariables #-} {- -Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +23,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-2015 John MacFarlane + Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,218 +31,596 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Portability : portable Conversion of LaTeX to 'Pandoc' document. + -} module Text.Pandoc.Readers.LaTeX ( readLaTeX, + applyMacros, rawLaTeXInline, rawLaTeXBlock, inlineCommand, - handleIncludes + tokenize, + untokenize ) where -import Text.Pandoc.Definition -import Text.Pandoc.Walk -import Text.Pandoc.Shared -import Text.Pandoc.Options -import Text.Pandoc.Parsing hiding ((<|>), many, optional, space, - mathDisplay, mathInline) -import qualified Text.Pandoc.UTF8 as UTF8 -import Data.Char ( chr, ord, isLetter, isAlphaNum ) -import Control.Monad.Trans (lift) +import Control.Applicative (many, optional, (<|>)) import Control.Monad -import Text.Pandoc.Builder -import Control.Applicative ((<|>), many, optional) -import Data.Maybe (fromMaybe, maybeToList) -import System.Environment (getEnv) -import System.FilePath (replaceExtension, (</>), takeExtension, addExtension) -import Data.List (intercalate) +import Control.Monad.Except (throwError) +import Control.Monad.Trans (lift) +import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord, toLower) +import Data.Default +import Data.List (intercalate, isPrefixOf) import qualified Data.Map as M -import qualified Control.Exception as E -import Text.Pandoc.Highlighting (fromListingsLanguage) +import Data.Maybe (fromMaybe, maybeToList) +import qualified Data.Set as Set +import Data.Text (Text) +import qualified Data.Text as T +import Safe (minimumDef) +import System.FilePath (addExtension, replaceExtension, takeExtension) +import Text.Pandoc.BCP47 (Lang (..), renderLang) +import Text.Pandoc.Builder +import Text.Pandoc.Class (PandocMonad, PandocPure, getResourcePath, lookupEnv, + readFileFromDirs, report, setResourcePath, + setTranslations, translateTerm, trace) +import Text.Pandoc.Error (PandocError (PandocMacroLoop, PandocParseError, PandocParsecError)) +import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension) import Text.Pandoc.ImageSize (numUnit, showFl) -import Text.Pandoc.Error +import Text.Pandoc.Logging +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) +import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..), + Tok (..), TokType (..)) +import Text.Pandoc.Shared +import qualified Text.Pandoc.Translations as Translations +import Text.Pandoc.Walk +import Text.Parsec.Pos +import qualified Text.Pandoc.Builder as B --- | Parse LaTeX from string and return 'Pandoc' document. -readLaTeX :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assumes @'\n'@ line endings) - -> Either PandocError Pandoc -readLaTeX opts = readWith parseLaTeX def{ stateOptions = opts } +-- for debugging: +-- import Text.Pandoc.Extensions (getDefaultExtensions) +-- import Text.Pandoc.Class (runIOorExplode, PandocIO) +-- import Debug.Trace (traceShowId) -parseLaTeX :: LP Pandoc +-- | Parse LaTeX from string and return 'Pandoc' document. +readLaTeX :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> m Pandoc +readLaTeX opts ltx = do + parsed <- runParserT parseLaTeX def{ sOptions = opts } "source" + (tokenize "source" (crFilter ltx)) + case parsed of + Right result -> return result + Left e -> throwError $ PandocParsecError (T.unpack ltx) e + +parseLaTeX :: PandocMonad m => LP m Pandoc parseLaTeX = do bs <- blocks eof st <- getState - let meta = stateMeta st - let (Pandoc _ bs') = doc bs + let meta = sMeta st + let doc' = doc bs + let headerLevel (Header n _ _) = [n] + headerLevel _ = [] + let bottomLevel = minimumDef 1 $ query headerLevel doc' + let adjustHeaders m (Header n attr ils) = Header (n+m) attr ils + adjustHeaders _ x = x + let (Pandoc _ bs') = + -- handle the case where you have \part or \chapter + (if bottomLevel < 1 + then walk (adjustHeaders (1 - bottomLevel)) + else id) $ + walk (resolveRefs (sLabels st)) doc' return $ Pandoc meta bs' -type LP = Parser String ParserState - -anyControlSeq :: LP String -anyControlSeq = do - char '\\' - next <- option '\n' anyChar - case next of - '\n' -> return "" - c | isLetter c -> (c:) <$> (many letter <* optional sp) - | otherwise -> return [c] - -controlSeq :: String -> LP String -controlSeq name = try $ do - char '\\' - case name of - "" -> mzero - [c] | not (isLetter c) -> string [c] - cs -> string cs <* notFollowedBy letter <* optional sp - return name - -dimenarg :: LP String -dimenarg = try $ do - ch <- option "" $ string "=" - num <- many1 digit - dim <- oneOfStrings ["pt","pc","in","bp","cm","mm","dd","cc","sp"] - return $ ch ++ num ++ dim - -sp :: LP () -sp = whitespace <|> endline - -whitespace :: LP () -whitespace = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') - -endline :: LP () -endline = try (newline >> lookAhead anyChar >> notFollowedBy blankline) +resolveRefs :: M.Map String [Inline] -> Inline -> Inline +resolveRefs labels x@(Link (ident,classes,kvs) _ _) = + case (lookup "reference-type" kvs, + lookup "reference" kvs) of + (Just "ref", Just lab) -> + case M.lookup lab labels of + Just txt -> Link (ident,classes,kvs) txt ('#':lab, "") + Nothing -> x + _ -> x +resolveRefs _ x = x + + +-- testParser :: LP PandocIO a -> Text -> IO a +-- testParser p t = do +-- res <- runIOorExplode (runParserT p defaultLaTeXState{ +-- sOptions = def{ readerExtensions = +-- enableExtension Ext_raw_tex $ +-- getDefaultExtensions "latex" }} "source" (tokenize "source" t)) +-- case res of +-- Left e -> error (show e) +-- Right r -> return r + +newtype HeaderNum = HeaderNum [Int] + deriving (Show) + +renderHeaderNum :: HeaderNum -> String +renderHeaderNum (HeaderNum xs) = + intercalate "." (map show xs) + +incrementHeaderNum :: Int -> HeaderNum -> HeaderNum +incrementHeaderNum level (HeaderNum ns) = HeaderNum $ + case reverse (take level (ns ++ repeat 0)) of + (x:xs) -> reverse (x+1 : xs) + [] -> [] -- shouldn't happen + +data LaTeXState = LaTeXState{ sOptions :: ReaderOptions + , sMeta :: Meta + , sQuoteContext :: QuoteContext + , sMacros :: M.Map Text Macro + , sContainers :: [String] + , sHeaders :: M.Map Inlines String + , sLogMessages :: [LogMessage] + , sIdentifiers :: Set.Set String + , sVerbatimMode :: Bool + , sCaption :: Maybe Inlines + , sInListItem :: Bool + , sInTableCell :: Bool + , sLastHeaderNum :: HeaderNum + , sLabels :: M.Map String [Inline] + , sToggles :: M.Map String Bool + } + deriving Show + +defaultLaTeXState :: LaTeXState +defaultLaTeXState = LaTeXState{ sOptions = def + , sMeta = nullMeta + , sQuoteContext = NoQuote + , sMacros = M.empty + , sContainers = [] + , sHeaders = M.empty + , sLogMessages = [] + , sIdentifiers = Set.empty + , sVerbatimMode = False + , sCaption = Nothing + , sInListItem = False + , sInTableCell = False + , sLastHeaderNum = HeaderNum [] + , sLabels = M.empty + , sToggles = M.empty + } + +instance PandocMonad m => HasQuoteContext LaTeXState m where + getQuoteContext = sQuoteContext <$> getState + withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = sQuoteContext oldState + setState oldState { sQuoteContext = context } + result <- parser + newState <- getState + setState newState { sQuoteContext = oldQuoteContext } + return result + +instance HasLogMessages LaTeXState where + addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st } + getLogMessages st = reverse $ sLogMessages st + +instance HasIdentifierList LaTeXState where + extractIdentifierList = sIdentifiers + updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st } + +instance HasIncludeFiles LaTeXState where + getIncludeFiles = sContainers + addIncludeFile f s = s{ sContainers = f : sContainers s } + dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s } + +instance HasHeaderMap LaTeXState where + extractHeaderMap = sHeaders + updateHeaderMap f st = st{ sHeaders = f $ sHeaders st } + +instance HasMacros LaTeXState where + extractMacros st = sMacros st + updateMacros f st = st{ sMacros = f (sMacros st) } + +instance HasReaderOptions LaTeXState where + extractReaderOptions = sOptions + +instance HasMeta LaTeXState where + setMeta field val st = + st{ sMeta = setMeta field val $ sMeta st } + deleteMeta field st = + st{ sMeta = deleteMeta field $ sMeta st } + +instance Default LaTeXState where + def = defaultLaTeXState + +type LP m = ParserT [Tok] LaTeXState m + +withVerbatimMode :: PandocMonad m => LP m a -> LP m a +withVerbatimMode parser = do + updateState $ \st -> st{ sVerbatimMode = True } + result <- parser + updateState $ \st -> st{ sVerbatimMode = False } + return result + +rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => LP m a -> ParserT String s m (a, String) +rawLaTeXParser parser = do + inp <- getInput + let toks = tokenize "source" $ T.pack inp + pstate <- getState + let lstate = def{ sOptions = extractReaderOptions pstate + , sMacros = extractMacros pstate } + let rawparser = (,) <$> withRaw parser <*> getState + res <- lift $ runParserT rawparser lstate "chunk" toks + case res of + Left _ -> mzero + Right ((val, raw), st) -> do + updateState (updateMacros (sMacros st <>)) + rawstring <- takeP (T.length (untokenize raw)) + return (val, rawstring) + +applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => String -> ParserT String s m String +applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> + do let retokenize = doMacros 0 *> + (toksToString <$> many (satisfyTok (const True))) + pstate <- getState + let lstate = def{ sOptions = extractReaderOptions pstate + , sMacros = extractMacros pstate } + res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s)) + case res of + Left e -> fail (show e) + Right s' -> return s' + +rawLaTeXBlock :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => ParserT String s m String +rawLaTeXBlock = do + lookAhead (try (char '\\' >> letter)) + -- we don't want to apply newly defined latex macros to their own + -- definitions: + snd <$> rawLaTeXParser macroDef + <|> ((snd <$> rawLaTeXParser (environment <|> blockCommand)) >>= applyMacros) + +rawLaTeXInline :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => ParserT String s m String +rawLaTeXInline = do + lookAhead (try (char '\\' >> letter)) + rawLaTeXParser (inlineEnvironment <|> inlineCommand') >>= applyMacros . snd + +inlineCommand :: PandocMonad m => ParserT String ParserState m Inlines +inlineCommand = do + lookAhead (try (char '\\' >> letter)) + fst <$> rawLaTeXParser (inlineEnvironment <|> inlineCommand') + +tokenize :: SourceName -> Text -> [Tok] +tokenize sourcename = totoks (initialPos sourcename) + +totoks :: SourcePos -> Text -> [Tok] +totoks pos t = + case T.uncons t of + Nothing -> [] + Just (c, rest) + | c == '\n' -> + Tok pos Newline "\n" + : totoks (setSourceColumn (incSourceLine pos 1) 1) rest + | isSpaceOrTab c -> + let (sps, rest') = T.span isSpaceOrTab t + in Tok pos Spaces sps + : totoks (incSourceColumn pos (T.length sps)) + rest' + | isAlphaNum c -> + let (ws, rest') = T.span isAlphaNum t + in Tok pos Word ws + : totoks (incSourceColumn pos (T.length ws)) rest' + | c == '%' -> + let (cs, rest') = T.break (== '\n') rest + in Tok pos Comment ("%" <> cs) + : totoks (incSourceColumn pos (1 + T.length cs)) rest' + | c == '\\' -> + case T.uncons rest of + Nothing -> [Tok pos (CtrlSeq " ") "\\"] + Just (d, rest') + | isLetterOrAt d -> + -- \makeatletter is common in macro defs; + -- ideally we should make tokenization sensitive + -- to \makeatletter and \makeatother, but this is + -- probably best for now + let (ws, rest'') = T.span isLetterOrAt rest + (ss, rest''') = T.span isSpaceOrTab rest'' + in Tok pos (CtrlSeq ws) ("\\" <> ws <> ss) + : totoks (incSourceColumn pos + (1 + T.length ws + T.length ss)) rest''' + | isSpaceOrTab d || d == '\n' -> + let (w1, r1) = T.span isSpaceOrTab rest + (w2, (w3, r3)) = case T.uncons r1 of + Just ('\n', r2) + -> (T.pack "\n", + T.span isSpaceOrTab r2) + _ -> (mempty, (mempty, r1)) + ws = "\\" <> w1 <> w2 <> w3 + in case T.uncons r3 of + Just ('\n', _) -> + Tok pos (CtrlSeq " ") ("\\" <> w1) + : totoks (incSourceColumn pos (T.length ws)) + r1 + _ -> + Tok pos (CtrlSeq " ") ws + : totoks (incSourceColumn pos (T.length ws)) + r3 + | otherwise -> + Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d]) + : totoks (incSourceColumn pos 2) rest' + | c == '#' -> + let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest + in case safeRead (T.unpack t1) of + Just i -> + Tok pos (Arg i) ("#" <> t1) + : totoks (incSourceColumn pos (1 + T.length t1)) t2 + Nothing -> + Tok pos Symbol "#" + : totoks (incSourceColumn pos 1) t2 + | c == '^' -> + case T.uncons rest of + Just ('^', rest') -> + case T.uncons rest' of + Just (d, rest'') + | isLowerHex d -> + case T.uncons rest'' of + Just (e, rest''') | isLowerHex e -> + Tok pos Esc2 (T.pack ['^','^',d,e]) + : totoks (incSourceColumn pos 4) rest''' + _ -> + Tok pos Esc1 (T.pack ['^','^',d]) + : totoks (incSourceColumn pos 3) rest'' + | d < '\128' -> + Tok pos Esc1 (T.pack ['^','^',d]) + : totoks (incSourceColumn pos 3) rest'' + _ -> Tok pos Symbol "^" : + Tok (incSourceColumn pos 1) Symbol "^" : + totoks (incSourceColumn pos 2) rest' + _ -> Tok pos Symbol "^" + : totoks (incSourceColumn pos 1) rest + | otherwise -> + Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest + +isSpaceOrTab :: Char -> Bool +isSpaceOrTab ' ' = True +isSpaceOrTab '\t' = True +isSpaceOrTab _ = False + +isLetterOrAt :: Char -> Bool +isLetterOrAt '@' = True +isLetterOrAt c = isLetter c isLowerHex :: Char -> Bool isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' -tildeEscape :: LP Char -tildeEscape = try $ do - string "^^" - c <- satisfy (\x -> x >= '\0' && x <= '\128') - d <- if isLowerHex c - then option "" $ count 1 (satisfy isLowerHex) - else return "" - if null d - then case ord c of - x | x >= 64 && x <= 127 -> return $ chr (x - 64) - | otherwise -> return $ chr (x + 64) - else return $ chr $ read ('0':'x':c:d) - -comment :: LP () -comment = do - char '%' - skipMany (satisfy (/='\n')) - optional newline - return () +untokenize :: [Tok] -> Text +untokenize = mconcat . map untoken + +untoken :: Tok -> Text +untoken (Tok _ _ t) = t + +satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok +satisfyTok f = + try $ do + res <- tokenPrim (T.unpack . untoken) updatePos matcher + doMacros 0 -- apply macros on remaining input stream + return res + where matcher t | f t = Just t + | otherwise = Nothing + updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos + updatePos _spos _ (Tok pos _ _ : _) = pos + updatePos spos _ [] = incSourceColumn spos 1 + +doMacros :: PandocMonad m => Int -> LP m () +doMacros n = do + verbatimMode <- sVerbatimMode <$> getState + unless verbatimMode $ do + inp <- getInput + case inp of + Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" : + Tok _ Word name : Tok _ Symbol "}" : ts + -> handleMacros spos name ts + Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" : + Tok _ Word name : Tok _ Symbol "}" : ts + -> handleMacros spos ("end" <> name) ts + Tok _ (CtrlSeq "expandafter") _ : t : ts + -> do setInput ts + doMacros n + getInput >>= setInput . combineTok t + Tok spos (CtrlSeq name) _ : ts + -> handleMacros spos name ts + _ -> return () + where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts) + | T.all isLetterOrAt w = + Tok spos (CtrlSeq (name <> w)) (x1 <> w <> x2) : ts + where (x1, x2) = T.break isSpaceOrTab x + combineTok t ts = t:ts + handleMacros spos name ts = do + macros <- sMacros <$> getState + case M.lookup name macros of + Nothing -> return () + Just (Macro expansionPoint numargs optarg newtoks) -> do + setInput ts + let getarg = try $ spaces >> bracedOrToken + args <- case optarg of + Nothing -> count numargs getarg + Just o -> + (:) <$> option o bracketedToks + <*> count (numargs - 1) getarg + -- first boolean param is true if we're tokenizing + -- an argument (in which case we don't want to + -- expand #1 etc.) + let addTok False (Tok _ (Arg i) _) acc | i > 0 + , i <= numargs = + foldr (addTok True) acc (args !! (i - 1)) + -- add space if needed after control sequence + -- see #4007 + addTok _ (Tok _ (CtrlSeq x) txt) + acc@(Tok _ Word _ : _) + | not (T.null txt) && + isLetter (T.last txt) = + Tok spos (CtrlSeq x) (txt <> " ") : acc + addTok _ t acc = setpos spos t : acc + ts' <- getInput + setInput $ foldr (addTok False) ts' newtoks + case expansionPoint of + ExpandWhenUsed -> + if n > 20 -- detect macro expansion loops + then throwError $ PandocMacroLoop (T.unpack name) + else doMacros (n + 1) + ExpandWhenDefined -> return () + + +setpos :: SourcePos -> Tok -> Tok +setpos spos (Tok _ tt txt) = Tok spos tt txt + +anyControlSeq :: PandocMonad m => LP m Tok +anyControlSeq = satisfyTok isCtrlSeq + where isCtrlSeq (Tok _ (CtrlSeq _) _) = True + isCtrlSeq _ = False + +anySymbol :: PandocMonad m => LP m Tok +anySymbol = satisfyTok isSym + where isSym (Tok _ Symbol _) = True + isSym _ = False + +spaces :: PandocMonad m => LP m () +spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) + +spaces1 :: PandocMonad m => LP m () +spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) + +tokTypeIn :: [TokType] -> Tok -> Bool +tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes + +controlSeq :: PandocMonad m => Text -> LP m Tok +controlSeq name = satisfyTok isNamed + where isNamed (Tok _ (CtrlSeq n) _) = n == name + isNamed _ = False + +symbol :: PandocMonad m => Char -> LP m Tok +symbol c = satisfyTok isc + where isc (Tok _ Symbol d) = case T.uncons d of + Just (c',_) -> c == c' + _ -> False + isc _ = False + +symbolIn :: PandocMonad m => [Char] -> LP m Tok +symbolIn cs = satisfyTok isInCs + where isInCs (Tok _ Symbol d) = case T.uncons d of + Just (c,_) -> c `elem` cs + _ -> False + isInCs _ = False + +sp :: PandocMonad m => LP m () +sp = whitespace <|> endline -bgroup :: LP () +whitespace :: PandocMonad m => LP m () +whitespace = () <$ satisfyTok isSpaceTok + where isSpaceTok (Tok _ Spaces _) = True + isSpaceTok _ = False + +newlineTok :: PandocMonad m => LP m () +newlineTok = () <$ satisfyTok isNewlineTok + +isNewlineTok :: Tok -> Bool +isNewlineTok (Tok _ Newline _) = True +isNewlineTok _ = False + +comment :: PandocMonad m => LP m () +comment = () <$ satisfyTok isCommentTok + where isCommentTok (Tok _ Comment _) = True + isCommentTok _ = False + +anyTok :: PandocMonad m => LP m Tok +anyTok = satisfyTok (const True) + +endline :: PandocMonad m => LP m () +endline = try $ do + newlineTok + lookAhead anyTok + notFollowedBy blankline + +blankline :: PandocMonad m => LP m () +blankline = try $ skipMany whitespace *> newlineTok + +primEscape :: PandocMonad m => LP m Char +primEscape = do + Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2]) + case toktype of + Esc1 -> case T.uncons (T.drop 2 t) of + Just (c, _) + | c >= '\64' && c <= '\127' -> return (chr (ord c - 64)) + | otherwise -> return (chr (ord c + 64)) + Nothing -> fail "Empty content of Esc1" + Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of + Just x -> return (chr x) + Nothing -> fail $ "Could not read: " ++ T.unpack t + _ -> fail "Expected an Esc1 or Esc2 token" -- should not happen + +bgroup :: PandocMonad m => LP m Tok bgroup = try $ do - skipMany (spaceChar <|> try (newline <* notFollowedBy blankline)) - () <$ char '{' - <|> () <$ controlSeq "bgroup" - <|> () <$ controlSeq "begingroup" - -egroup :: LP () -egroup = () <$ char '}' - <|> () <$ controlSeq "egroup" - <|> () <$ controlSeq "endgroup" - -grouped :: Monoid a => LP a -> LP a -grouped parser = try $ bgroup *> (mconcat <$> manyTill parser egroup) - -braced :: LP String -braced = bgroup *> (concat <$> manyTill - ( many1 (satisfy (\c -> c /= '\\' && c /= '}' && c /= '{')) - <|> try (string "\\}") - <|> try (string "\\{") - <|> try (string "\\\\") - <|> ((\x -> "{" ++ x ++ "}") <$> braced) - <|> count 1 anyChar - ) egroup) - -bracketed :: Monoid a => LP a -> LP a -bracketed parser = try $ char '[' *> (mconcat <$> manyTill parser (char ']')) - -mathDisplay :: LP String -> LP Inlines -mathDisplay p = displayMath <$> (try p >>= applyMacros' . trim) - -mathInline :: LP String -> LP Inlines -mathInline p = math <$> (try p >>= applyMacros') - -mathChars :: LP String -mathChars = - concat <$> many (escapedChar - <|> (snd <$> withRaw braced) - <|> many1 (satisfy isOrdChar)) - where escapedChar = try $ do char '\\' - c <- anyChar - return ['\\',c] - isOrdChar '$' = False - isOrdChar '{' = False - isOrdChar '}' = False - isOrdChar '\\' = False - isOrdChar _ = True - -quoted' :: (Inlines -> Inlines) -> LP String -> LP () -> LP Inlines -quoted' f starter ender = do - startchs <- starter - smart <- getOption readerSmart - if smart - then do - ils <- many (notFollowedBy ender >> inline) - (ender >> return (f (mconcat ils))) <|> - (<> mconcat ils) <$> - lit (case startchs of - "``" -> "“" - "`" -> "‘" - _ -> startchs) - else lit startchs - -doubleQuote :: LP Inlines -doubleQuote = do - quoted' doubleQuoted (try $ string "``") (void $ try $ string "''") - <|> quoted' doubleQuoted (string "“") (void $ char '”') - -- the following is used by babel for localized quotes: - <|> quoted' doubleQuoted (try $ string "\"`") (void $ try $ string "\"'") - <|> quoted' doubleQuoted (string "\"") (void $ char '"') - -singleQuote :: LP Inlines -singleQuote = do - smart <- getOption readerSmart - if smart - then quoted' singleQuoted (string "`") (try $ char '\'' >> notFollowedBy letter) - <|> quoted' singleQuoted (string "‘") (try $ char '’' >> notFollowedBy letter) - else str <$> many1 (oneOf "`\'‘’") - -inline :: LP Inlines -inline = (mempty <$ comment) - <|> (space <$ whitespace) - <|> (softbreak <$ endline) - <|> inlineText - <|> inlineCommand - <|> inlineEnvironment - <|> inlineGroup - <|> (char '-' *> option (str "-") - (char '-' *> option (str "–") (str "—" <$ char '-'))) - <|> doubleQuote - <|> singleQuote - <|> (str "”" <$ try (string "''")) - <|> (str "”" <$ char '”') - <|> (str "’" <$ char '\'') - <|> (str "’" <$ char '’') - <|> (str "\160" <$ char '~') - <|> mathDisplay (string "$$" *> mathChars <* string "$$") - <|> mathInline (char '$' *> mathChars <* char '$') - <|> (guardEnabled Ext_literate_haskell *> char '|' *> doLHSverb) - <|> (str . (:[]) <$> tildeEscape) - <|> (str . (:[]) <$> oneOf "[]") - <|> (str . (:[]) <$> oneOf "#&~^'`\"[]") -- TODO print warning? - -- <|> (str <$> count 1 (satisfy (\c -> c /= '\\' && c /='\n' && c /='}' && c /='{'))) -- eat random leftover characters - -inlines :: LP Inlines -inlines = mconcat <$> many (notFollowedBy (char '}') *> inline) - -inlineGroup :: LP Inlines + skipMany sp + symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup" + +egroup :: PandocMonad m => LP m Tok +egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup" + +grouped :: (PandocMonad m, Monoid a) => LP m a -> LP m a +grouped parser = try $ do + bgroup + -- first we check for an inner 'grouped', because + -- {{a,b}} should be parsed the same as {a,b} + try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup) + +braced :: PandocMonad m => LP m [Tok] +braced = bgroup *> braced' 1 + where braced' (n :: Int) = + handleEgroup n <|> handleBgroup n <|> handleOther n + handleEgroup n = do + t <- egroup + if n == 1 + then return [] + else (t:) <$> braced' (n - 1) + handleBgroup n = do + t <- bgroup + (t:) <$> braced' (n + 1) + handleOther n = do + t <- anyTok + (t:) <$> braced' n + +bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a +bracketed parser = try $ do + symbol '[' + mconcat <$> manyTill parser (symbol ']') + +dimenarg :: PandocMonad m => LP m Text +dimenarg = try $ do + ch <- option False $ True <$ symbol '=' + Tok _ _ s <- satisfyTok isWordTok + guard $ T.take 2 (T.reverse s) `elem` + ["pt","pc","in","bp","cm","mm","dd","cc","sp"] + let num = T.take (T.length s - 2) s + guard $ T.length num > 0 + guard $ T.all isDigit num + return $ T.pack ['=' | ch] <> s + +-- inline elements: + +word :: PandocMonad m => LP m Inlines +word = (str . T.unpack . untoken) <$> satisfyTok isWordTok + +regularSymbol :: PandocMonad m => LP m Inlines +regularSymbol = (str . T.unpack . untoken) <$> satisfyTok isRegularSymbol + where isRegularSymbol (Tok _ Symbol t) = not $ T.any isSpecial t + isRegularSymbol _ = False + isSpecial c = c `Set.member` specialChars + +specialChars :: Set.Set Char +specialChars = Set.fromList "#$%&~_^\\{}" + +isWordTok :: Tok -> Bool +isWordTok (Tok _ Word _) = True +isWordTok _ = False + +inlineGroup :: PandocMonad m => LP m Inlines inlineGroup = do ils <- grouped inline if isNull ils @@ -247,386 +629,19 @@ inlineGroup = do -- we need the span so we can detitlecase bibtex entries; -- we need to know when something is {C}apitalized -block :: LP Blocks -block = (mempty <$ comment) - <|> (mempty <$ ((spaceChar <|> newline) *> spaces)) - <|> environment - <|> macro - <|> blockCommand - <|> paragraph - <|> grouped block - <|> (mempty <$ char '&') -- loose & in table environment - - -blocks :: LP Blocks -blocks = mconcat <$> many block - -getRawCommand :: String -> LP String -getRawCommand name' = do - rawargs <- withRaw (many (try (optional sp *> opt)) *> - option "" (try (optional sp *> dimenarg)) *> - many braced) - return $ '\\' : name' ++ snd rawargs - -lookupListDefault :: (Ord k) => v -> [k] -> M.Map k v -> v -lookupListDefault d = (fromMaybe d .) . lookupList - where - lookupList l m = msum $ map (`M.lookup` m) l - -blockCommand :: LP Blocks -blockCommand = try $ do - name <- anyControlSeq - guard $ name /= "begin" && name /= "end" - star <- option "" (string "*" <* optional sp) - let name' = name ++ star - let raw = do - rawcommand <- getRawCommand name' - transformed <- applyMacros' rawcommand - guard $ transformed /= rawcommand - notFollowedBy $ parseFromString inlines transformed - parseFromString blocks transformed - lookupListDefault raw [name',name] blockCommands - -inBrackets :: Inlines -> Inlines -inBrackets x = str "[" <> x <> str "]" - --- eat an optional argument and one or more arguments in braces -ignoreInlines :: String -> (String, LP Inlines) -ignoreInlines name = (name, doraw <|> (mempty <$ optargs)) - where optargs = skipopts *> skipMany (try $ optional sp *> braced) - contseq = '\\':name - doraw = (rawInline "latex" . (contseq ++) . snd) <$> - (getOption readerParseRaw >>= guard >> withRaw optargs) - -ignoreBlocks :: String -> (String, LP Blocks) -ignoreBlocks name = (name, doraw <|> (mempty <$ optargs)) - where optargs = skipopts *> skipMany (try $ optional sp *> braced) - contseq = '\\':name - doraw = (rawBlock "latex" . (contseq ++) . snd) <$> - (getOption readerParseRaw >>= guard >> withRaw optargs) - -blockCommands :: M.Map String (LP Blocks) -blockCommands = M.fromList $ - [ ("par", mempty <$ skipopts) - , ("title", mempty <$ (skipopts *> - (grouped inline >>= addMeta "title") - <|> (grouped block >>= addMeta "title"))) - , ("subtitle", mempty <$ (skipopts *> tok >>= addMeta "subtitle")) - , ("author", mempty <$ (skipopts *> authors)) - -- -- in letter class, temp. store address & sig as title, author - , ("address", mempty <$ (skipopts *> tok >>= addMeta "address")) - , ("signature", mempty <$ (skipopts *> authors)) - , ("date", mempty <$ (skipopts *> tok >>= addMeta "date")) - -- sectioning - , ("chapter", updateState (\s -> s{ stateHasChapters = True }) - *> section nullAttr 0) - , ("chapter*", updateState (\s -> s{ stateHasChapters = True }) - *> section ("",["unnumbered"],[]) 0) - , ("section", section nullAttr 1) - , ("section*", section ("",["unnumbered"],[]) 1) - , ("subsection", section nullAttr 2) - , ("subsection*", section ("",["unnumbered"],[]) 2) - , ("subsubsection", section nullAttr 3) - , ("subsubsection*", section ("",["unnumbered"],[]) 3) - , ("paragraph", section nullAttr 4) - , ("paragraph*", section ("",["unnumbered"],[]) 4) - , ("subparagraph", section nullAttr 5) - , ("subparagraph*", section ("",["unnumbered"],[]) 5) - -- beamer slides - , ("frametitle", section nullAttr 3) - , ("framesubtitle", section nullAttr 4) - -- letters - , ("opening", (para . trimInlines) <$> (skipopts *> tok)) - , ("closing", skipopts *> closing) - -- - , ("hrule", pure horizontalRule) - , ("strut", pure mempty) - , ("rule", skipopts *> tok *> tok *> pure horizontalRule) - , ("item", skipopts *> looseItem) - , ("documentclass", skipopts *> braced *> preamble) - , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) - , ("caption", skipopts *> setCaption) - , ("PandocStartInclude", startInclude) - , ("PandocEndInclude", endInclude) - , ("bibliography", mempty <$ (skipopts *> braced >>= - addMeta "bibliography" . splitBibs)) - , ("addbibresource", mempty <$ (skipopts *> braced >>= - addMeta "bibliography" . splitBibs)) - ] ++ map ignoreBlocks - -- these commands will be ignored unless --parse-raw is specified, - -- in which case they will appear as raw latex blocks - [ "newcommand", "renewcommand", "newenvironment", "renewenvironment" - -- newcommand, etc. should be parsed by macro, but we need this - -- here so these aren't parsed as inline commands to ignore - , "special", "pdfannot", "pdfstringdef" - , "bibliographystyle" - , "maketitle", "makeindex", "makeglossary" - , "addcontentsline", "addtocontents", "addtocounter" - -- \ignore{} is used conventionally in literate haskell for definitions - -- that are to be processed by the compiler but not printed. - , "ignore" - , "hyperdef" - , "markboth", "markright", "markleft" - , "newpage" - ] - -addMeta :: ToMetaValue a => String -> a -> LP () -addMeta field val = updateState $ \st -> - st{ stateMeta = addMetaField field val $ stateMeta st } - -splitBibs :: String -> [Inlines] -splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') - -setCaption :: LP Blocks -setCaption = do - ils <- tok - mblabel <- option Nothing $ - try $ spaces' >> controlSeq "label" >> (Just <$> tok) - let ils' = case mblabel of - Just lab -> ils <> spanWith - ("",[],[("data-label", stringify lab)]) mempty - Nothing -> ils - updateState $ \st -> st{ stateCaption = Just ils' } - return mempty - -resetCaption :: LP () -resetCaption = updateState $ \st -> st{ stateCaption = Nothing } - -authors :: LP () -authors = try $ do - char '{' - let oneAuthor = mconcat <$> - many1 (notFollowedBy' (controlSeq "and") >> - (inline <|> mempty <$ blockCommand)) - -- skip e.g. \vspace{10pt} - auths <- sepBy oneAuthor (controlSeq "and") - char '}' - addMeta "author" (map trimInlines auths) - -section :: Attr -> Int -> LP Blocks -section (ident, classes, kvs) lvl = do - hasChapters <- stateHasChapters `fmap` getState - let lvl' = if hasChapters then lvl + 1 else lvl - skipopts - contents <- grouped inline - lab <- option ident $ try (spaces' >> controlSeq "label" >> spaces' >> braced) - attr' <- registerHeader (lab, classes, kvs) contents - return $ headerWith attr' lvl' contents - -inlineCommand :: LP Inlines -inlineCommand = try $ do - name <- anyControlSeq - guard $ name /= "begin" && name /= "end" - guard $ not $ isBlockCommand name - parseRaw <- getOption readerParseRaw - star <- option "" (string "*") - let name' = name ++ star - let raw = do - rawargs <- withRaw - (skipangles *> skipopts *> option "" dimenarg *> many braced) - let rawcommand = '\\' : name ++ star ++ snd rawargs - transformed <- applyMacros' rawcommand - if transformed /= rawcommand - then parseFromString inlines transformed - else if parseRaw - then return $ rawInline "latex" rawcommand - else return mempty - (lookupListDefault mzero [name',name] inlineCommands <* - optional (try (string "{}"))) - <|> raw - -unlessParseRaw :: LP () -unlessParseRaw = getOption readerParseRaw >>= guard . not - -isBlockCommand :: String -> Bool -isBlockCommand s = s `M.member` blockCommands - - -inlineEnvironments :: M.Map String (LP Inlines) -inlineEnvironments = M.fromList - [ ("displaymath", mathEnv id Nothing "displaymath") - , ("math", math <$> verbEnv "math") - , ("equation", mathEnv id Nothing "equation") - , ("equation*", mathEnv id Nothing "equation*") - , ("gather", mathEnv id (Just "gathered") "gather") - , ("gather*", mathEnv id (Just "gathered") "gather*") - , ("multline", mathEnv id (Just "gathered") "multline") - , ("multline*", mathEnv id (Just "gathered") "multline*") - , ("eqnarray", mathEnv id (Just "aligned") "eqnarray") - , ("eqnarray*", mathEnv id (Just "aligned") "eqnarray*") - , ("align", mathEnv id (Just "aligned") "align") - , ("align*", mathEnv id (Just "aligned") "align*") - , ("alignat", mathEnv id (Just "aligned") "alignat") - , ("alignat*", mathEnv id (Just "aligned") "alignat*") - ] - -inlineCommands :: M.Map String (LP Inlines) -inlineCommands = M.fromList $ - [ ("emph", extractSpaces emph <$> tok) - , ("textit", extractSpaces emph <$> tok) - , ("textsl", extractSpaces emph <$> tok) - , ("textsc", extractSpaces smallcaps <$> tok) - , ("sout", extractSpaces strikeout <$> tok) - , ("textsuperscript", extractSpaces superscript <$> tok) - , ("textsubscript", extractSpaces subscript <$> tok) - , ("textbackslash", lit "\\") - , ("backslash", lit "\\") - , ("slash", lit "/") - , ("textbf", extractSpaces strong <$> tok) - , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) - , ("ldots", lit "…") - , ("dots", lit "…") - , ("mdots", lit "…") - , ("sim", lit "~") - , ("label", unlessParseRaw >> (inBrackets <$> tok)) - , ("ref", unlessParseRaw >> (inBrackets <$> tok)) - , ("noindent", unlessParseRaw >> return mempty) - , ("textgreek", tok) - , ("sep", lit ",") - , ("cref", unlessParseRaw >> (inBrackets <$> tok)) -- from cleveref.sty - , ("(", mathInline $ manyTill anyChar (try $ string "\\)")) - , ("[", mathDisplay $ manyTill anyChar (try $ string "\\]")) - , ("ensuremath", mathInline braced) - , ("texorpdfstring", (\_ x -> x) <$> tok <*> tok) - , ("P", lit "¶") - , ("S", lit "§") - , ("$", lit "$") - , ("%", lit "%") - , ("&", lit "&") - , ("#", lit "#") - , ("_", lit "_") - , ("{", lit "{") - , ("}", lit "}") - -- old TeX commands - , ("em", extractSpaces emph <$> inlines) - , ("it", extractSpaces emph <$> inlines) - , ("sl", extractSpaces emph <$> inlines) - , ("bf", extractSpaces strong <$> inlines) - , ("rm", inlines) - , ("itshape", extractSpaces emph <$> inlines) - , ("slshape", extractSpaces emph <$> inlines) - , ("scshape", extractSpaces smallcaps <$> inlines) - , ("bfseries", extractSpaces strong <$> inlines) - , ("/", pure mempty) -- italic correction - , ("aa", lit "å") - , ("AA", lit "Å") - , ("ss", lit "ß") - , ("o", lit "ø") - , ("O", lit "Ø") - , ("L", lit "Ł") - , ("l", lit "ł") - , ("ae", lit "æ") - , ("AE", lit "Æ") - , ("oe", lit "œ") - , ("OE", lit "Œ") - , ("pounds", lit "£") - , ("euro", lit "€") - , ("copyright", lit "©") - , ("textasciicircum", lit "^") - , ("textasciitilde", lit "~") - , ("H", try $ tok >>= accent hungarumlaut) - , ("`", option (str "`") $ try $ tok >>= accent grave) - , ("'", option (str "'") $ try $ tok >>= accent acute) - , ("^", option (str "^") $ try $ tok >>= accent circ) - , ("~", option (str "~") $ try $ tok >>= accent tilde) - , ("\"", option (str "\"") $ try $ tok >>= accent umlaut) - , (".", option (str ".") $ try $ tok >>= accent dot) - , ("=", option (str "=") $ try $ tok >>= accent macron) - , ("c", option (str "c") $ try $ tok >>= accent cedilla) - , ("v", option (str "v") $ try $ tok >>= accent hacek) - , ("u", option (str "u") $ try $ tok >>= accent breve) - , ("i", lit "i") - , ("\\", linebreak <$ (optional (bracketed inline) *> spaces')) - , (",", pure mempty) - , ("@", pure mempty) - , (" ", lit "\160") - , ("ps", pure $ str "PS." <> space) - , ("TeX", lit "TeX") - , ("LaTeX", lit "LaTeX") - , ("bar", lit "|") - , ("textless", lit "<") - , ("textgreater", lit ">") - , ("thanks", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) - , ("footnote", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) - , ("verb", doverb) - , ("lstinline", skipopts *> doverb) - , ("Verb", doverb) - , ("texttt", (code . stringify . toList) <$> tok) - , ("url", (unescapeURL <$> braced) >>= \url -> - pure (link url "" (str url))) - , ("href", (unescapeURL <$> braced <* optional sp) >>= \url -> - tok >>= \lab -> - pure (link url "" lab)) - , ("includegraphics", do options <- option [] keyvals - src <- unescapeURL . removeDoubleQuotes <$> braced - mkImage options src) - , ("enquote", enquote) - , ("cite", citation "cite" NormalCitation False) - , ("Cite", citation "Cite" NormalCitation False) - , ("citep", citation "citep" NormalCitation False) - , ("citep*", citation "citep*" NormalCitation False) - , ("citeal", citation "citeal" NormalCitation False) - , ("citealp", citation "citealp" NormalCitation False) - , ("citealp*", citation "citealp*" NormalCitation False) - , ("autocite", citation "autocite" NormalCitation False) - , ("smartcite", citation "smartcite" NormalCitation False) - , ("footcite", inNote <$> citation "footcite" NormalCitation False) - , ("parencite", citation "parencite" NormalCitation False) - , ("supercite", citation "supercite" NormalCitation False) - , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) - , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) - , ("citeyear", citation "citeyear" SuppressAuthor False) - , ("autocite*", citation "autocite*" SuppressAuthor False) - , ("cite*", citation "cite*" SuppressAuthor False) - , ("parencite*", citation "parencite*" SuppressAuthor False) - , ("textcite", citation "textcite" AuthorInText False) - , ("citet", citation "citet" AuthorInText False) - , ("citet*", citation "citet*" AuthorInText False) - , ("citealt", citation "citealt" AuthorInText False) - , ("citealt*", citation "citealt*" AuthorInText False) - , ("textcites", citation "textcites" AuthorInText True) - , ("cites", citation "cites" NormalCitation True) - , ("autocites", citation "autocites" NormalCitation True) - , ("footcites", inNote <$> citation "footcites" NormalCitation True) - , ("parencites", citation "parencites" NormalCitation True) - , ("supercites", citation "supercites" NormalCitation True) - , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) - , ("Autocite", citation "Autocite" NormalCitation False) - , ("Smartcite", citation "Smartcite" NormalCitation False) - , ("Footcite", citation "Footcite" NormalCitation False) - , ("Parencite", citation "Parencite" NormalCitation False) - , ("Supercite", citation "Supercite" NormalCitation False) - , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) - , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) - , ("Citeyear", citation "Citeyear" SuppressAuthor False) - , ("Autocite*", citation "Autocite*" SuppressAuthor False) - , ("Cite*", citation "Cite*" SuppressAuthor False) - , ("Parencite*", citation "Parencite*" SuppressAuthor False) - , ("Textcite", citation "Textcite" AuthorInText False) - , ("Textcites", citation "Textcites" AuthorInText True) - , ("Cites", citation "Cites" NormalCitation True) - , ("Autocites", citation "Autocites" NormalCitation True) - , ("Footcites", citation "Footcites" NormalCitation True) - , ("Parencites", citation "Parencites" NormalCitation True) - , ("Supercites", citation "Supercites" NormalCitation True) - , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) - , ("citetext", complexNatbibCitation NormalCitation) - , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> - complexNatbibCitation AuthorInText) - <|> citation "citeauthor" AuthorInText False) - , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= - addMeta "nocite")) - ] ++ map ignoreInlines - -- these commands will be ignored unless --parse-raw is specified, - -- in which case they will appear as raw latex blocks: - [ "index" ] +doLHSverb :: PandocMonad m => LP m Inlines +doLHSverb = + (codeWith ("",["haskell"],[]) . T.unpack . untokenize) + <$> manyTill (satisfyTok (not . isNewlineTok)) (symbol '|') -mkImage :: [(String, String)] -> String -> LP Inlines +mkImage :: PandocMonad m => [(String, String)] -> String -> LP m Inlines mkImage options src = do - let replaceTextwidth (k,v) = case numUnit v of - Just (num, "\\textwidth") -> (k, showFl (num * 100) ++ "%") - _ -> (k, v) - let kvs = map replaceTextwidth $ filter (\(k,_) -> k `elem` ["width", "height"]) options + let replaceTextwidth (k,v) = + case numUnit v of + Just (num, "\\textwidth") -> (k, showFl (num * 100) ++ "%") + _ -> (k, v) + let kvs = map replaceTextwidth + $ filter (\(k,_) -> k `elem` ["width", "height"]) options let attr = ("",[], kvs) let alt = str "image" case takeExtension src of @@ -635,40 +650,164 @@ mkImage options src = do return $ imageWith attr (addExtension src defaultExt) "" alt _ -> return $ imageWith attr src "" alt -inNote :: Inlines -> Inlines -inNote ils = - note $ para $ ils <> str "." +doxspace :: PandocMonad m => LP m Inlines +doxspace = + (space <$ lookAhead (satisfyTok startsWithLetter)) <|> return mempty + where startsWithLetter (Tok _ Word t) = + case T.uncons t of + Just (c, _) | isLetter c -> True + _ -> False + startsWithLetter _ = False -unescapeURL :: String -> String -unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs - where isEscapable c = c `elem` ("#$%&~_^\\{}" :: String) -unescapeURL (x:xs) = x:unescapeURL xs -unescapeURL [] = "" -enquote :: LP Inlines +-- converts e.g. \SI{1}[\$]{} to "$ 1" or \SI{1}{\euro} to "1 €" +dosiunitx :: PandocMonad m => LP m Inlines +dosiunitx = do + skipopts + value <- tok + valueprefix <- option "" $ bracketed tok + unit <- tok + let emptyOr160 "" = "" + emptyOr160 _ = "\160" + return . mconcat $ [valueprefix, + emptyOr160 valueprefix, + value, + emptyOr160 unit, + unit] + +lit :: String -> LP m Inlines +lit = pure . str + +removeDoubleQuotes :: Text -> Text +removeDoubleQuotes t = + Data.Maybe.fromMaybe t $ T.stripPrefix "\"" t >>= T.stripSuffix "\"" + +doubleQuote :: PandocMonad m => LP m Inlines +doubleQuote = + quoted' doubleQuoted (try $ count 2 $ symbol '`') + (void $ try $ count 2 $ symbol '\'') + <|> quoted' doubleQuoted ((:[]) <$> symbol '“') (void $ symbol '”') + -- the following is used by babel for localized quotes: + <|> quoted' doubleQuoted (try $ sequence [symbol '"', symbol '`']) + (void $ try $ sequence [symbol '"', symbol '\'']) + +singleQuote :: PandocMonad m => LP m Inlines +singleQuote = + quoted' singleQuoted ((:[]) <$> symbol '`') + (try $ symbol '\'' >> + notFollowedBy (satisfyTok startsWithLetter)) + <|> quoted' singleQuoted ((:[]) <$> symbol '‘') + (try $ symbol '’' >> + notFollowedBy (satisfyTok startsWithLetter)) + where startsWithLetter (Tok _ Word t) = + case T.uncons t of + Just (c, _) | isLetter c -> True + _ -> False + startsWithLetter _ = False + +quoted' :: PandocMonad m + => (Inlines -> Inlines) + -> LP m [Tok] + -> LP m () + -> LP m Inlines +quoted' f starter ender = do + startchs <- (T.unpack . untokenize) <$> starter + smart <- extensionEnabled Ext_smart <$> getOption readerExtensions + if smart + then do + ils <- many (notFollowedBy ender >> inline) + (ender >> return (f (mconcat ils))) <|> + (<> mconcat ils) <$> + lit (case startchs of + "``" -> "“" + "`" -> "‘" + cs -> cs) + else lit startchs + +enquote :: PandocMonad m => LP m Inlines enquote = do skipopts - context <- stateQuoteContext <$> getState - if context == InDoubleQuote + quoteContext <- sQuoteContext <$> getState + if quoteContext == InDoubleQuote then singleQuoted <$> withQuoteContext InSingleQuote tok else doubleQuoted <$> withQuoteContext InDoubleQuote tok -doverb :: LP Inlines +doAcronym :: PandocMonad m => String -> LP m Inlines +doAcronym form = do + acro <- braced + return . mconcat $ [spanWith ("",[],[("acronym-label", toksToString acro), + ("acronym-form", "singular+" ++ form)]) + $ str $ toksToString acro] + +doAcronymPlural :: PandocMonad m => String -> LP m Inlines +doAcronymPlural form = do + acro <- braced + plural <- lit "s" + return . mconcat $ [spanWith ("",[],[("acronym-label", toksToString acro), + ("acronym-form", "plural+" ++ form)]) $ + mconcat [str $ toksToString acro, plural]] + +doverb :: PandocMonad m => LP m Inlines doverb = do - marker <- anyChar - code <$> manyTill (satisfy (/='\n')) (char marker) - -doLHSverb :: LP Inlines -doLHSverb = codeWith ("",["haskell"],[]) <$> manyTill (satisfy (/='\n')) (char '|') + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + withVerbatimMode $ + (code . T.unpack . untokenize) <$> + manyTill (verbTok marker) (symbol marker) + +verbTok :: PandocMonad m => Char -> LP m Tok +verbTok stopchar = do + t@(Tok pos toktype txt) <- satisfyTok (not . isNewlineTok) + case T.findIndex (== stopchar) txt of + Nothing -> return t + Just i -> do + let (t1, t2) = T.splitAt i txt + inp <- getInput + setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar) + : totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2) ++ inp + return $ Tok pos toktype t1 + +dolstinline :: PandocMonad m => LP m Inlines +dolstinline = do + options <- option [] keyvals + let classes = maybeToList $ lookup "language" options >>= fromListingsLanguage + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + let stopchar = if marker == '{' then '}' else marker + withVerbatimMode $ + (codeWith ("",classes,[]) . T.unpack . untokenize) <$> + manyTill (verbTok stopchar) (symbol stopchar) + +keyval :: PandocMonad m => LP m (String, String) +keyval = try $ do + Tok _ Word key <- satisfyTok isWordTok + let isSpecSym (Tok _ Symbol t) = t /= "]" && t /= "," + isSpecSym _ = False + optional sp + val <- option [] $ do + symbol '=' + optional sp + braced <|> many1 (satisfyTok isWordTok <|> satisfyTok isSpecSym + <|> anyControlSeq) + optional sp + optional (symbol ',') + optional sp + return (T.unpack key, T.unpack . untokenize $ val) -lit :: String -> LP Inlines -lit = pure . str +keyvals :: PandocMonad m => LP m [(String, String)] +keyvals = try $ symbol '[' >> manyTill keyval (symbol ']') -accent :: (Char -> String) -> Inlines -> LP Inlines -accent f ils = +accent :: PandocMonad m => Char -> (Char -> String) -> LP m Inlines +accent c f = try $ do + ils <- tok case toList ils of (Str (x:xs) : ys) -> return $ fromList (Str (f x ++ xs) : ys) - [] -> mzero + [Space] -> return $ str [c] + [] -> return $ str [c] _ -> return ils grave :: Char -> String @@ -855,6 +994,19 @@ hacek 'Z' = "Ž" hacek 'z' = "ž" hacek c = [c] +ogonek :: Char -> String +ogonek 'a' = "ą" +ogonek 'e' = "ę" +ogonek 'o' = "ǫ" +ogonek 'i' = "į" +ogonek 'u' = "ų" +ogonek 'A' = "Ą" +ogonek 'E' = "Ę" +ogonek 'I' = "Į" +ogonek 'O' = "Ǫ" +ogonek 'U' = "Ų" +ogonek c = [c] + breve :: Char -> String breve 'A' = "Ă" breve 'a' = "ă" @@ -870,368 +1022,1181 @@ breve 'U' = "Ŭ" breve 'u' = "ŭ" breve c = [c] -tok :: LP Inlines -tok = try $ grouped inline <|> inlineCommand <|> str <$> count 1 inlineChar +toksToString :: [Tok] -> String +toksToString = T.unpack . untokenize + +mathDisplay :: String -> Inlines +mathDisplay = displayMath . trim -opt :: LP Inlines -opt = bracketed inline +mathInline :: String -> Inlines +mathInline = math . trim -rawopt :: LP String +dollarsMath :: PandocMonad m => LP m Inlines +dollarsMath = do + symbol '$' + display <- option False (True <$ symbol '$') + contents <- trim . toksToString <$> + many (notFollowedBy (symbol '$') >> anyTok) + if display + then + mathDisplay contents <$ try (symbol '$' >> symbol '$') + <|> (guard (null contents) >> return (mathInline "")) + else mathInline contents <$ symbol '$' + +-- citations + +addPrefix :: [Inline] -> [Citation] -> [Citation] +addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks +addPrefix _ _ = [] + +addSuffix :: [Inline] -> [Citation] -> [Citation] +addSuffix s ks@(_:_) = + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] +addSuffix _ _ = [] + +simpleCiteArgs :: PandocMonad m => LP m [Citation] +simpleCiteArgs = try $ do + first <- optionMaybe $ toList <$> opt + second <- optionMaybe $ toList <$> opt + keys <- try $ bgroup *> manyTill citationLabel egroup + let (pre, suf) = case (first , second ) of + (Just s , Nothing) -> (mempty, s ) + (Just s , Just t ) -> (s , t ) + _ -> (mempty, mempty) + conv k = Citation { citationId = k + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationHash = 0 + , citationNoteNum = 0 + } + return $ addPrefix pre $ addSuffix suf $ map conv keys + +citationLabel :: PandocMonad m => LP m String +citationLabel = do + optional spaces + toksToString <$> + (many1 (satisfyTok isWordTok <|> symbolIn bibtexKeyChar) + <* optional spaces + <* optional (symbol ',') + <* optional spaces) + where bibtexKeyChar = ".:;?!`'()/*@_+=-[]" :: [Char] + +cites :: PandocMonad m => CitationMode -> Bool -> LP m [Citation] +cites mode multi = try $ do + cits <- if multi + then many1 simpleCiteArgs + else count 1 simpleCiteArgs + let cs = concat cits + return $ case mode of + AuthorInText -> case cs of + (c:rest) -> c {citationMode = mode} : rest + [] -> [] + _ -> map (\a -> a {citationMode = mode}) cs + +citation :: PandocMonad m => String -> CitationMode -> Bool -> LP m Inlines +citation name mode multi = do + (c,raw) <- withRaw $ cites mode multi + return $ cite c (rawInline "latex" $ "\\" ++ name ++ toksToString raw) + +handleCitationPart :: Inlines -> [Citation] +handleCitationPart ils = + let isCite Cite{} = True + isCite _ = False + (pref, rest) = break isCite (toList ils) + in case rest of + (Cite cs _:suff) -> addPrefix pref $ addSuffix suff cs + _ -> [] + +complexNatbibCitation :: PandocMonad m => CitationMode -> LP m Inlines +complexNatbibCitation mode = try $ do + (cs, raw) <- + withRaw $ concat <$> do + bgroup + items <- mconcat <$> + many1 (notFollowedBy (symbol ';') >> inline) + `sepBy1` (symbol ';') + egroup + return $ map handleCitationPart items + case cs of + [] -> mzero + (c:cits) -> return $ cite (c{ citationMode = mode }:cits) + (rawInline "latex" $ "\\citetext" ++ toksToString raw) + +inNote :: Inlines -> Inlines +inNote ils = + note $ para $ ils <> str "." + +inlineCommand' :: PandocMonad m => LP m Inlines +inlineCommand' = try $ do + Tok _ (CtrlSeq name) cmd <- anyControlSeq + guard $ name /= "begin" && name /= "end" + star <- option "" ("*" <$ symbol '*' <* optional sp) + let name' = name <> star + let names = ordNub [name', name] -- check non-starred as fallback + let raw = do + guard $ isInlineCommand name || not (isBlockCommand name) + rawcommand <- getRawCommand name (cmd <> star) + (guardEnabled Ext_raw_tex >> return (rawInline "latex" rawcommand)) + <|> ignore rawcommand + lookupListDefault raw names inlineCommands + +tok :: PandocMonad m => LP m Inlines +tok = try $ spaces >> grouped inline <|> inlineCommand' <|> singleChar' + where singleChar' = do + Tok _ _ t <- singleChar + return (str (T.unpack t)) + +singleChar :: PandocMonad m => LP m Tok +singleChar = try $ do + Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) + guard $ not $ toktype == Symbol && + T.any (`Set.member` specialChars) t + if T.length t > 1 + then do + let (t1, t2) = (T.take 1 t, T.drop 1 t) + inp <- getInput + setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp + return $ Tok pos toktype t1 + else return $ Tok pos toktype t + +opt :: PandocMonad m => LP m Inlines +opt = bracketed inline <|> (str . T.unpack <$> rawopt) + +rawopt :: PandocMonad m => LP m Text rawopt = do - contents <- bracketed (many1 (noneOf "[]") <|> try (string "\\]") <|> - try (string "\\[") <|> rawopt) + inner <- untokenize <$> bracketedToks optional sp - return $ "[" ++ contents ++ "]" + return $ "[" <> inner <> "]" -skipopts :: LP () +skipopts :: PandocMonad m => LP m () skipopts = skipMany rawopt -- opts in angle brackets are used in beamer -rawangle :: LP () +rawangle :: PandocMonad m => LP m () rawangle = try $ do - char '<' - skipMany (noneOf ">") - char '>' - return () + symbol '<' + () <$ manyTill anyTok (symbol '>') -skipangles :: LP () +skipangles :: PandocMonad m => LP m () skipangles = skipMany rawangle -inlineText :: LP Inlines -inlineText = str <$> many1 inlineChar +ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a +ignore raw = do + pos <- getPosition + report $ SkippedContent raw pos + return mempty -inlineChar :: LP Char -inlineChar = noneOf "\\$%&~#{}^'`\"‘’“”-[] \t\n" +withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok]) +withRaw parser = do + inp <- getInput + result <- parser + nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok) + let raw = takeWhile (/= nxt) inp + return (result, raw) -environment :: LP Blocks -environment = do - controlSeq "begin" - name <- braced - M.findWithDefault mzero name environments - <|> rawEnv name +inBrackets :: Inlines -> Inlines +inBrackets x = str "[" <> x <> str "]" + +unescapeURL :: String -> String +unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs + where isEscapable c = c `elem` ("#$%&~_^\\{}" :: String) +unescapeURL (x:xs) = x:unescapeURL xs +unescapeURL [] = "" -inlineEnvironment :: LP Inlines +mathEnvWith :: PandocMonad m + => (Inlines -> a) -> Maybe Text -> Text -> LP m a +mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name + where inner x = case innerEnv of + Nothing -> x + Just y -> "\\begin{" ++ T.unpack y ++ "}\n" ++ x ++ + "\\end{" ++ T.unpack y ++ "}" + +mathEnv :: PandocMonad m => Text -> LP m String +mathEnv name = do + skipopts + optional blankline + res <- manyTill anyTok (end_ name) + return $ stripTrailingNewlines $ T.unpack $ untokenize res + +inlineEnvironment :: PandocMonad m => LP m Inlines inlineEnvironment = try $ do controlSeq "begin" - name <- braced + name <- untokenize <$> braced M.findWithDefault mzero name inlineEnvironments -rawEnv :: String -> LP Blocks -rawEnv name = do - parseRaw <- getOption readerParseRaw - rawOptions <- mconcat <$> many rawopt - let addBegin x = "\\begin{" ++ name ++ "}" ++ rawOptions ++ x - if parseRaw - then (rawBlock "latex" . addBegin) <$> - (withRaw (env name blocks) >>= applyMacros' . snd) - else env name blocks - ----- - -type IncludeParser = ParserT String [String] IO String - --- | Replace "include" commands with file contents. -handleIncludes :: String -> IO (Either PandocError String) -handleIncludes s = mapLeft (ParsecError s) <$> runParserT includeParser' [] "input" s - -includeParser' :: IncludeParser -includeParser' = - concat <$> many (comment' <|> escaped' <|> blob' <|> include' - <|> startMarker' <|> endMarker' - <|> verbCmd' <|> verbatimEnv' <|> backslash') - -comment' :: IncludeParser -comment' = do - char '%' - xs <- manyTill anyChar newline - return ('%':xs ++ "\n") - -escaped' :: IncludeParser -escaped' = try $ string "\\%" <|> string "\\\\" - -verbCmd' :: IncludeParser -verbCmd' = fmap snd <$> - withRaw $ try $ do - string "\\verb" - c <- anyChar - manyTill anyChar (char c) - -verbatimEnv' :: IncludeParser -verbatimEnv' = fmap snd <$> - withRaw $ try $ do - string "\\begin" - name <- braced' - guard $ name `elem` ["verbatim", "Verbatim", "BVerbatim", - "lstlisting", "minted", "alltt", "comment"] - manyTill anyChar (try $ string $ "\\end{" ++ name ++ "}") - -blob' :: IncludeParser -blob' = try $ many1 (noneOf "\\%") - -backslash' :: IncludeParser -backslash' = string "\\" - -braced' :: IncludeParser -braced' = try $ char '{' *> manyTill (satisfy (/='}')) (char '}') +inlineEnvironments :: PandocMonad m => M.Map Text (LP m Inlines) +inlineEnvironments = M.fromList [ + ("displaymath", mathEnvWith id Nothing "displaymath") + , ("math", math <$> mathEnv "math") + , ("equation", mathEnvWith id Nothing "equation") + , ("equation*", mathEnvWith id Nothing "equation*") + , ("gather", mathEnvWith id (Just "gathered") "gather") + , ("gather*", mathEnvWith id (Just "gathered") "gather*") + , ("multline", mathEnvWith id (Just "gathered") "multline") + , ("multline*", mathEnvWith id (Just "gathered") "multline*") + , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*") + , ("align", mathEnvWith id (Just "aligned") "align") + , ("align*", mathEnvWith id (Just "aligned") "align*") + , ("alignat", mathEnvWith id (Just "aligned") "alignat") + , ("alignat*", mathEnvWith id (Just "aligned") "alignat*") + ] -maybeAddExtension :: String -> FilePath -> FilePath -maybeAddExtension ext fp = - if null (takeExtension fp) - then addExtension fp ext - else fp +inlineCommands :: PandocMonad m => M.Map Text (LP m Inlines) +inlineCommands = M.union inlineLanguageCommands $ M.fromList + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("textsf", extractSpaces (spanWith ("",["sans-serif"],[])) <$> tok) + , ("textmd", extractSpaces (spanWith ("",["medium"],[])) <$> tok) + , ("textrm", extractSpaces (spanWith ("",["roman"],[])) <$> tok) + , ("textup", extractSpaces (spanWith ("",["upright"],[])) <$> tok) + , ("texttt", ttfamily) + , ("sout", extractSpaces strikeout <$> tok) + , ("alert", skipangles >> spanWith ("",["alert"],[]) <$> tok) -- beamer + , ("lq", return (str "‘")) + , ("rq", return (str "’")) + , ("textquoteleft", return (str "‘")) + , ("textquoteright", return (str "’")) + , ("textquotedblleft", return (str "“")) + , ("textquotedblright", return (str "”")) + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) + , ("textbackslash", lit "\\") + , ("backslash", lit "\\") + , ("slash", lit "/") + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) + , ("ldots", lit "…") + , ("vdots", lit "\8942") + , ("dots", lit "…") + , ("mdots", lit "…") + , ("sim", lit "~") + , ("sep", lit ",") + , ("label", rawInlineOr "label" dolabel) + , ("ref", rawInlineOr "ref" $ doref "ref") + , ("cref", rawInlineOr "cref" $ doref "ref") -- from cleveref.sty + , ("vref", rawInlineOr "vref" $ doref "ref+page") -- from varioref.sty + , ("eqref", rawInlineOr "eqref" $ doref "eqref") -- from amsmath.sty + , ("lettrine", optional opt >> extractSpaces (spanWith ("",["lettrine"],[])) <$> tok) + , ("(", mathInline . toksToString <$> manyTill anyTok (controlSeq ")")) + , ("[", mathDisplay . toksToString <$> manyTill anyTok (controlSeq "]")) + , ("ensuremath", mathInline . toksToString <$> braced) + , ("texorpdfstring", (\_ x -> x) <$> tok <*> tok) + , ("P", lit "¶") + , ("S", lit "§") + , ("$", lit "$") + , ("%", lit "%") + , ("&", lit "&") + , ("#", lit "#") + , ("_", lit "_") + , ("{", lit "{") + , ("}", lit "}") + -- old TeX commands + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) + , ("rm", inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) + , ("/", pure mempty) -- italic correction + , ("aa", lit "å") + , ("AA", lit "Å") + , ("ss", lit "ß") + , ("o", lit "ø") + , ("O", lit "Ø") + , ("L", lit "Ł") + , ("l", lit "ł") + , ("ae", lit "æ") + , ("AE", lit "Æ") + , ("oe", lit "œ") + , ("OE", lit "Œ") + , ("pounds", lit "£") + , ("euro", lit "€") + , ("copyright", lit "©") + , ("textasciicircum", lit "^") + , ("textasciitilde", lit "~") + , ("H", accent '\779' hungarumlaut) + , ("`", accent '`' grave) + , ("'", accent '\'' acute) + , ("^", accent '^' circ) + , ("~", accent '~' tilde) + , ("\"", accent '\776' umlaut) + , (".", accent '\775' dot) + , ("=", accent '\772' macron) + , ("c", accent '\807' cedilla) + , ("v", accent 'ˇ' hacek) + , ("u", accent '\774' breve) + , ("k", accent '\808' ogonek) + , ("textogonekcentered", accent '\808' ogonek) + , ("i", lit "i") + , ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState + guard $ not inTableCell + optional opt + spaces)) + , (",", lit "\8198") + , ("@", pure mempty) + , (" ", lit "\160") + , ("ps", pure $ str "PS." <> space) + , ("TeX", lit "TeX") + , ("LaTeX", lit "LaTeX") + , ("bar", lit "|") + , ("textless", lit "<") + , ("textgreater", lit ">") + , ("thanks", skipopts >> note <$> grouped block) + , ("footnote", skipopts >> note <$> grouped block) + , ("verb", doverb) + , ("lstinline", dolstinline) + , ("Verb", doverb) + , ("url", ((unescapeURL . T.unpack . untokenize) <$> braced) >>= \url -> + pure (link url "" (str url))) + , ("href", (unescapeURL . toksToString <$> + braced <* optional sp) >>= \url -> + tok >>= \lab -> pure (link url "" lab)) + , ("includegraphics", do options <- option [] keyvals + src <- unescapeURL . T.unpack . + removeDoubleQuotes . untokenize <$> braced + mkImage options src) + , ("enquote", enquote) + , ("figurename", doTerm Translations.Figure) + , ("prefacename", doTerm Translations.Preface) + , ("refname", doTerm Translations.References) + , ("bibname", doTerm Translations.Bibliography) + , ("chaptername", doTerm Translations.Chapter) + , ("partname", doTerm Translations.Part) + , ("contentsname", doTerm Translations.Contents) + , ("listfigurename", doTerm Translations.ListOfFigures) + , ("listtablename", doTerm Translations.ListOfTables) + , ("indexname", doTerm Translations.Index) + , ("abstractname", doTerm Translations.Abstract) + , ("tablename", doTerm Translations.Table) + , ("enclname", doTerm Translations.Encl) + , ("ccname", doTerm Translations.Cc) + , ("headtoname", doTerm Translations.To) + , ("pagename", doTerm Translations.Page) + , ("seename", doTerm Translations.See) + , ("seealsoname", doTerm Translations.SeeAlso) + , ("proofname", doTerm Translations.Proof) + , ("glossaryname", doTerm Translations.Glossary) + , ("lstlistingname", doTerm Translations.Listing) + , ("cite", citation "cite" NormalCitation False) + , ("Cite", citation "Cite" NormalCitation False) + , ("citep", citation "citep" NormalCitation False) + , ("citep*", citation "citep*" NormalCitation False) + , ("citeal", citation "citeal" NormalCitation False) + , ("citealp", citation "citealp" NormalCitation False) + , ("citealp*", citation "citealp*" NormalCitation False) + , ("autocite", citation "autocite" NormalCitation False) + , ("smartcite", citation "smartcite" NormalCitation False) + , ("footcite", inNote <$> citation "footcite" NormalCitation False) + , ("parencite", citation "parencite" NormalCitation False) + , ("supercite", citation "supercite" NormalCitation False) + , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) + , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) + , ("citeyear", citation "citeyear" SuppressAuthor False) + , ("autocite*", citation "autocite*" SuppressAuthor False) + , ("cite*", citation "cite*" SuppressAuthor False) + , ("parencite*", citation "parencite*" SuppressAuthor False) + , ("textcite", citation "textcite" AuthorInText False) + , ("citet", citation "citet" AuthorInText False) + , ("citet*", citation "citet*" AuthorInText False) + , ("citealt", citation "citealt" AuthorInText False) + , ("citealt*", citation "citealt*" AuthorInText False) + , ("textcites", citation "textcites" AuthorInText True) + , ("cites", citation "cites" NormalCitation True) + , ("autocites", citation "autocites" NormalCitation True) + , ("footcites", inNote <$> citation "footcites" NormalCitation True) + , ("parencites", citation "parencites" NormalCitation True) + , ("supercites", citation "supercites" NormalCitation True) + , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) + , ("Autocite", citation "Autocite" NormalCitation False) + , ("Smartcite", citation "Smartcite" NormalCitation False) + , ("Footcite", citation "Footcite" NormalCitation False) + , ("Parencite", citation "Parencite" NormalCitation False) + , ("Supercite", citation "Supercite" NormalCitation False) + , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) + , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) + , ("Citeyear", citation "Citeyear" SuppressAuthor False) + , ("Autocite*", citation "Autocite*" SuppressAuthor False) + , ("Cite*", citation "Cite*" SuppressAuthor False) + , ("Parencite*", citation "Parencite*" SuppressAuthor False) + , ("Textcite", citation "Textcite" AuthorInText False) + , ("Textcites", citation "Textcites" AuthorInText True) + , ("Cites", citation "Cites" NormalCitation True) + , ("Autocites", citation "Autocites" NormalCitation True) + , ("Footcites", citation "Footcites" NormalCitation True) + , ("Parencites", citation "Parencites" NormalCitation True) + , ("Supercites", citation "Supercites" NormalCitation True) + , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) + , ("citetext", complexNatbibCitation NormalCitation) + , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> + complexNatbibCitation AuthorInText) + <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) + , ("hyperlink", hyperlink) + , ("hypertarget", hypertargetInline) + -- glossaries package + , ("gls", doAcronym "short") + , ("Gls", doAcronym "short") + , ("glsdesc", doAcronym "long") + , ("Glsdesc", doAcronym "long") + , ("GLSdesc", doAcronym "long") + , ("acrlong", doAcronym "long") + , ("Acrlong", doAcronym "long") + , ("acrfull", doAcronym "full") + , ("Acrfull", doAcronym "full") + , ("acrshort", doAcronym "abbrv") + , ("Acrshort", doAcronym "abbrv") + , ("glspl", doAcronymPlural "short") + , ("Glspl", doAcronymPlural "short") + , ("glsdescplural", doAcronymPlural "long") + , ("Glsdescplural", doAcronymPlural "long") + , ("GLSdescplural", doAcronymPlural "long") + -- acronyms package + , ("ac", doAcronym "short") + , ("acf", doAcronym "full") + , ("acs", doAcronym "abbrv") + , ("acp", doAcronymPlural "short") + , ("acfp", doAcronymPlural "full") + , ("acsp", doAcronymPlural "abbrv") + -- siuntix + , ("SI", dosiunitx) + -- hyphenat + , ("bshyp", lit "\\\173") + , ("fshyp", lit "/\173") + , ("dothyp", lit ".\173") + , ("colonhyp", lit ":\173") + , ("hyp", lit "-") + , ("nohyphens", tok) + , ("textnhtt", ttfamily) + , ("nhttfamily", ttfamily) + -- LaTeX colors + , ("textcolor", coloredInline "color") + , ("colorbox", coloredInline "background-color") + -- fontawesome + , ("faCheck", lit "\10003") + , ("faClose", lit "\10007") + -- xspace + , ("xspace", doxspace) + -- etoolbox + , ("ifstrequal", ifstrequal) + , ("newtoggle", braced >>= newToggle) + , ("toggletrue", braced >>= setToggle True) + , ("togglefalse", braced >>= setToggle False) + , ("iftoggle", try $ ifToggle >> inline) + -- biblatex misc + , ("RN", romanNumeralUpper) + , ("Rn", romanNumeralLower) + -- babel + , ("foreignlanguage", foreignlanguage) + ] -include' :: IncludeParser -include' = do - fs' <- try $ do - char '\\' - name <- try (string "include") - <|> try (string "input") - <|> string "usepackage" - -- skip options - skipMany $ try $ char '[' *> manyTill anyChar (char ']') - fs <- (map trim . splitBy (==',')) <$> braced' - return $ if name == "usepackage" - then map (maybeAddExtension ".sty") fs - else map (maybeAddExtension ".tex") fs - pos <- getPosition - containers <- getState - let fn = case containers of - (f':_) -> f' - [] -> "input" - -- now process each include file in order... - rest <- getInput - results' <- forM fs' (\f -> do - when (f `elem` containers) $ - fail "Include file loop!" - contents <- lift $ readTeXFile f - return $ "\\PandocStartInclude{" ++ f ++ "}" ++ - contents ++ "\\PandocEndInclude{" ++ - fn ++ "}{" ++ show (sourceLine pos) ++ "}{" - ++ show (sourceColumn pos) ++ "}") - setInput $ concat results' ++ rest - return "" - -startMarker' :: IncludeParser -startMarker' = try $ do - string "\\PandocStartInclude" - fn <- braced' - updateState (fn:) - setPosition $ newPos fn 1 1 - return $ "\\PandocStartInclude{" ++ fn ++ "}" - -endMarker' :: IncludeParser -endMarker' = try $ do - string "\\PandocEndInclude" - fn <- braced' - ln <- braced' - co <- braced' - updateState tail - setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) - return $ "\\PandocEndInclude{" ++ fn ++ "}{" ++ ln ++ "}{" ++ - co ++ "}" - -readTeXFile :: FilePath -> IO String -readTeXFile f = do - texinputs <- E.catch (getEnv "TEXINPUTS") $ \(_ :: E.SomeException) -> - return "." - let ds = splitBy (==':') texinputs - readFileFromDirs ds f - -readFileFromDirs :: [FilePath] -> FilePath -> IO String -readFileFromDirs [] _ = return "" -readFileFromDirs (d:ds) f = - E.catch (UTF8.readFile $ d </> f) $ \(_ :: E.SomeException) -> - readFileFromDirs ds f - ----- - -keyval :: LP (String, String) -keyval = try $ do - key <- many1 alphaNum - val <- option "" $ char '=' >> many1 (alphaNum <|> char '.' <|> char '\\') - skipMany spaceChar - optional (char ',') - skipMany spaceChar - return (key, val) +foreignlanguage :: PandocMonad m => LP m Inlines +foreignlanguage = do + babelLang <- T.unpack . untokenize <$> braced + case babelLangToBCP47 babelLang of + Just lang -> spanWith ("", [], [("lang", renderLang lang)]) <$> tok + _ -> tok +inlineLanguageCommands :: PandocMonad m => M.Map Text (LP m Inlines) +inlineLanguageCommands = M.fromList $ mk <$> M.toList polyglossiaLangToBCP47 + where + mk (polyglossia, bcp47Func) = + ("text" <> T.pack polyglossia, inlineLanguage bcp47Func) + +inlineLanguage :: PandocMonad m => (String -> Lang) -> LP m Inlines +inlineLanguage bcp47Func = do + o <- option "" $ (T.unpack . T.filter (\c -> c /= '[' && c /= ']')) + <$> rawopt + let lang = renderLang $ bcp47Func o + extractSpaces (spanWith ("", [], [("lang", lang)])) <$> tok + +hyperlink :: PandocMonad m => LP m Inlines +hyperlink = try $ do + src <- toksToString <$> braced + lab <- tok + return $ link ('#':src) "" lab + +hypertargetBlock :: PandocMonad m => LP m Blocks +hypertargetBlock = try $ do + ref <- toksToString <$> braced + bs <- grouped block + case toList bs of + [Header 1 (ident,_,_) _] | ident == ref -> return bs + _ -> return $ divWith (ref, [], []) bs + +hypertargetInline :: PandocMonad m => LP m Inlines +hypertargetInline = try $ do + ref <- toksToString <$> braced + ils <- grouped inline + return $ spanWith (ref, [], []) ils -keyvals :: LP [(String, String)] -keyvals = try $ char '[' *> manyTill keyval (char ']') +romanNumeralUpper :: (PandocMonad m) => LP m Inlines +romanNumeralUpper = + str . toRomanNumeral <$> romanNumeralArg -alltt :: String -> LP Blocks -alltt t = walk strToCode <$> parseFromString blocks - (substitute " " "\\ " $ substitute "%" "\\%" $ - intercalate "\\\\\n" $ lines t) - where strToCode (Str s) = Code nullAttr s - strToCode x = x +romanNumeralLower :: (PandocMonad m) => LP m Inlines +romanNumeralLower = + str . map toLower . toRomanNumeral <$> romanNumeralArg -rawLaTeXBlock :: LP String -rawLaTeXBlock = snd <$> try (withRaw (environment <|> blockCommand)) +romanNumeralArg :: (PandocMonad m) => LP m Int +romanNumeralArg = spaces *> (parser <|> inBraces) + where + inBraces = do + symbol '{' + spaces + res <- parser + spaces + symbol '}' + return res + parser = do + Tok _ Word s <- satisfyTok isWordTok + let (digits, rest) = T.span isDigit s + unless (T.null rest) $ + fail "Non-digits in argument to \\Rn or \\RN" + safeRead $ T.unpack digits + +newToggle :: (Monoid a, PandocMonad m) => [Tok] -> LP m a +newToggle name = do + updateState $ \st -> + st{ sToggles = M.insert (toksToString name) False (sToggles st) } + return mempty -rawLaTeXInline :: LP Inline -rawLaTeXInline = do - raw <- (snd <$> withRaw inlineCommand) <|> (snd <$> withRaw blockCommand) - RawInline "latex" <$> applyMacros' raw +setToggle :: (Monoid a, PandocMonad m) => Bool -> [Tok] -> LP m a +setToggle on name = do + updateState $ \st -> + st{ sToggles = M.adjust (const on) (toksToString name) (sToggles st) } + return mempty -addImageCaption :: Blocks -> LP Blocks -addImageCaption = walkM go - where go (Image attr alt (src,tit)) = do - mbcapt <- stateCaption <$> getState - return $ case mbcapt of - Just ils -> Image attr (toList ils) (src, "fig:") - Nothing -> Image attr alt (src,tit) - go x = return x +ifToggle :: PandocMonad m => LP m () +ifToggle = do + name <- braced + spaces + yes <- braced + spaces + no <- braced + toggles <- sToggles <$> getState + inp <- getInput + let name' = toksToString name + case M.lookup name' toggles of + Just True -> setInput (yes ++ inp) + Just False -> setInput (no ++ inp) + Nothing -> do + pos <- getPosition + report $ UndefinedToggle name' pos + return () -addTableCaption :: Blocks -> LP Blocks -addTableCaption = walkM go - where go (Table c als ws hs rs) = do - mbcapt <- stateCaption <$> getState - return $ case mbcapt of - Just ils -> Table (toList ils) als ws hs rs - Nothing -> Table c als ws hs rs - go x = return x +doTerm :: PandocMonad m => Translations.Term -> LP m Inlines +doTerm term = str <$> translateTerm term + +ifstrequal :: (PandocMonad m, Monoid a) => LP m a +ifstrequal = do + str1 <- tok + str2 <- tok + ifequal <- braced + ifnotequal <- braced + if str1 == str2 + then getInput >>= setInput . (ifequal ++) + else getInput >>= setInput . (ifnotequal ++) + return mempty -environments :: M.Map String (LP Blocks) -environments = M.fromList - [ ("document", env "document" blocks <* skipMany anyChar) - , ("abstract", mempty <$ (env "abstract" blocks >>= addMeta "abstract")) - , ("letter", env "letter" letterContents) - , ("minipage", env "minipage" $ - skipopts *> spaces' *> optional braced *> spaces' *> blocks) - , ("figure", env "figure" $ - resetCaption *> skipopts *> blocks >>= addImageCaption) - , ("center", env "center" blocks) - , ("longtable", env "longtable" $ - resetCaption *> simpTable False >>= addTableCaption) - , ("table", env "table" $ - resetCaption *> skipopts *> blocks >>= addTableCaption) - , ("tabular*", env "tabular" $ simpTable True) - , ("tabular", env "tabular" $ simpTable False) - , ("quote", blockQuote <$> env "quote" blocks) - , ("quotation", blockQuote <$> env "quotation" blocks) - , ("verse", blockQuote <$> env "verse" blocks) - , ("itemize", bulletList <$> listenv "itemize" (many item)) - , ("description", definitionList <$> listenv "description" (many descItem)) - , ("enumerate", orderedList') - , ("alltt", alltt =<< verbEnv "alltt") - , ("code", guardEnabled Ext_literate_haskell *> - (codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> - verbEnv "code")) - , ("comment", mempty <$ verbEnv "comment") - , ("verbatim", codeBlock <$> verbEnv "verbatim") - , ("Verbatim", fancyverbEnv "Verbatim") - , ("BVerbatim", fancyverbEnv "BVerbatim") - , ("lstlisting", do options <- option [] keyvals - let kvs = [ (if k == "firstnumber" - then "startFrom" - else k, v) | (k,v) <- options ] - let classes = [ "numberLines" | - lookup "numbers" options == Just "left" ] - ++ maybeToList (lookup "language" options - >>= fromListingsLanguage) - let attr = (fromMaybe "" (lookup "label" options),classes,kvs) - codeBlockWith attr <$> verbEnv "lstlisting") - , ("minted", do options <- option [] keyvals - lang <- grouped (many1 $ satisfy (/='}')) - let kvs = [ (if k == "firstnumber" - then "startFrom" - else k, v) | (k,v) <- options ] - let classes = [ lang | not (null lang) ] ++ - [ "numberLines" | - lookup "linenos" options == Just "true" ] - let attr = ("",classes,kvs) - codeBlockWith attr <$> verbEnv "minted") - , ("obeylines", parseFromString - (para . trimInlines . mconcat <$> many inline) =<< - intercalate "\\\\\n" . lines <$> verbEnv "obeylines") - , ("displaymath", mathEnv para Nothing "displaymath") - , ("equation", mathEnv para Nothing "equation") - , ("equation*", mathEnv para Nothing "equation*") - , ("gather", mathEnv para (Just "gathered") "gather") - , ("gather*", mathEnv para (Just "gathered") "gather*") - , ("multline", mathEnv para (Just "gathered") "multline") - , ("multline*", mathEnv para (Just "gathered") "multline*") - , ("eqnarray", mathEnv para (Just "aligned") "eqnarray") - , ("eqnarray*", mathEnv para (Just "aligned") "eqnarray*") - , ("align", mathEnv para (Just "aligned") "align") - , ("align*", mathEnv para (Just "aligned") "align*") - , ("alignat", mathEnv para (Just "aligned") "alignat") - , ("alignat*", mathEnv para (Just "aligned") "alignat*") +coloredInline :: PandocMonad m => String -> LP m Inlines +coloredInline stylename = do + skipopts + color <- braced + spanWith ("",[],[("style",stylename ++ ": " ++ toksToString color)]) <$> tok + +ttfamily :: PandocMonad m => LP m Inlines +ttfamily = (code . stringify . toList) <$> tok + +rawInlineOr :: PandocMonad m => Text -> LP m Inlines -> LP m Inlines +rawInlineOr name' fallback = do + parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions + if parseRaw + then rawInline "latex" <$> getRawCommand name' ("\\" <> name') + else fallback + +getRawCommand :: PandocMonad m => Text -> Text -> LP m String +getRawCommand name txt = do + (_, rawargs) <- withRaw $ + case name of + "write" -> do + void $ satisfyTok isWordTok -- digits + void braced + "titleformat" -> do + void braced + skipopts + void $ count 4 braced + "def" -> + void $ manyTill anyTok braced + _ -> do + skipangles + skipopts + option "" (try (optional sp *> dimenarg)) + void $ many braced + return $ T.unpack (txt <> untokenize rawargs) + +isBlockCommand :: Text -> Bool +isBlockCommand s = + s `M.member` (blockCommands :: M.Map Text (LP PandocPure Blocks)) + || s `Set.member` treatAsBlock + +treatAsBlock :: Set.Set Text +treatAsBlock = Set.fromList + [ "let", "def", "DeclareRobustCommand" + , "newcommand", "renewcommand" + , "newenvironment", "renewenvironment" + , "providecommand", "provideenvironment" + -- newcommand, etc. should be parsed by macroDef, but we need this + -- here so these aren't parsed as inline commands to ignore + , "special", "pdfannot", "pdfstringdef" + , "bibliographystyle" + , "maketitle", "makeindex", "makeglossary" + , "addcontentsline", "addtocontents", "addtocounter" + -- \ignore{} is used conventionally in literate haskell for definitions + -- that are to be processed by the compiler but not printed. + , "ignore" + , "hyperdef" + , "markboth", "markright", "markleft" + , "hspace", "vspace" + , "newpage" + , "clearpage" + , "pagebreak" + , "titleformat" + ] + +isInlineCommand :: Text -> Bool +isInlineCommand s = + s `M.member` (inlineCommands :: M.Map Text (LP PandocPure Inlines)) + || s `Set.member` treatAsInline + +treatAsInline :: Set.Set Text +treatAsInline = Set.fromList + [ "index" + , "hspace" + , "vspace" + , "noindent" + , "newpage" + , "clearpage" + , "pagebreak" ] -letterContents :: LP Blocks -letterContents = do - bs <- blocks - st <- getState - -- add signature (author) and address (title) - let addr = case lookupMeta "address" (stateMeta st) of - Just (MetaBlocks [Plain xs]) -> - para $ trimInlines $ fromList xs - _ -> mempty - return $ addr <> bs -- sig added by \closing +dolabel :: PandocMonad m => LP m Inlines +dolabel = do + v <- braced + let refstr = toksToString v + return $ spanWith (refstr,[],[("label", refstr)]) + $ inBrackets $ str $ toksToString v + +doref :: PandocMonad m => String -> LP m Inlines +doref cls = do + v <- braced + let refstr = toksToString v + return $ linkWith ("",[],[ ("reference-type", cls) + , ("reference", refstr)]) + ('#':refstr) + "" + (inBrackets $ str refstr) + +lookupListDefault :: (Show k, Ord k) => v -> [k] -> M.Map k v -> v +lookupListDefault d = (fromMaybe d .) . lookupList + where lookupList l m = msum $ map (`M.lookup` m) l + +inline :: PandocMonad m => LP m Inlines +inline = (mempty <$ comment) + <|> (space <$ whitespace) + <|> (softbreak <$ endline) + <|> word + <|> inlineCommand' + <|> inlineEnvironment + <|> inlineGroup + <|> (symbol '-' *> + option (str "-") (symbol '-' *> + option (str "–") (str "—" <$ symbol '-'))) + <|> doubleQuote + <|> singleQuote + <|> (str "”" <$ try (symbol '\'' >> symbol '\'')) + <|> (str "”" <$ symbol '”') + <|> (str "’" <$ symbol '\'') + <|> (str "’" <$ symbol '’') + <|> (str "\160" <$ symbol '~') + <|> dollarsMath + <|> (guardEnabled Ext_literate_haskell *> symbol '|' *> doLHSverb) + <|> (str . (:[]) <$> primEscape) + <|> regularSymbol + <|> (do res <- symbolIn "#^'`\"[]&" + pos <- getPosition + let s = T.unpack (untoken res) + report $ ParsingUnescaped s pos + return $ str s) + +inlines :: PandocMonad m => LP m Inlines +inlines = mconcat <$> many inline + +-- block elements: + +begin_ :: PandocMonad m => Text -> LP m () +begin_ t = try (do + controlSeq "begin" + spaces + txt <- untokenize <$> braced + guard (t == txt)) <?> ("\\begin{" ++ T.unpack t ++ "}") + +end_ :: PandocMonad m => Text -> LP m () +end_ t = try (do + controlSeq "end" + spaces + txt <- untokenize <$> braced + guard $ t == txt) <?> ("\\end{" ++ T.unpack t ++ "}") + +preamble :: PandocMonad m => LP m Blocks +preamble = mempty <$ many preambleBlock + where preambleBlock = spaces1 + <|> void include + <|> void macroDef + <|> void blockCommand + <|> void braced + <|> (notFollowedBy (begin_ "document") >> void anyTok) + +paragraph :: PandocMonad m => LP m Blocks +paragraph = do + x <- trimInlines . mconcat <$> many1 inline + if x == mempty + then return mempty + else return $ para x + +include :: PandocMonad m => LP m Blocks +include = do + (Tok _ (CtrlSeq name) _) <- + controlSeq "include" <|> controlSeq "input" <|> + controlSeq "subfile" <|> controlSeq "usepackage" + skipMany opt + fs <- (map (T.unpack . removeDoubleQuotes . T.strip) . T.splitOn "," . + untokenize) <$> braced + let fs' = if name == "usepackage" + then map (maybeAddExtension ".sty") fs + else map (maybeAddExtension ".tex") fs + dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" + mapM_ (insertIncluded dirs) fs' + return mempty + +insertIncluded :: PandocMonad m + => [FilePath] + -> FilePath + -> LP m () +insertIncluded dirs f = do + pos <- getPosition + containers <- getIncludeFiles <$> getState + when (f `elem` containers) $ + throwError $ PandocParseError $ "Include file loop at " ++ show pos + updateState $ addIncludeFile f + mbcontents <- readFileFromDirs dirs f + contents <- case mbcontents of + Just s -> return s + Nothing -> do + report $ CouldNotLoadIncludeFile f pos + return "" + getInput >>= setInput . (tokenize f (T.pack contents) ++) + updateState dropLatestIncludeFile + +maybeAddExtension :: String -> FilePath -> FilePath +maybeAddExtension ext fp = + if null (takeExtension fp) + then addExtension fp ext + else fp + +addMeta :: PandocMonad m => ToMetaValue a => String -> a -> LP m () +addMeta field val = updateState $ \st -> + st{ sMeta = addMetaField field val $ sMeta st } + +authors :: PandocMonad m => LP m () +authors = try $ do + bgroup + let oneAuthor = mconcat <$> + many1 (notFollowedBy' (controlSeq "and") >> + (inline <|> mempty <$ blockCommand)) + -- skip e.g. \vspace{10pt} + auths <- sepBy oneAuthor (controlSeq "and") + egroup + addMeta "author" (map trimInlines auths) + +macroDef :: PandocMonad m => LP m Blocks +macroDef = + mempty <$ ((commandDef <|> environmentDef) <* doMacros 0) + where commandDef = do + (name, macro') <- newcommand <|> letmacro <|> defmacro + guardDisabled Ext_latex_macros <|> + updateState (\s -> s{ sMacros = M.insert name macro' (sMacros s) }) + environmentDef = do + (name, macro1, macro2) <- newenvironment + guardDisabled Ext_latex_macros <|> + do updateState $ \s -> s{ sMacros = + M.insert name macro1 (sMacros s) } + updateState $ \s -> s{ sMacros = + M.insert ("end" <> name) macro2 (sMacros s) } + -- @\newenvironment{envname}[n-args][default]{begin}{end}@ + -- is equivalent to + -- @\newcommand{\envname}[n-args][default]{begin}@ + -- @\newcommand{\endenvname}@ + +letmacro :: PandocMonad m => LP m (Text, Macro) +letmacro = do + controlSeq "let" + Tok _ (CtrlSeq name) _ <- anyControlSeq + optional $ symbol '=' + spaces + contents <- bracedOrToken + return (name, Macro ExpandWhenDefined 0 Nothing contents) + +defmacro :: PandocMonad m => LP m (Text, Macro) +defmacro = try $ do + controlSeq "def" + Tok _ (CtrlSeq name) _ <- anyControlSeq + numargs <- option 0 $ argSeq 1 + -- we use withVerbatimMode, because macros are to be expanded + -- at point of use, not point of definition + contents <- withVerbatimMode bracedOrToken + return (name, Macro ExpandWhenUsed numargs Nothing contents) + +-- Note: we don't yet support fancy things like #1.#2 +argSeq :: PandocMonad m => Int -> LP m Int +argSeq n = do + Tok _ (Arg i) _ <- satisfyTok isArgTok + guard $ i == n + argSeq (n+1) <|> return n + +isArgTok :: Tok -> Bool +isArgTok (Tok _ (Arg _) _) = True +isArgTok _ = False + +bracedOrToken :: PandocMonad m => LP m [Tok] +bracedOrToken = braced <|> ((:[]) <$> (anyControlSeq <|> singleChar)) + +newcommand :: PandocMonad m => LP m (Text, Macro) +newcommand = do + pos <- getPosition + Tok _ (CtrlSeq mtype) _ <- controlSeq "newcommand" <|> + controlSeq "renewcommand" <|> + controlSeq "providecommand" <|> + controlSeq "DeclareRobustCommand" + optional $ symbol '*' + Tok _ (CtrlSeq name) txt <- withVerbatimMode $ anyControlSeq <|> + (symbol '{' *> spaces *> anyControlSeq <* spaces <* symbol '}') + spaces + numargs <- option 0 $ try bracketedNum + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + spaces + contents <- withVerbatimMode bracedOrToken + when (mtype == "newcommand") $ do + macros <- sMacros <$> getState + case M.lookup name macros of + Just _ -> report $ MacroAlreadyDefined (T.unpack txt) pos + Nothing -> return () + return (name, Macro ExpandWhenUsed numargs optarg contents) + +newenvironment :: PandocMonad m => LP m (Text, Macro, Macro) +newenvironment = do + pos <- getPosition + Tok _ (CtrlSeq mtype) _ <- controlSeq "newenvironment" <|> + controlSeq "renewenvironment" <|> + controlSeq "provideenvironment" + optional $ symbol '*' + spaces + name <- untokenize <$> braced + spaces + numargs <- option 0 $ try bracketedNum + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + spaces + startcontents <- withVerbatimMode bracedOrToken + spaces + endcontents <- withVerbatimMode bracedOrToken + when (mtype == "newenvironment") $ do + macros <- sMacros <$> getState + case M.lookup name macros of + Just _ -> report $ MacroAlreadyDefined (T.unpack name) pos + Nothing -> return () + return (name, Macro ExpandWhenUsed numargs optarg startcontents, + Macro ExpandWhenUsed 0 Nothing endcontents) + +bracketedToks :: PandocMonad m => LP m [Tok] +bracketedToks = do + symbol '[' + mconcat <$> manyTill (braced <|> (:[]) <$> anyTok) (symbol ']') + +bracketedNum :: PandocMonad m => LP m Int +bracketedNum = do + ds <- untokenize <$> bracketedToks + case safeRead (T.unpack ds) of + Just i -> return i + _ -> return 0 + +setCaption :: PandocMonad m => LP m Blocks +setCaption = do + ils <- tok + mblabel <- option Nothing $ + try $ spaces >> controlSeq "label" >> (Just <$> tok) + let ils' = case mblabel of + Just lab -> ils <> spanWith + ("",[],[("label", stringify lab)]) mempty + Nothing -> ils + updateState $ \st -> st{ sCaption = Just ils' } + return mempty -closing :: LP Blocks +looseItem :: PandocMonad m => LP m Blocks +looseItem = do + inListItem <- sInListItem <$> getState + guard $ not inListItem + skipopts + return mempty + +resetCaption :: PandocMonad m => LP m () +resetCaption = updateState $ \st -> st{ sCaption = Nothing } + +section :: PandocMonad m => Bool -> Attr -> Int -> LP m Blocks +section starred (ident, classes, kvs) lvl = do + skipopts + contents <- grouped inline + lab <- option ident $ + try (spaces >> controlSeq "label" + >> spaces >> toksToString <$> braced) + let classes' = if starred then "unnumbered" : classes else classes + unless starred $ do + hn <- sLastHeaderNum <$> getState + let num = incrementHeaderNum lvl hn + updateState $ \st -> st{ sLastHeaderNum = num } + updateState $ \st -> st{ sLabels = M.insert lab + [Str (renderHeaderNum num)] + (sLabels st) } + attr' <- registerHeader (lab, classes', kvs) contents + return $ headerWith attr' lvl contents + +blockCommand :: PandocMonad m => LP m Blocks +blockCommand = try $ do + Tok _ (CtrlSeq name) txt <- anyControlSeq + guard $ name /= "begin" && name /= "end" + star <- option "" ("*" <$ symbol '*' <* optional sp) + let name' = name <> star + let names = ordNub [name', name] + let rawDefiniteBlock = do + guard $ isBlockCommand name + rawBlock "latex" <$> getRawCommand name (txt <> star) + -- heuristic: if it could be either block or inline, we + -- treat it if block if we have a sequence of block + -- commands followed by a newline. But we stop if we + -- hit a \startXXX, since this might start a raw ConTeXt + -- environment (this is important because this parser is + -- used by the Markdown reader). + let startCommand = try $ do + Tok _ (CtrlSeq n) _ <- anyControlSeq + guard $ "start" `T.isPrefixOf` n + let rawMaybeBlock = try $ do + guard $ not $ isInlineCommand name + curr <- rawBlock "latex" <$> getRawCommand name (txt <> star) + rest <- many $ notFollowedBy startCommand *> blockCommand + lookAhead $ blankline <|> startCommand + return $ curr <> mconcat rest + let raw = rawDefiniteBlock <|> rawMaybeBlock + lookupListDefault raw names blockCommands + +closing :: PandocMonad m => LP m Blocks closing = do contents <- tok st <- getState let extractInlines (MetaBlocks [Plain ys]) = ys extractInlines (MetaBlocks [Para ys ]) = ys - extractInlines _ = [] - let sigs = case lookupMeta "author" (stateMeta st) of + extractInlines _ = [] + let sigs = case lookupMeta "author" (sMeta st) of Just (MetaList xs) -> para $ trimInlines $ fromList $ intercalate [LineBreak] $ map extractInlines xs _ -> mempty return $ para (trimInlines contents) <> sigs -item :: LP Blocks -item = blocks *> controlSeq "item" *> skipopts *> blocks - -looseItem :: LP Blocks -looseItem = do - ctx <- stateParserContext `fmap` getState - if ctx == ListItemState - then mzero - else return mempty - -descItem :: LP (Inlines, [Blocks]) -descItem = do - blocks -- skip blocks before item - controlSeq "item" - optional sp - ils <- opt - bs <- blocks - return (ils, [bs]) - -env :: String -> LP a -> LP a -env name p = p <* - (try (controlSeq "end" *> braced >>= guard . (== name)) - <?> ("\\end{" ++ name ++ "}")) +blockCommands :: PandocMonad m => M.Map Text (LP m Blocks) +blockCommands = M.fromList + [ ("par", mempty <$ skipopts) + , ("parbox", skipopts >> braced >> grouped blocks) + , ("title", mempty <$ (skipopts *> + (grouped inline >>= addMeta "title") + <|> (grouped block >>= addMeta "title"))) + , ("subtitle", mempty <$ (skipopts *> tok >>= addMeta "subtitle")) + , ("author", mempty <$ (skipopts *> authors)) + -- -- in letter class, temp. store address & sig as title, author + , ("address", mempty <$ (skipopts *> tok >>= addMeta "address")) + , ("signature", mempty <$ (skipopts *> authors)) + , ("date", mempty <$ (skipopts *> tok >>= addMeta "date")) + -- Koma-script metadata commands + , ("dedication", mempty <$ (skipopts *> tok >>= addMeta "dedication")) + -- sectioning + , ("part", section False nullAttr (-1)) + , ("part*", section True nullAttr (-1)) + , ("chapter", section False nullAttr 0) + , ("chapter*", section True ("",["unnumbered"],[]) 0) + , ("section", section False nullAttr 1) + , ("section*", section True ("",["unnumbered"],[]) 1) + , ("subsection", section False nullAttr 2) + , ("subsection*", section True ("",["unnumbered"],[]) 2) + , ("subsubsection", section False nullAttr 3) + , ("subsubsection*", section True ("",["unnumbered"],[]) 3) + , ("paragraph", section False nullAttr 4) + , ("paragraph*", section True ("",["unnumbered"],[]) 4) + , ("subparagraph", section False nullAttr 5) + , ("subparagraph*", section True ("",["unnumbered"],[]) 5) + -- beamer slides + , ("frametitle", section False nullAttr 3) + , ("framesubtitle", section False nullAttr 4) + -- letters + , ("opening", (para . trimInlines) <$> (skipopts *> tok)) + , ("closing", skipopts *> closing) + -- memoir + , ("plainbreak", braced >> pure horizontalRule) + , ("plainbreak*", braced >> pure horizontalRule) + , ("fancybreak", braced >> pure horizontalRule) + , ("fancybreak*", braced >> pure horizontalRule) + , ("plainfancybreak", braced >> braced >> braced >> pure horizontalRule) + , ("plainfancybreak*", braced >> braced >> braced >> pure horizontalRule) + , ("pfbreak", pure horizontalRule) + , ("pfbreak*", pure horizontalRule) + -- + , ("hrule", pure horizontalRule) + , ("strut", pure mempty) + , ("rule", skipopts *> tok *> tok *> pure horizontalRule) + , ("item", looseItem) + , ("documentclass", skipopts *> braced *> preamble) + , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) + , ("caption", skipopts *> setCaption) + , ("bibliography", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs . toksToString)) + , ("addbibresource", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs . toksToString)) + -- includes + , ("lstinputlisting", inputListing) + , ("graphicspath", graphicsPath) + -- polyglossia + , ("setdefaultlanguage", setDefaultLanguage) + , ("setmainlanguage", setDefaultLanguage) + -- hyperlink + , ("hypertarget", hypertargetBlock) + -- LaTeX colors + , ("textcolor", coloredBlock "color") + , ("colorbox", coloredBlock "background-color") + ] + + +environments :: PandocMonad m => M.Map Text (LP m Blocks) +environments = M.fromList + [ ("document", env "document" blocks) + , ("abstract", mempty <$ (env "abstract" blocks >>= addMeta "abstract")) + , ("letter", env "letter" letterContents) + , ("minipage", env "minipage" $ + skipopts *> spaces *> optional braced *> spaces *> blocks) + , ("figure", env "figure" $ skipopts *> figure) + , ("subfigure", env "subfigure" $ skipopts *> tok *> figure) + , ("center", env "center" blocks) + , ("longtable", env "longtable" $ + resetCaption *> simpTable "longtable" False >>= addTableCaption) + , ("table", env "table" $ + resetCaption *> skipopts *> blocks >>= addTableCaption) + , ("tabular*", env "tabular*" $ simpTable "tabular*" True) + , ("tabularx", env "tabularx" $ simpTable "tabularx" True) + , ("tabular", env "tabular" $ simpTable "tabular" False) + , ("quote", blockQuote <$> env "quote" blocks) + , ("quotation", blockQuote <$> env "quotation" blocks) + , ("verse", blockQuote <$> env "verse" blocks) + , ("itemize", bulletList <$> listenv "itemize" (many item)) + , ("description", definitionList <$> listenv "description" (many descItem)) + , ("enumerate", orderedList') + , ("alltt", alltt <$> env "alltt" blocks) + , ("code", guardEnabled Ext_literate_haskell *> + (codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> + verbEnv "code")) + , ("comment", mempty <$ verbEnv "comment") + , ("verbatim", codeBlock <$> verbEnv "verbatim") + , ("Verbatim", fancyverbEnv "Verbatim") + , ("BVerbatim", fancyverbEnv "BVerbatim") + , ("lstlisting", do attr <- parseListingsOptions <$> option [] keyvals + codeBlockWith attr <$> verbEnv "lstlisting") + , ("minted", minted) + , ("obeylines", obeylines) + , ("displaymath", mathEnvWith para Nothing "displaymath") + , ("equation", mathEnvWith para Nothing "equation") + , ("equation*", mathEnvWith para Nothing "equation*") + , ("gather", mathEnvWith para (Just "gathered") "gather") + , ("gather*", mathEnvWith para (Just "gathered") "gather*") + , ("multline", mathEnvWith para (Just "gathered") "multline") + , ("multline*", mathEnvWith para (Just "gathered") "multline*") + , ("eqnarray", mathEnvWith para (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnvWith para (Just "aligned") "eqnarray*") + , ("align", mathEnvWith para (Just "aligned") "align") + , ("align*", mathEnvWith para (Just "aligned") "align*") + , ("alignat", mathEnvWith para (Just "aligned") "alignat") + , ("alignat*", mathEnvWith para (Just "aligned") "alignat*") + , ("tikzpicture", rawVerbEnv "tikzpicture") + -- etoolbox + , ("ifstrequal", ifstrequal) + , ("newtoggle", braced >>= newToggle) + , ("toggletrue", braced >>= setToggle True) + , ("togglefalse", braced >>= setToggle False) + , ("iftoggle", try $ ifToggle >> block) + ] + +environment :: PandocMonad m => LP m Blocks +environment = do + controlSeq "begin" + name <- untokenize <$> braced + M.findWithDefault mzero name environments + <|> rawEnv name -listenv :: String -> LP a -> LP a -listenv name p = try $ do - oldCtx <- stateParserContext `fmap` getState - updateState $ \st -> st{ stateParserContext = ListItemState } - res <- env name p - updateState $ \st -> st{ stateParserContext = oldCtx } - return res +env :: PandocMonad m => Text -> LP m a -> LP m a +env name p = p <* end_ name -mathEnv :: (Inlines -> a) -> Maybe String -> String -> LP a -mathEnv f innerEnv name = f <$> mathDisplay (inner <$> verbEnv name) - where inner x = case innerEnv of - Nothing -> x - Just y -> "\\begin{" ++ y ++ "}\n" ++ x ++ - "\\end{" ++ y ++ "}" +rawEnv :: PandocMonad m => Text -> LP m Blocks +rawEnv name = do + exts <- getOption readerExtensions + let parseRaw = extensionEnabled Ext_raw_tex exts + rawOptions <- mconcat <$> many rawopt + let beginCommand = "\\begin{" <> name <> "}" <> rawOptions + pos1 <- getPosition + (bs, raw) <- withRaw $ env name blocks + if parseRaw + then return $ rawBlock "latex" + $ T.unpack $ beginCommand <> untokenize raw + else do + report $ SkippedContent (T.unpack beginCommand) pos1 + pos2 <- getPosition + report $ SkippedContent ("\\end{" ++ T.unpack name ++ "}") pos2 + return bs + +rawVerbEnv :: PandocMonad m => Text -> LP m Blocks +rawVerbEnv name = do + pos <- getPosition + (_, raw) <- withRaw $ verbEnv name + let raw' = "\\begin{tikzpicture}" ++ toksToString raw + exts <- getOption readerExtensions + let parseRaw = extensionEnabled Ext_raw_tex exts + if parseRaw + then return $ rawBlock "latex" raw' + else do + report $ SkippedContent raw' pos + return mempty -verbEnv :: String -> LP String -verbEnv name = do +verbEnv :: PandocMonad m => Text -> LP m String +verbEnv name = withVerbatimMode $ do skipopts optional blankline - let endEnv = try $ controlSeq "end" *> braced >>= guard . (== name) - res <- manyTill anyChar endEnv - return $ stripTrailingNewlines res + res <- manyTill anyTok (end_ name) + return $ stripTrailingNewlines $ toksToString res -fancyverbEnv :: String -> LP Blocks +fancyverbEnv :: PandocMonad m => Text -> LP m Blocks fancyverbEnv name = do options <- option [] keyvals let kvs = [ (if k == "firstnumber" @@ -1242,142 +2207,176 @@ fancyverbEnv name = do let attr = ("",classes,kvs) codeBlockWith attr <$> verbEnv name -orderedList' :: LP Blocks -orderedList' = do - optional sp - (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ - try $ char '[' *> anyOrderedListMarker <* char ']' - spaces - optional $ try $ controlSeq "setlength" *> grouped (controlSeq "itemindent") *> braced - spaces - start <- option 1 $ try $ do controlSeq "setcounter" - grouped (string "enum" *> many1 (oneOf "iv")) - optional sp - num <- grouped (many1 digit) - spaces - return (read num + 1 :: Int) - bs <- listenv "enumerate" (many item) - return $ orderedListWith (start, style, delim) bs - -paragraph :: LP Blocks -paragraph = do - x <- trimInlines . mconcat <$> many1 inline - if x == mempty - then return mempty - else return $ para x - -preamble :: LP Blocks -preamble = mempty <$> manyTill preambleBlock beginDoc - where beginDoc = lookAhead $ try $ controlSeq "begin" *> string "{document}" - preambleBlock = void comment - <|> void sp - <|> void blanklines - <|> void macro - <|> void blockCommand - <|> void anyControlSeq - <|> void braced - <|> void anyChar - -------- +obeylines :: PandocMonad m => LP m Blocks +obeylines = + para . fromList . removeLeadingTrailingBreaks . + walk softBreakToHard . toList <$> env "obeylines" inlines + where softBreakToHard SoftBreak = LineBreak + softBreakToHard x = x + removeLeadingTrailingBreaks = reverse . dropWhile isLineBreak . + reverse . dropWhile isLineBreak + isLineBreak LineBreak = True + isLineBreak _ = False + +minted :: PandocMonad m => LP m Blocks +minted = do + options <- option [] keyvals + lang <- toksToString <$> braced + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + let classes = [ lang | not (null lang) ] ++ + [ "numberLines" | + lookup "linenos" options == Just "true" ] + let attr = ("",classes,kvs) + codeBlockWith attr <$> verbEnv "minted" --- citations +letterContents :: PandocMonad m => LP m Blocks +letterContents = do + bs <- blocks + st <- getState + -- add signature (author) and address (title) + let addr = case lookupMeta "address" (sMeta st) of + Just (MetaBlocks [Plain xs]) -> + para $ trimInlines $ fromList xs + _ -> mempty + return $ addr <> bs -- sig added by \closing -addPrefix :: [Inline] -> [Citation] -> [Citation] -addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks -addPrefix _ _ = [] +figure :: PandocMonad m => LP m Blocks +figure = try $ do + resetCaption + blocks >>= addImageCaption -addSuffix :: [Inline] -> [Citation] -> [Citation] -addSuffix s ks@(_:_) = - let k = last ks - in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] -addSuffix _ _ = [] +addImageCaption :: PandocMonad m => Blocks -> LP m Blocks +addImageCaption = walkM go + where go (Image attr alt (src,tit)) + | not ("fig:" `isPrefixOf` tit) = do + mbcapt <- sCaption <$> getState + return $ case mbcapt of + Just ils -> Image attr (toList ils) (src, "fig:" ++ tit) + Nothing -> Image attr alt (src,tit) + go x = return x -simpleCiteArgs :: LP [Citation] -simpleCiteArgs = try $ do - first <- optionMaybe $ toList <$> opt - second <- optionMaybe $ toList <$> opt - char '{' - optional sp - keys <- manyTill citationLabel (char '}') - let (pre, suf) = case (first , second ) of - (Just s , Nothing) -> (mempty, s ) - (Just s , Just t ) -> (s , t ) - _ -> (mempty, mempty) - conv k = Citation { citationId = k - , citationPrefix = [] - , citationSuffix = [] - , citationMode = NormalCitation - , citationHash = 0 - , citationNoteNum = 0 - } - return $ addPrefix pre $ addSuffix suf $ map conv keys +coloredBlock :: PandocMonad m => String -> LP m Blocks +coloredBlock stylename = try $ do + skipopts + color <- braced + notFollowedBy (grouped inline) + let constructor = divWith ("",[],[("style",stylename ++ ": " ++ toksToString color)]) + constructor <$> grouped block + +graphicsPath :: PandocMonad m => LP m Blocks +graphicsPath = do + ps <- map toksToString <$> (bgroup *> manyTill braced egroup) + getResourcePath >>= setResourcePath . (++ ps) + return mempty -citationLabel :: LP String -citationLabel = optional sp *> - (many1 (satisfy isBibtexKeyChar) - <* optional sp - <* optional (char ',') - <* optional sp) - where isBibtexKeyChar c = isAlphaNum c || c `elem` (".:;?!`'()/*@_+=-[]" :: String) +splitBibs :: String -> [Inlines] +splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') -cites :: CitationMode -> Bool -> LP [Citation] -cites mode multi = try $ do - cits <- if multi - then many1 simpleCiteArgs - else count 1 simpleCiteArgs - let cs = concat cits - return $ case mode of - AuthorInText -> case cs of - (c:rest) -> c {citationMode = mode} : rest - [] -> [] - _ -> map (\a -> a {citationMode = mode}) cs +alltt :: Blocks -> Blocks +alltt = walk strToCode + where strToCode (Str s) = Code nullAttr s + strToCode Space = RawInline (Format "latex") "\\ " + strToCode SoftBreak = LineBreak + strToCode x = x -citation :: String -> CitationMode -> Bool -> LP Inlines -citation name mode multi = do - (c,raw) <- withRaw $ cites mode multi - return $ cite c (rawInline "latex" $ "\\" ++ name ++ raw) +parseListingsOptions :: [(String, String)] -> Attr +parseListingsOptions options = + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + classes = [ "numberLines" | + lookup "numbers" options == Just "left" ] + ++ maybeToList (lookup "language" options + >>= fromListingsLanguage) + in (fromMaybe "" (lookup "label" options), classes, kvs) -complexNatbibCitation :: CitationMode -> LP Inlines -complexNatbibCitation mode = try $ do - let ils = (toList . trimInlines . mconcat) <$> - many (notFollowedBy (oneOf "\\};") >> inline) - let parseOne = try $ do - skipSpaces - pref <- ils - cit' <- inline -- expect a citation - let citlist = toList cit' - cits' <- case citlist of - [Cite cs _] -> return cs - _ -> mzero - suff <- ils - skipSpaces - optional $ char ';' - return $ addPrefix pref $ addSuffix suff cits' - (c:cits, raw) <- withRaw $ grouped parseOne - return $ cite (c{ citationMode = mode }:cits) - (rawInline "latex" $ "\\citetext" ++ raw) +inputListing :: PandocMonad m => LP m Blocks +inputListing = do + pos <- getPosition + options <- option [] keyvals + f <- filter (/='"') . toksToString <$> braced + dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" + mbCode <- readFileFromDirs dirs f + codeLines <- case mbCode of + Just s -> return $ lines s + Nothing -> do + report $ CouldNotLoadIncludeFile f pos + return [] + let (ident,classes,kvs) = parseListingsOptions options + let language = case lookup "language" options >>= fromListingsLanguage of + Just l -> [l] + Nothing -> take 1 $ languagesByExtension (takeExtension f) + let firstline = fromMaybe 1 $ lookup "firstline" options >>= safeRead + let lastline = fromMaybe (length codeLines) $ + lookup "lastline" options >>= safeRead + let codeContents = intercalate "\n" $ take (1 + lastline - firstline) $ + drop (firstline - 1) codeLines + return $ codeBlockWith (ident,ordNub (classes ++ language),kvs) codeContents + +-- lists + +item :: PandocMonad m => LP m Blocks +item = void blocks *> controlSeq "item" *> skipopts *> blocks + +descItem :: PandocMonad m => LP m (Inlines, [Blocks]) +descItem = do + blocks -- skip blocks before item + controlSeq "item" + optional sp + ils <- opt + bs <- blocks + return (ils, [bs]) --- tables +listenv :: PandocMonad m => Text -> LP m a -> LP m a +listenv name p = try $ do + oldInListItem <- sInListItem `fmap` getState + updateState $ \st -> st{ sInListItem = True } + res <- env name p + updateState $ \st -> st{ sInListItem = oldInListItem } + return res -parseAligns :: LP [Alignment] -parseAligns = try $ do - char '{' - let maybeBar = skipMany $ sp <|> () <$ char '|' <|> () <$ (char '@' >> braced) - maybeBar - let cAlign = AlignCenter <$ char 'c' - let lAlign = AlignLeft <$ char 'l' - let rAlign = AlignRight <$ char 'r' - let parAlign = AlignLeft <$ (char 'p' >> braced) - let alignChar = cAlign <|> lAlign <|> rAlign <|> parAlign - aligns' <- sepEndBy alignChar maybeBar +orderedList' :: PandocMonad m => LP m Blocks +orderedList' = try $ do spaces - char '}' + let markerSpec = do + symbol '[' + ts <- toksToString <$> manyTill anyTok (symbol ']') + case runParser anyOrderedListMarker def "option" ts of + Right r -> return r + Left _ -> do + pos <- getPosition + report $ SkippedContent ("[" ++ ts ++ "]") pos + return (1, DefaultStyle, DefaultDelim) + (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) markerSpec spaces - return aligns' + optional $ try $ controlSeq "setlength" + *> grouped (count 1 $ controlSeq "itemindent") + *> braced + spaces + start <- option 1 $ try $ do pos <- getPosition + controlSeq "setcounter" + ctr <- toksToString <$> braced + guard $ "enum" `isPrefixOf` ctr + guard $ all (`elem` ['i','v']) (drop 4 ctr) + optional sp + num <- toksToString <$> braced + case safeRead num of + Just i -> return (i + 1 :: Int) + Nothing -> do + report $ SkippedContent + ("\\setcounter{" ++ ctr ++ + "}{" ++ num ++ "}") pos + return 1 + bs <- listenv "enumerate" (many item) + return $ orderedListWith (start, style, delim) bs + +-- tables -hline :: LP () +hline :: PandocMonad m => LP m () hline = try $ do - spaces' + spaces controlSeq "hline" <|> -- booktabs rules: controlSeq "toprule" <|> @@ -1385,80 +2384,312 @@ hline = try $ do controlSeq "midrule" <|> controlSeq "endhead" <|> controlSeq "endfirsthead" - spaces' - optional $ bracketed (many1 (satisfy (/=']'))) + spaces + optional opt return () -lbreak :: LP () -lbreak = () <$ try (spaces' *> - (controlSeq "\\" <|> controlSeq "tabularnewline") <* - spaces') - -amp :: LP () -amp = () <$ try (spaces' *> char '&' <* spaces') - -parseTableRow :: Int -- ^ number of columns - -> LP [Blocks] -parseTableRow cols = try $ do - let tableCellInline = notFollowedBy (amp <|> lbreak) >> inline - let minipage = try $ controlSeq "begin" *> string "{minipage}" *> - env "minipage" - (skipopts *> spaces' *> optional braced *> spaces' *> blocks) - let tableCell = minipage <|> - ((plain . trimInlines . mconcat) <$> many tableCellInline) - cells' <- sepBy1 tableCell amp - let numcells = length cells' - guard $ numcells <= cols && numcells >= 1 - guard $ cells' /= [mempty] - -- note: a & b in a three-column table leaves an empty 3rd cell: - let cells'' = cells' ++ replicate (cols - numcells) mempty - spaces' - return cells'' +lbreak :: PandocMonad m => LP m Tok +lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline") <* spaces + +amp :: PandocMonad m => LP m Tok +amp = symbol '&' -spaces' :: LP () -spaces' = spaces *> skipMany (comment *> spaces) +-- Split a Word into individual Symbols (for parseAligns) +splitWordTok :: PandocMonad m => LP m () +splitWordTok = do + inp <- getInput + case inp of + (Tok spos Word t : rest) -> + setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) ++ rest + _ -> return () -simpTable :: Bool -> LP Blocks -simpTable hasWidthParameter = try $ do - when hasWidthParameter $ () <$ (spaces' >> tok) +parseAligns :: PandocMonad m => LP m [(Alignment, Double, ([Tok], [Tok]))] +parseAligns = try $ do + let maybeBar = skipMany $ + sp <|> () <$ symbol '|' <|> () <$ (symbol '@' >> braced) + let cAlign = AlignCenter <$ symbol 'c' + let lAlign = AlignLeft <$ symbol 'l' + let rAlign = AlignRight <$ symbol 'r' + let parAlign = AlignLeft <$ symbol 'p' + -- aligns from tabularx + let xAlign = AlignLeft <$ symbol 'X' + let mAlign = AlignLeft <$ symbol 'm' + let bAlign = AlignLeft <$ symbol 'b' + let alignChar = splitWordTok *> ( cAlign <|> lAlign <|> rAlign <|> parAlign + <|> xAlign <|> mAlign <|> bAlign ) + let alignPrefix = symbol '>' >> braced + let alignSuffix = symbol '<' >> braced + let colWidth = try $ do + symbol '{' + ds <- trim . toksToString <$> manyTill anyTok (controlSeq "linewidth") + spaces + symbol '}' + case safeRead ds of + Just w -> return w + Nothing -> return 0.0 + let alignSpec = do + pref <- option [] alignPrefix + spaces + al <- alignChar + width <- colWidth <|> option 0.0 (do s <- toksToString <$> braced + pos <- getPosition + report $ SkippedContent s pos + return 0.0) + spaces + suff <- option [] alignSuffix + return (al, width, (pref, suff)) + let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro + symbol '*' + spaces + ds <- trim . toksToString <$> braced + spaces + spec <- braced + case safeRead ds of + Just n -> + getInput >>= setInput . (mconcat (replicate n spec) ++) + Nothing -> fail $ "Could not parse " ++ ds ++ " as number" + bgroup + spaces + maybeBar + aligns' <- many $ try $ spaces >> optional starAlign >> + (alignSpec <* maybeBar) + spaces + egroup + spaces + return aligns' + +parseTableRow :: PandocMonad m + => Text -- ^ table environment name + -> [([Tok], [Tok])] -- ^ pref/suffixes + -> LP m [Blocks] +parseTableRow envname prefsufs = do + notFollowedBy (spaces *> end_ envname) + let cols = length prefsufs + -- add prefixes and suffixes in token stream: + let celltoks (pref, suff) = do + prefpos <- getPosition + contents <- many (notFollowedBy + (() <$ amp <|> () <$ lbreak <|> end_ envname) + >> anyTok) + suffpos <- getPosition + option [] (count 1 amp) + return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff + rawcells <- mapM celltoks prefsufs + oldInput <- getInput + cells <- mapM (\ts -> setInput ts >> parseTableCell) rawcells + setInput oldInput + spaces + let numcells = length cells + guard $ numcells <= cols && numcells >= 1 + guard $ cells /= [mempty] + -- note: a & b in a three-column table leaves an empty 3rd cell: + return $ cells ++ replicate (cols - numcells) mempty + +parseTableCell :: PandocMonad m => LP m Blocks +parseTableCell = do + let plainify bs = case toList bs of + [Para ils] -> plain (fromList ils) + _ -> bs + updateState $ \st -> st{ sInTableCell = True } + cells <- plainify <$> blocks + updateState $ \st -> st{ sInTableCell = False } + return cells + +simpTable :: PandocMonad m => Text -> Bool -> LP m Blocks +simpTable envname hasWidthParameter = try $ do + when hasWidthParameter $ () <$ (spaces >> tok) skipopts - aligns <- parseAligns - let cols = length aligns + colspecs <- parseAligns + let (aligns, widths, prefsufs) = unzip3 colspecs + let cols = length colspecs optional $ controlSeq "caption" *> skipopts *> setCaption optional lbreak - spaces' + spaces skipMany hline - spaces' - header' <- option [] $ try (parseTableRow cols <* lbreak <* many1 hline) - spaces' - rows <- sepEndBy (parseTableRow cols) (lbreak <* optional (skipMany hline)) - spaces' + spaces + header' <- option [] $ try (parseTableRow envname prefsufs <* + lbreak <* many1 hline) + spaces + rows <- sepEndBy (parseTableRow envname prefsufs) + (lbreak <* optional (skipMany hline)) + spaces optional $ controlSeq "caption" *> skipopts *> setCaption optional lbreak - spaces' + spaces let header'' = if null header' then replicate cols mempty else header' lookAhead $ controlSeq "end" -- make sure we're at end - return $ table mempty (zip aligns (repeat 0)) header'' rows + return $ table mempty (zip aligns widths) header'' rows -startInclude :: LP Blocks -startInclude = do - fn <- braced - setPosition $ newPos fn 1 1 - return mempty +addTableCaption :: PandocMonad m => Blocks -> LP m Blocks +addTableCaption = walkM go + where go (Table c als ws hs rs) = do + mbcapt <- sCaption <$> getState + return $ case mbcapt of + Just ils -> Table (toList ils) als ws hs rs + Nothing -> Table c als ws hs rs + go x = return x -endInclude :: LP Blocks -endInclude = do - fn <- braced - ln <- braced - co <- braced - setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) - return mempty -removeDoubleQuotes :: String -> String -removeDoubleQuotes ('"':xs) = - case reverse xs of - '"':ys -> reverse ys - _ -> '"':xs -removeDoubleQuotes xs = xs +block :: PandocMonad m => LP m Blocks +block = do + res <- (mempty <$ spaces1) + <|> environment + <|> include + <|> macroDef + <|> blockCommand + <|> paragraph + <|> grouped block + trace (take 60 $ show $ B.toList res) + return res + +blocks :: PandocMonad m => LP m Blocks +blocks = mconcat <$> many block + +setDefaultLanguage :: PandocMonad m => LP m Blocks +setDefaultLanguage = do + o <- option "" $ (T.unpack . T.filter (\c -> c /= '[' && c /= ']')) + <$> rawopt + polylang <- toksToString <$> braced + case M.lookup polylang polyglossiaLangToBCP47 of + Nothing -> return mempty -- TODO mzero? warning? + Just langFunc -> do + let l = langFunc o + setTranslations l + updateState $ setMeta "lang" $ str (renderLang l) + return mempty + +polyglossiaLangToBCP47 :: M.Map String (String -> Lang) +polyglossiaLangToBCP47 = M.fromList + [ ("arabic", \o -> case filter (/=' ') o of + "locale=algeria" -> Lang "ar" "" "DZ" [] + "locale=mashriq" -> Lang "ar" "" "SY" [] + "locale=libya" -> Lang "ar" "" "LY" [] + "locale=morocco" -> Lang "ar" "" "MA" [] + "locale=mauritania" -> Lang "ar" "" "MR" [] + "locale=tunisia" -> Lang "ar" "" "TN" [] + _ -> Lang "ar" "" "" []) + , ("german", \o -> case filter (/=' ') o of + "spelling=old" -> Lang "de" "" "DE" ["1901"] + "variant=austrian,spelling=old" + -> Lang "de" "" "AT" ["1901"] + "variant=austrian" -> Lang "de" "" "AT" [] + "variant=swiss,spelling=old" + -> Lang "de" "" "CH" ["1901"] + "variant=swiss" -> Lang "de" "" "CH" [] + _ -> Lang "de" "" "" []) + , ("lsorbian", \_ -> Lang "dsb" "" "" []) + , ("greek", \o -> case filter (/=' ') o of + "variant=poly" -> Lang "el" "" "polyton" [] + "variant=ancient" -> Lang "grc" "" "" [] + _ -> Lang "el" "" "" []) + , ("english", \o -> case filter (/=' ') o of + "variant=australian" -> Lang "en" "" "AU" [] + "variant=canadian" -> Lang "en" "" "CA" [] + "variant=british" -> Lang "en" "" "GB" [] + "variant=newzealand" -> Lang "en" "" "NZ" [] + "variant=american" -> Lang "en" "" "US" [] + _ -> Lang "en" "" "" []) + , ("usorbian", \_ -> Lang "hsb" "" "" []) + , ("latin", \o -> case filter (/=' ') o of + "variant=classic" -> Lang "la" "" "" ["x-classic"] + _ -> Lang "la" "" "" []) + , ("slovenian", \_ -> Lang "sl" "" "" []) + , ("serbianc", \_ -> Lang "sr" "cyrl" "" []) + , ("pinyin", \_ -> Lang "zh" "Latn" "" ["pinyin"]) + , ("afrikaans", \_ -> Lang "af" "" "" []) + , ("amharic", \_ -> Lang "am" "" "" []) + , ("assamese", \_ -> Lang "as" "" "" []) + , ("asturian", \_ -> Lang "ast" "" "" []) + , ("bulgarian", \_ -> Lang "bg" "" "" []) + , ("bengali", \_ -> Lang "bn" "" "" []) + , ("tibetan", \_ -> Lang "bo" "" "" []) + , ("breton", \_ -> Lang "br" "" "" []) + , ("catalan", \_ -> Lang "ca" "" "" []) + , ("welsh", \_ -> Lang "cy" "" "" []) + , ("czech", \_ -> Lang "cs" "" "" []) + , ("coptic", \_ -> Lang "cop" "" "" []) + , ("danish", \_ -> Lang "da" "" "" []) + , ("divehi", \_ -> Lang "dv" "" "" []) + , ("esperanto", \_ -> Lang "eo" "" "" []) + , ("spanish", \_ -> Lang "es" "" "" []) + , ("estonian", \_ -> Lang "et" "" "" []) + , ("basque", \_ -> Lang "eu" "" "" []) + , ("farsi", \_ -> Lang "fa" "" "" []) + , ("finnish", \_ -> Lang "fi" "" "" []) + , ("french", \_ -> Lang "fr" "" "" []) + , ("friulan", \_ -> Lang "fur" "" "" []) + , ("irish", \_ -> Lang "ga" "" "" []) + , ("scottish", \_ -> Lang "gd" "" "" []) + , ("ethiopic", \_ -> Lang "gez" "" "" []) + , ("galician", \_ -> Lang "gl" "" "" []) + , ("hebrew", \_ -> Lang "he" "" "" []) + , ("hindi", \_ -> Lang "hi" "" "" []) + , ("croatian", \_ -> Lang "hr" "" "" []) + , ("magyar", \_ -> Lang "hu" "" "" []) + , ("armenian", \_ -> Lang "hy" "" "" []) + , ("interlingua", \_ -> Lang "ia" "" "" []) + , ("indonesian", \_ -> Lang "id" "" "" []) + , ("icelandic", \_ -> Lang "is" "" "" []) + , ("italian", \_ -> Lang "it" "" "" []) + , ("japanese", \_ -> Lang "jp" "" "" []) + , ("khmer", \_ -> Lang "km" "" "" []) + , ("kurmanji", \_ -> Lang "kmr" "" "" []) + , ("kannada", \_ -> Lang "kn" "" "" []) + , ("korean", \_ -> Lang "ko" "" "" []) + , ("lao", \_ -> Lang "lo" "" "" []) + , ("lithuanian", \_ -> Lang "lt" "" "" []) + , ("latvian", \_ -> Lang "lv" "" "" []) + , ("malayalam", \_ -> Lang "ml" "" "" []) + , ("mongolian", \_ -> Lang "mn" "" "" []) + , ("marathi", \_ -> Lang "mr" "" "" []) + , ("dutch", \_ -> Lang "nl" "" "" []) + , ("nynorsk", \_ -> Lang "nn" "" "" []) + , ("norsk", \_ -> Lang "no" "" "" []) + , ("nko", \_ -> Lang "nqo" "" "" []) + , ("occitan", \_ -> Lang "oc" "" "" []) + , ("panjabi", \_ -> Lang "pa" "" "" []) + , ("polish", \_ -> Lang "pl" "" "" []) + , ("piedmontese", \_ -> Lang "pms" "" "" []) + , ("portuguese", \_ -> Lang "pt" "" "" []) + , ("romansh", \_ -> Lang "rm" "" "" []) + , ("romanian", \_ -> Lang "ro" "" "" []) + , ("russian", \_ -> Lang "ru" "" "" []) + , ("sanskrit", \_ -> Lang "sa" "" "" []) + , ("samin", \_ -> Lang "se" "" "" []) + , ("slovak", \_ -> Lang "sk" "" "" []) + , ("albanian", \_ -> Lang "sq" "" "" []) + , ("serbian", \_ -> Lang "sr" "" "" []) + , ("swedish", \_ -> Lang "sv" "" "" []) + , ("syriac", \_ -> Lang "syr" "" "" []) + , ("tamil", \_ -> Lang "ta" "" "" []) + , ("telugu", \_ -> Lang "te" "" "" []) + , ("thai", \_ -> Lang "th" "" "" []) + , ("turkmen", \_ -> Lang "tk" "" "" []) + , ("turkish", \_ -> Lang "tr" "" "" []) + , ("ukrainian", \_ -> Lang "uk" "" "" []) + , ("urdu", \_ -> Lang "ur" "" "" []) + , ("vietnamese", \_ -> Lang "vi" "" "" []) + ] + +babelLangToBCP47 :: String -> Maybe Lang +babelLangToBCP47 s = + case s of + "austrian" -> Just $ Lang "de" "" "AT" ["1901"] + "naustrian" -> Just $ Lang "de" "" "AT" [] + "swissgerman" -> Just $ Lang "de" "" "CH" ["1901"] + "nswissgerman" -> Just $ Lang "de" "" "CH" [] + "german" -> Just $ Lang "de" "" "DE" ["1901"] + "ngerman" -> Just $ Lang "de" "" "DE" [] + "lowersorbian" -> Just $ Lang "dsb" "" "" [] + "uppersorbian" -> Just $ Lang "hsb" "" "" [] + "polutonikogreek" -> Just $ Lang "el" "" "" ["polyton"] + "slovene" -> Just $ Lang "sl" "" "" [] + "australian" -> Just $ Lang "en" "" "AU" [] + "canadian" -> Just $ Lang "en" "" "CA" [] + "british" -> Just $ Lang "en" "" "GB" [] + "newzealand" -> Just $ Lang "en" "" "NZ" [] + "american" -> Just $ Lang "en" "" "US" [] + "classiclatin" -> Just $ Lang "la" "" "" ["x-classic"] + _ -> fmap ($ "") $ M.lookup s polyglossiaLangToBCP47 diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs new file mode 100644 index 000000000..c9cbaa9b9 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs @@ -0,0 +1,51 @@ +{- +Copyright (C) 2017-2018 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.LaTeX.Types + Copyright : Copyright (C) 2017-2018 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Types for LaTeX tokens and macros. +-} +module Text.Pandoc.Readers.LaTeX.Types ( Tok(..) + , TokType(..) + , Macro(..) + , ExpansionPoint(..) + , SourcePos + ) +where +import Data.Text (Text) +import Text.Parsec.Pos (SourcePos) + +data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | + Esc1 | Esc2 | Arg Int + deriving (Eq, Ord, Show) + +data Tok = Tok SourcePos TokType Text + deriving (Eq, Ord, Show) + +data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed + deriving (Eq, Ord, Show) + +data Macro = Macro ExpansionPoint Int (Maybe [Tok]) [Tok] + deriving Show diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index cd35a8738..14cf73de4 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1,7 +1,8 @@ -{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 +{-# LANGUAGE RelaxedPolyRec #-} {-# LANGUAGE ScopedTypeVariables #-} + {- -Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-2015 John MacFarlane + Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -29,63 +30,55 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of markdown-formatted plain text to 'Pandoc' document. -} -module Text.Pandoc.Readers.Markdown ( readMarkdown, - readMarkdownWithWarnings ) where +module Text.Pandoc.Readers.Markdown ( readMarkdown ) where -import Data.List ( transpose, sortBy, findIndex, intercalate ) +import Control.Monad +import Control.Monad.Except (throwError) +import Data.Char (isAlphaNum, isPunctuation, isSpace, toLower) +import qualified Data.HashMap.Strict as H +import Data.List (intercalate, sortBy, transpose, elemIndex) import qualified Data.Map as M -import Data.Scientific (coefficient, base10Exponent) -import Data.Ord ( comparing ) -import Data.Char ( isSpace, isAlphaNum, toLower, isPunctuation ) import Data.Maybe -import Text.Pandoc.Definition -import Text.Pandoc.Emoji (emojis) -import Text.Pandoc.Generic (bottomUp) -import qualified Data.Text as T +import Data.Monoid ((<>)) +import Data.Ord (comparing) +import Data.Scientific (base10Exponent, coefficient) +import qualified Data.Set as Set import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Vector as V +import Data.Yaml (ParseException (..), YamlException (..), YamlMark (..)) import qualified Data.Yaml as Yaml -import Data.Yaml (ParseException(..), YamlException(..), YamlMark(..)) -import qualified Data.HashMap.Strict as H +import System.FilePath (addExtension, takeExtension) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, Inlines) import qualified Text.Pandoc.Builder as B -import qualified Text.Pandoc.UTF8 as UTF8 -import qualified Data.Vector as V -import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) +import Text.Pandoc.Class (PandocMonad (..), report) +import Text.Pandoc.Definition +import Text.Pandoc.Emoji (emojis) +import Text.Pandoc.Error +import Text.Pandoc.Logging import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (tableWith) +import Text.Pandoc.Readers.HTML (htmlInBalanced, htmlTag, isBlockTag, + isCommentTag, isInlineTag, isTextTag) +import Text.Pandoc.Readers.LaTeX (applyMacros, rawLaTeXBlock, rawLaTeXInline) import Text.Pandoc.Shared -import Text.Pandoc.Pretty (charWidth) +import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.XML (fromEntities) -import Text.Pandoc.Parsing hiding (tableWith) -import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) -import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, - isTextTag, isCommentTag ) -import Control.Monad -import System.FilePath (takeExtension, addExtension) -import Text.HTML.TagSoup -import qualified Data.Set as Set -import Text.Printf (printf) -import Debug.Trace (trace) -import Data.Monoid ((<>)) -import Text.Pandoc.Error -type MarkdownParser = Parser [Char] ParserState +type MarkdownParser m = ParserT [Char] ParserState m -- | Read markdown from an input string and return a Pandoc document. -readMarkdown :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError Pandoc -readMarkdown opts s = - (readWith parseMarkdown) def{ stateOptions = opts } (s ++ "\n\n") - --- | Read markdown from an input string and return a pair of a Pandoc document --- and a list of warnings. -readMarkdownWithWarnings :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError (Pandoc, [String]) -readMarkdownWithWarnings opts s = - (readWithWarnings parseMarkdown) def{ stateOptions = opts } (s ++ "\n\n") - -trimInlinesF :: F Inlines -> F Inlines -trimInlinesF = liftM trimInlines +readMarkdown :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readMarkdown opts s = do + parsed <- readWithM parseMarkdown def{ stateOptions = opts } + (T.unpack (crFilter s) ++ "\n\n") + case parsed of + Right result -> return result + Left e -> throwError e -- -- Constants and data structure definitions @@ -117,44 +110,43 @@ isBlank _ = False -- -- | Succeeds when we're in list context. -inList :: MarkdownParser () +inList :: PandocMonad m => MarkdownParser m () inList = do ctx <- stateParserContext <$> getState guard (ctx == ListItemState) -spnl :: Parser [Char] st () +spnl :: PandocMonad m => ParserT [Char] st m () spnl = try $ do skipSpaces optional newline skipSpaces notFollowedBy (char '\n') -indentSpaces :: MarkdownParser String +spnl' :: PandocMonad m => ParserT [Char] st m String +spnl' = try $ do + xs <- many spaceChar + ys <- option "" $ try $ (:) <$> newline + <*> (many spaceChar <* notFollowedBy (char '\n')) + return (xs ++ ys) + +indentSpaces :: PandocMonad m => MarkdownParser m String indentSpaces = try $ do tabStop <- getOption readerTabStop count tabStop (char ' ') <|> string "\t" <?> "indentation" -nonindentSpaces :: MarkdownParser String +nonindentSpaces :: PandocMonad m => MarkdownParser m String nonindentSpaces = do - tabStop <- getOption readerTabStop - sps <- many (char ' ') - if length sps < tabStop - then return sps - else unexpected "indented line" + n <- skipNonindentSpaces + return $ replicate n ' ' -- returns number of spaces parsed -skipNonindentSpaces :: MarkdownParser Int +skipNonindentSpaces :: PandocMonad m => MarkdownParser m Int skipNonindentSpaces = do tabStop <- getOption readerTabStop - atMostSpaces (tabStop - 1) <* notFollowedBy (char ' ') + gobbleAtMostSpaces (tabStop - 1) <* notFollowedBy spaceChar -atMostSpaces :: Int -> MarkdownParser Int -atMostSpaces n - | n > 0 = (char ' ' >> (+1) <$> atMostSpaces (n-1)) <|> return 0 - | otherwise = return 0 - -litChar :: MarkdownParser Char +litChar :: PandocMonad m => MarkdownParser m Char litChar = escapedChar' <|> characterReference <|> noneOf "\n" @@ -162,30 +154,32 @@ litChar = escapedChar' -- | Parse a sequence of inline elements between square brackets, -- including inlines between balanced pairs of square brackets. -inlinesInBalancedBrackets :: MarkdownParser (F Inlines) -inlinesInBalancedBrackets = do - char '[' - (_, raw) <- withRaw $ charsInBalancedBrackets 1 - guard $ not $ null raw - parseFromString (trimInlinesF . mconcat <$> many inline) (init raw) - -charsInBalancedBrackets :: Int -> MarkdownParser () -charsInBalancedBrackets 0 = return () -charsInBalancedBrackets openBrackets = - (char '[' >> charsInBalancedBrackets (openBrackets + 1)) - <|> (char ']' >> charsInBalancedBrackets (openBrackets - 1)) - <|> (( (() <$ code) - <|> (() <$ (escapedChar')) - <|> (newline >> notFollowedBy blankline) - <|> skipMany1 (noneOf "[]`\n\\") - <|> (() <$ count 1 (oneOf "`\\")) - ) >> charsInBalancedBrackets openBrackets) +inlinesInBalancedBrackets :: PandocMonad m => MarkdownParser m (F Inlines) +inlinesInBalancedBrackets = + try $ char '[' >> withRaw (go 1) >>= + parseFromString inlines . stripBracket . snd + where stripBracket [] = [] + stripBracket xs = if last xs == ']' then init xs else xs + go :: PandocMonad m => Int -> MarkdownParser m () + go 0 = return () + go openBrackets = + (() <$ (escapedChar <|> + code <|> + rawHtmlInline <|> + rawLaTeXInline') >> go openBrackets) + <|> + (do char ']' + Control.Monad.when (openBrackets > 1) $ go (openBrackets - 1)) + <|> + (char '[' >> go (openBrackets + 1)) + <|> + (anyChar >> go openBrackets) -- -- document structure -- -rawTitleBlockLine :: MarkdownParser String +rawTitleBlockLine :: PandocMonad m => MarkdownParser m String rawTitleBlockLine = do char '%' skipSpaces @@ -196,13 +190,13 @@ rawTitleBlockLine = do anyLine return $ trim $ unlines (first:rest) -titleLine :: MarkdownParser (F Inlines) +titleLine :: PandocMonad m => MarkdownParser m (F Inlines) titleLine = try $ do raw <- rawTitleBlockLine - res <- parseFromString (many inline) raw - return $ trimInlinesF $ mconcat res + res <- parseFromString' inlines raw + return $ trimInlinesF res -authorsLine :: MarkdownParser (F [Inlines]) +authorsLine :: PandocMonad m => MarkdownParser m (F [Inlines]) authorsLine = try $ do raw <- rawTitleBlockLine let sep = (char ';' <* spaces) <|> newline @@ -210,18 +204,18 @@ authorsLine = try $ do (trimInlinesF . mconcat <$> many (try $ notFollowedBy sep >> inline)) sep - sequence <$> parseFromString pAuthors raw + sequence <$> parseFromString' pAuthors raw -dateLine :: MarkdownParser (F Inlines) +dateLine :: PandocMonad m => MarkdownParser m (F Inlines) dateLine = try $ do raw <- rawTitleBlockLine - res <- parseFromString (many inline) raw - return $ trimInlinesF $ mconcat res + res <- parseFromString' inlines raw + return $ trimInlinesF res -titleBlock :: MarkdownParser () +titleBlock :: PandocMonad m => MarkdownParser m () titleBlock = pandocTitleBlock <|> mmdTitleBlock -pandocTitleBlock :: MarkdownParser () +pandocTitleBlock :: PandocMonad m => MarkdownParser m () pandocTitleBlock = try $ do guardEnabled Ext_pandoc_title_block lookAhead (char '%') @@ -239,7 +233,8 @@ pandocTitleBlock = try $ do $ nullMeta updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } -yamlMetaBlock :: MarkdownParser (F Blocks) + +yamlMetaBlock :: PandocMonad m => MarkdownParser m (F Blocks) yamlMetaBlock = try $ do guardEnabled Ext_yaml_metadata_block pos <- getPosition @@ -250,87 +245,105 @@ yamlMetaBlock = try $ do -- by including --- and ..., we allow yaml blocks with just comments: let rawYaml = unlines ("---" : (rawYamlLines ++ ["..."])) optional blanklines - opts <- stateOptions <$> getState - meta' <- case Yaml.decodeEither' $ UTF8.fromString rawYaml of - Right (Yaml.Object hashmap) -> return $ return $ - H.foldrWithKey (\k v m -> - if ignorable k - then m - else case yamlToMeta opts v of - Left _ -> m - Right v' -> B.setMeta (T.unpack k) v' m) - nullMeta hashmap - Right Yaml.Null -> return $ return nullMeta - Right _ -> do - addWarning (Just pos) "YAML header is not an object" - return $ return nullMeta - Left err' -> do - case err' of - InvalidYaml (Just YamlParseException{ - yamlProblem = problem - , yamlContext = _ctxt - , yamlProblemMark = Yaml.YamlMark { - yamlLine = yline - , yamlColumn = ycol - }}) -> - addWarning (Just $ setSourceLine - (setSourceColumn pos - (sourceColumn pos + ycol)) - (sourceLine pos + 1 + yline)) - $ "Could not parse YAML header: " ++ - problem - _ -> addWarning (Just pos) - $ "Could not parse YAML header: " ++ - show err' - return $ return nullMeta - updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + case Yaml.decodeEither' $ UTF8.fromString rawYaml of + Right (Yaml.Object hashmap) -> do + let alist = H.toList hashmap + mapM_ (\(k, v) -> + if ignorable k + then return () + else do + v' <- yamlToMeta v + let k' = T.unpack k + updateState $ \st -> st{ stateMeta' = + do m <- stateMeta' st + -- if there's already a value, leave it unchanged + case lookupMeta k' m of + Just _ -> return m + Nothing -> do + v'' <- v' + return $ B.setMeta (T.unpack k) v'' m} + ) alist + Right Yaml.Null -> return () + Right _ -> do + logMessage $ + CouldNotParseYamlMetadata "not an object" + pos + return () + Left err' -> do + case err' of + InvalidYaml (Just YamlParseException{ + yamlProblem = problem + , yamlContext = _ctxt + , yamlProblemMark = Yaml.YamlMark { + yamlLine = yline + , yamlColumn = ycol + }}) -> + logMessage $ CouldNotParseYamlMetadata + problem (setSourceLine + (setSourceColumn pos + (sourceColumn pos + ycol)) + (sourceLine pos + 1 + yline)) + _ -> logMessage $ CouldNotParseYamlMetadata + (show err') pos + return () return mempty -- ignore fields ending with _ ignorable :: Text -> Bool ignorable t = (T.pack "_") `T.isSuffixOf` t -toMetaValue :: ReaderOptions -> Text -> Either PandocError MetaValue -toMetaValue opts x = toMeta <$> readMarkdown opts' (T.unpack x) - where - toMeta p = - case p of - Pandoc _ [Plain xs] -> MetaInlines xs - Pandoc _ [Para xs] - | endsWithNewline x -> MetaBlocks [Para xs] - | otherwise -> MetaInlines xs - Pandoc _ bs -> MetaBlocks bs - endsWithNewline t = T.pack "\n" `T.isSuffixOf` t - opts' = opts{readerExtensions=readerExtensions opts `Set.difference` meta_exts} - meta_exts = Set.fromList [ Ext_pandoc_title_block - , Ext_mmd_title_block - , Ext_yaml_metadata_block - ] - -yamlToMeta :: ReaderOptions -> Yaml.Value -> Either PandocError MetaValue -yamlToMeta opts (Yaml.String t) = toMetaValue opts t -yamlToMeta _ (Yaml.Number n) +toMetaValue :: PandocMonad m + => Text -> MarkdownParser m (F MetaValue) +toMetaValue x = + parseFromString' parser' (T.unpack x) + where parser' = (asInlines <$> ((trimInlinesF . mconcat) + <$> try (guard (not endsWithNewline) + *> manyTill inline eof))) + <|> (asBlocks <$> parseBlocks) + asBlocks p = do + p' <- p + return $ MetaBlocks (B.toList p') + asInlines p = do + p' <- p + return $ MetaInlines (B.toList p') + endsWithNewline = T.pack "\n" `T.isSuffixOf` x + -- Note: a standard quoted or unquoted YAML value will + -- not end in a newline, but a "block" set off with + -- `|` or `>` will. + +yamlToMeta :: PandocMonad m + => Yaml.Value -> MarkdownParser m (F MetaValue) +yamlToMeta (Yaml.String t) = toMetaValue t +yamlToMeta (Yaml.Number n) -- avoid decimal points for numbers that don't need them: - | base10Exponent n >= 0 = return $ MetaString $ show + | base10Exponent n >= 0 = return $ return $ MetaString $ show $ coefficient n * (10 ^ base10Exponent n) - | otherwise = return $ MetaString $ show n -yamlToMeta _ (Yaml.Bool b) = return $ MetaBool b -yamlToMeta opts (Yaml.Array xs) = B.toMetaValue <$> mapM (yamlToMeta opts) - (V.toList xs) -yamlToMeta opts (Yaml.Object o) = MetaMap <$> H.foldrWithKey (\k v m -> - if ignorable k - then m - else (do - v' <- yamlToMeta opts v - m' <- m - return (M.insert (T.unpack k) v' m'))) - (return M.empty) o -yamlToMeta _ _ = return $ MetaString "" - -stopLine :: MarkdownParser () + | otherwise = return $ return $ MetaString $ show n +yamlToMeta (Yaml.Bool b) = return $ return $ MetaBool b +yamlToMeta (Yaml.Array xs) = do + xs' <- mapM yamlToMeta (V.toList xs) + return $ do + xs'' <- sequence xs' + return $ B.toMetaValue xs'' +yamlToMeta (Yaml.Object o) = do + let alist = H.toList o + foldM (\m (k,v) -> + if ignorable k + then return m + else do + v' <- yamlToMeta v + return $ do + MetaMap m' <- m + v'' <- v' + return (MetaMap $ M.insert (T.unpack k) v'' m')) + (return $ MetaMap M.empty) + alist +yamlToMeta _ = return $ return $ MetaString "" + +stopLine :: PandocMonad m => MarkdownParser m () stopLine = try $ (string "---" <|> string "...") >> blankline >> return () -mmdTitleBlock :: MarkdownParser () +mmdTitleBlock :: PandocMonad m => MarkdownParser m () mmdTitleBlock = try $ do guardEnabled Ext_mmd_title_block firstPair <- kvPair False @@ -340,49 +353,44 @@ mmdTitleBlock = try $ do updateState $ \st -> st{ stateMeta' = stateMeta' st <> return (Meta $ M.fromList kvPairs) } -kvPair :: Bool -> MarkdownParser (String, MetaValue) +kvPair :: PandocMonad m => Bool -> MarkdownParser m (String, MetaValue) kvPair allowEmpty = try $ do key <- many1Till (alphaNum <|> oneOf "_- ") (char ':') val <- trim <$> manyTill anyChar (try $ newline >> lookAhead (blankline <|> nonspaceChar)) guard $ allowEmpty || not (null val) let key' = concat $ words $ map toLower key - let val' = MetaBlocks $ B.toList $ B.plain $ B.text $ val + let val' = MetaBlocks $ B.toList $ B.plain $B.text val return (key',val') -parseMarkdown :: MarkdownParser Pandoc +parseMarkdown :: PandocMonad m => MarkdownParser m Pandoc parseMarkdown = do - -- markdown allows raw HTML - updateState $ \state -> state { stateOptions = - let oldOpts = stateOptions state in - oldOpts{ readerParseRaw = True } } optional titleBlock blocks <- parseBlocks st <- getState - let meta = runF (stateMeta' st) st - let Pandoc _ bs = B.doc $ runF blocks st - eastAsianLineBreaks <- option False $ - True <$ guardEnabled Ext_east_asian_line_breaks - return $ (if eastAsianLineBreaks - then bottomUp softBreakFilter - else id) $ Pandoc meta bs - -softBreakFilter :: [Inline] -> [Inline] -softBreakFilter (x:SoftBreak:y:zs) = - case (stringify x, stringify y) of - (xs@(_:_), (c:_)) - | charWidth (last xs) == 2 && charWidth c == 2 -> x:y:zs - _ -> x:SoftBreak:y:zs -softBreakFilter xs = xs - -referenceKey :: MarkdownParser (F Blocks) + -- check for notes with no corresponding note references + let notesUsed = stateNoteRefs st + let notesDefined = M.keys (stateNotes' st) + mapM_ (\n -> unless (n `Set.member` notesUsed) $ + case M.lookup n (stateNotes' st) of + Just (pos, _) -> report (NoteDefinedButNotUsed n pos) + Nothing -> throwError $ + PandocShouldNeverHappenError "note not found") + notesDefined + let doc = runF (do Pandoc _ bs <- B.doc <$> blocks + meta <- stateMeta' st + return $ Pandoc meta bs) st + reportLogMessages + return doc + +referenceKey :: PandocMonad m => MarkdownParser m (F Blocks) referenceKey = try $ do pos <- getPosition skipNonindentSpaces (_,raw) <- reference char ':' skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') - let sourceURL = liftM unwords $ many $ try $ do + let sourceURL = fmap unwords $ many $ try $ do skipMany spaceChar notFollowedBy' referenceTitle notFollowedBy' $ guardEnabled Ext_link_attributes >> attributes @@ -392,7 +400,9 @@ referenceKey = try $ do src <- try betweenAngles <|> sourceURL tit <- option "" referenceTitle attr <- option nullAttr $ try $ - guardEnabled Ext_link_attributes >> skipSpaces >> attributes + do guardEnabled Ext_link_attributes + skipSpaces >> optional newline >> skipSpaces + attributes addKvs <- option [] $ guardEnabled Ext_mmd_link_attributes >> many (try $ spnl >> keyValAttr) blanklines @@ -402,18 +412,22 @@ referenceKey = try $ do let oldkeys = stateKeys st let key = toKey raw case M.lookup key oldkeys of - Just _ -> addWarning (Just pos) $ "Duplicate link reference `" ++ raw ++ "'" - Nothing -> return () + Just (t,a) | not (t == target && a == attr') -> + -- We don't warn on two duplicate keys if the targets are also + -- the same. This can happen naturally with --reference-location=block + -- or section. See #3701. + logMessage $ DuplicateLinkReference raw pos + _ -> return () updateState $ \s -> s { stateKeys = M.insert key (target, attr') oldkeys } return $ return mempty -referenceTitle :: MarkdownParser String +referenceTitle :: PandocMonad m => MarkdownParser m String referenceTitle = try $ do skipSpaces >> optional newline >> skipSpaces quotedTitle '"' <|> quotedTitle '\'' <|> charsInBalanced '(' ')' litChar -- A link title in quotes -quotedTitle :: Char -> MarkdownParser String +quotedTitle :: PandocMonad m => Char -> MarkdownParser m String quotedTitle c = try $ do char c notFollowedBy spaces @@ -425,7 +439,7 @@ quotedTitle c = try $ do -- | PHP Markdown Extra style abbreviation key. Currently -- we just skip them, since Pandoc doesn't have an element for -- an abbreviation. -abbrevKey :: MarkdownParser (F Blocks) +abbrevKey :: PandocMonad m => MarkdownParser m (F Blocks) abbrevKey = do guardEnabled Ext_abbreviations try $ do @@ -436,23 +450,23 @@ abbrevKey = do blanklines return $ return mempty -noteMarker :: MarkdownParser String +noteMarker :: PandocMonad m => MarkdownParser m String noteMarker = string "[^" >> many1Till (satisfy $ not . isBlank) (char ']') -rawLine :: MarkdownParser String +rawLine :: PandocMonad m => MarkdownParser m String rawLine = try $ do notFollowedBy blankline notFollowedBy' $ try $ skipNonindentSpaces >> noteMarker optional indentSpaces anyLine -rawLines :: MarkdownParser String +rawLines :: PandocMonad m => MarkdownParser m String rawLines = do first <- anyLine rest <- many rawLine return $ unlines (first:rest) -noteBlock :: MarkdownParser (F Blocks) +noteBlock :: PandocMonad m => MarkdownParser m (F Blocks) noteBlock = try $ do pos <- getPosition skipNonindentSpaces @@ -464,26 +478,23 @@ noteBlock = try $ do rest <- many $ try $ blanklines >> indentSpaces >> rawLines let raw = unlines (first:rest) ++ "\n" optional blanklines - parsed <- parseFromString parseBlocks raw - let newnote = (ref, parsed) + parsed <- parseFromString' parseBlocks raw oldnotes <- stateNotes' <$> getState - case lookup ref oldnotes of - Just _ -> addWarning (Just pos) $ "Duplicate note reference `" ++ ref ++ "'" + case M.lookup ref oldnotes of + Just _ -> logMessage $ DuplicateNoteReference ref pos Nothing -> return () - updateState $ \s -> s { stateNotes' = newnote : oldnotes } + updateState $ \s -> s { stateNotes' = M.insert ref (pos, parsed) oldnotes } return mempty -- -- parsing blocks -- -parseBlocks :: MarkdownParser (F Blocks) +parseBlocks :: PandocMonad m => MarkdownParser m (F Blocks) parseBlocks = mconcat <$> manyTill block eof -block :: MarkdownParser (F Blocks) +block :: PandocMonad m => MarkdownParser m (F Blocks) block = do - tr <- getOption readerTrace - pos <- getPosition res <- choice [ mempty <$ blanklines , codeBlockFenced , yamlMetaBlock @@ -493,10 +504,10 @@ block = do , header , lhsCodeBlock , divHtml + , divFenced , htmlBlock , table , codeBlockIndented - , guardEnabled Ext_latex_macros *> (macro >>= return . return) , rawTeXBlock , lineBlock , blockQuote @@ -509,30 +520,29 @@ block = do , para , plain ] <?> "block" - when tr $ do - st <- getState - trace (printf "line %d: %s" (sourceLine pos) - (take 60 $ show $ B.toList $ runF res st)) (return ()) + trace (take 60 $ show $ B.toList $ runF res defaultParserState) return res -- -- header blocks -- -header :: MarkdownParser (F Blocks) +header :: PandocMonad m => MarkdownParser m (F Blocks) header = setextHeader <|> atxHeader <?> "header" -atxChar :: MarkdownParser Char +atxChar :: PandocMonad m => MarkdownParser m Char atxChar = do exts <- getOption readerExtensions - return $ if Set.member Ext_literate_haskell exts - then '=' else '#' + return $ if extensionEnabled Ext_literate_haskell exts + then '=' + else '#' -atxHeader :: MarkdownParser (F Blocks) +atxHeader :: PandocMonad m => MarkdownParser m (F Blocks) atxHeader = try $ do - level <- atxChar >>= many1 . char >>= return . length + level <- fmap length (atxChar >>= many1 . char) notFollowedBy $ guardEnabled Ext_fancy_lists >> (char '.' <|> char ')') -- this would be a list + guardDisabled Ext_space_in_atx_header <|> notFollowedBy nonspaceChar skipSpaces (text, raw) <- withRaw $ trimInlinesF . mconcat <$> many (notFollowedBy atxClosing >> inline) @@ -542,7 +552,7 @@ atxHeader = try $ do <|> registerImplicitHeader raw attr' return $ B.headerWith attr' level <$> text -atxClosing :: MarkdownParser Attr +atxClosing :: PandocMonad m => MarkdownParser m Attr atxClosing = try $ do attr' <- option nullAttr (guardEnabled Ext_mmd_header_identifiers >> mmdHeaderIdentifier) @@ -553,7 +563,7 @@ atxClosing = try $ do blanklines return attr -setextHeaderEnd :: MarkdownParser Attr +setextHeaderEnd :: PandocMonad m => MarkdownParser m Attr setextHeaderEnd = try $ do attr <- option nullAttr $ (guardEnabled Ext_mmd_header_identifiers >> mmdHeaderIdentifier) @@ -561,13 +571,18 @@ setextHeaderEnd = try $ do blanklines return attr -mmdHeaderIdentifier :: MarkdownParser Attr +mmdHeaderIdentifier :: PandocMonad m => MarkdownParser m Attr mmdHeaderIdentifier = do - ident <- stripFirstAndLast . snd <$> reference + (_, raw) <- reference + let raw' = trim $ stripFirstAndLast raw + let ident = concat $ words $ map toLower raw' + let attr = (ident, [], []) + guardDisabled Ext_implicit_header_references + <|> registerImplicitHeader raw' attr skipSpaces - return (ident,[],[]) + return attr -setextHeader :: MarkdownParser (F Blocks) +setextHeader :: PandocMonad m => MarkdownParser m (F Blocks) setextHeader = try $ do -- This lookahead prevents us from wasting time parsing Inlines -- unless necessary -- it gives a significant performance boost. @@ -579,13 +594,13 @@ setextHeader = try $ do underlineChar <- oneOf setextHChars many (char underlineChar) blanklines - let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 + let level = fromMaybe 0 (elemIndex underlineChar setextHChars) + 1 attr' <- registerHeader attr (runF text defaultParserState) guardDisabled Ext_implicit_header_references <|> registerImplicitHeader raw attr' return $ B.headerWith attr' level <$> text -registerImplicitHeader :: String -> Attr -> MarkdownParser () +registerImplicitHeader :: PandocMonad m => String -> Attr -> MarkdownParser m () registerImplicitHeader raw attr@(ident, _, _) = do let key = toKey $ "[" ++ raw ++ "]" updateState (\s -> s { stateHeaderKeys = @@ -595,7 +610,7 @@ registerImplicitHeader raw attr@(ident, _, _) = do -- hrule block -- -hrule :: Parser [Char] st (F Blocks) +hrule :: PandocMonad m => ParserT [Char] st m (F Blocks) hrule = try $ do skipSpaces start <- satisfy isHruleChar @@ -609,20 +624,21 @@ hrule = try $ do -- code blocks -- -indentedLine :: MarkdownParser String -indentedLine = indentSpaces >> anyLine >>= return . (++ "\n") +indentedLine :: PandocMonad m => MarkdownParser m String +indentedLine = indentSpaces >> anyLineNewline -blockDelimiter :: (Char -> Bool) +blockDelimiter :: PandocMonad m + => (Char -> Bool) -> Maybe Int - -> Parser [Char] st Int + -> ParserT [Char] ParserState m Int blockDelimiter f len = try $ do + skipNonindentSpaces c <- lookAhead (satisfy f) case len of Just l -> count l (char c) >> many (char c) >> return l - Nothing -> count 3 (char c) >> many (char c) >>= - return . (+ 3) . length + Nothing -> fmap ((+ 3) . length) (count 3 (char c) >> many (char c)) -attributes :: MarkdownParser Attr +attributes :: PandocMonad m => MarkdownParser m Attr attributes = try $ do char '{' spnl @@ -630,28 +646,28 @@ attributes = try $ do char '}' return $ foldl (\x f -> f x) nullAttr attrs -attribute :: MarkdownParser (Attr -> Attr) +attribute :: PandocMonad m => MarkdownParser m (Attr -> Attr) attribute = identifierAttr <|> classAttr <|> keyValAttr <|> specialAttr -identifier :: MarkdownParser String +identifier :: PandocMonad m => MarkdownParser m String identifier = do first <- letter rest <- many $ alphaNum <|> oneOf "-_:." return (first:rest) -identifierAttr :: MarkdownParser (Attr -> Attr) +identifierAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) identifierAttr = try $ do char '#' result <- identifier return $ \(_,cs,kvs) -> (result,cs,kvs) -classAttr :: MarkdownParser (Attr -> Attr) +classAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) classAttr = try $ do char '.' result <- identifier return $ \(id',cs,kvs) -> (id',cs ++ [result],kvs) -keyValAttr :: MarkdownParser (Attr -> Attr) +keyValAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) keyValAttr = try $ do key <- identifier char '=' @@ -664,33 +680,53 @@ keyValAttr = try $ do "class" -> (id',cs ++ words val,kvs) _ -> (id',cs,kvs ++ [(key,val)]) -specialAttr :: MarkdownParser (Attr -> Attr) +specialAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) specialAttr = do char '-' return $ \(id',cs,kvs) -> (id',cs ++ ["unnumbered"],kvs) -codeBlockFenced :: MarkdownParser (F Blocks) +rawAttribute :: PandocMonad m => MarkdownParser m String +rawAttribute = do + char '{' + skipMany spaceChar + char '=' + format <- many1 $ satisfy (\c -> isAlphaNum c || c `elem` "-_") + skipMany spaceChar + char '}' + return format + +codeBlockFenced :: PandocMonad m => MarkdownParser m (F Blocks) codeBlockFenced = try $ do + indentchars <- nonindentSpaces + let indentLevel = length indentchars c <- try (guardEnabled Ext_fenced_code_blocks >> lookAhead (char '~')) <|> (guardEnabled Ext_backtick_code_blocks >> lookAhead (char '`')) size <- blockDelimiter (== c) Nothing skipMany spaceChar - attr <- option ([],[],[]) $ - try (guardEnabled Ext_fenced_code_attributes >> attributes) - <|> ((\x -> ("",[toLanguageId x],[])) <$> many1 nonspaceChar) + rawattr <- + (Left <$> try (guardEnabled Ext_raw_attribute >> rawAttribute)) + <|> + (Right <$> option ("",[],[]) + (try (guardEnabled Ext_fenced_code_attributes >> attributes) + <|> ((\x -> ("",[toLanguageId x],[])) <$> many1 nonspaceChar))) blankline - contents <- manyTill anyLine (blockDelimiter (== c) (Just size)) + contents <- intercalate "\n" <$> + manyTill (gobbleAtMostSpaces indentLevel >> anyLine) + (blockDelimiter (== c) (Just size)) blanklines - return $ return $ B.codeBlockWith attr $ intercalate "\n" contents + return $ return $ + case rawattr of + Left syn -> B.rawBlock syn contents + Right attr -> B.codeBlockWith attr contents -- correctly handle github language identifiers toLanguageId :: String -> String toLanguageId = map toLower . go - where go "c++" = "cpp" + where go "c++" = "cpp" go "objective-c" = "objectivec" - go x = x + go x = x -codeBlockIndented :: MarkdownParser (F Blocks) +codeBlockIndented :: PandocMonad m => MarkdownParser m (F Blocks) codeBlockIndented = do contents <- many1 (indentedLine <|> try (do b <- blanklines @@ -701,7 +737,7 @@ codeBlockIndented = do return $ return $ B.codeBlockWith ("", classes, []) $ stripTrailingNewlines $ concat contents -lhsCodeBlock :: MarkdownParser (F Blocks) +lhsCodeBlock :: PandocMonad m => MarkdownParser m (F Blocks) lhsCodeBlock = do guardEnabled Ext_literate_haskell (return . B.codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> @@ -709,7 +745,7 @@ lhsCodeBlock = do <|> (return . B.codeBlockWith ("",["sourceCode","haskell"],[]) <$> lhsCodeBlockInverseBird) -lhsCodeBlockLaTeX :: MarkdownParser String +lhsCodeBlockLaTeX :: PandocMonad m => MarkdownParser m String lhsCodeBlockLaTeX = try $ do string "\\begin{code}" manyTill spaceChar newline @@ -717,13 +753,13 @@ lhsCodeBlockLaTeX = try $ do blanklines return $ stripTrailingNewlines contents -lhsCodeBlockBird :: MarkdownParser String +lhsCodeBlockBird :: PandocMonad m => MarkdownParser m String lhsCodeBlockBird = lhsCodeBlockBirdWith '>' -lhsCodeBlockInverseBird :: MarkdownParser String +lhsCodeBlockInverseBird :: PandocMonad m => MarkdownParser m String lhsCodeBlockInverseBird = lhsCodeBlockBirdWith '<' -lhsCodeBlockBirdWith :: Char -> MarkdownParser String +lhsCodeBlockBirdWith :: PandocMonad m => Char -> MarkdownParser m String lhsCodeBlockBirdWith c = try $ do pos <- getPosition when (sourceColumn pos /= 1) $ fail "Not in first column" @@ -735,7 +771,7 @@ lhsCodeBlockBirdWith c = try $ do blanklines return $ intercalate "\n" lns' -birdTrackLine :: Char -> Parser [Char] st String +birdTrackLine :: PandocMonad m => Char -> ParserT [Char] st m String birdTrackLine c = try $ do char c -- allow html tags on left margin: @@ -746,10 +782,10 @@ birdTrackLine c = try $ do -- block quotes -- -emailBlockQuoteStart :: MarkdownParser Char +emailBlockQuoteStart :: PandocMonad m => MarkdownParser m Char emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' <* optional (char ' ') -emailBlockQuote :: MarkdownParser [String] +emailBlockQuote :: PandocMonad m => MarkdownParser m [String] emailBlockQuote = try $ do emailBlockQuoteStart let emailLine = many $ nonEndline <|> try @@ -763,113 +799,136 @@ emailBlockQuote = try $ do optional blanklines return raw -blockQuote :: MarkdownParser (F Blocks) +blockQuote :: PandocMonad m => MarkdownParser m (F Blocks) blockQuote = do raw <- emailBlockQuote -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" + contents <- parseFromString' parseBlocks $ intercalate "\n" raw ++ "\n\n" return $ B.blockQuote <$> contents -- -- list blocks -- -bulletListStart :: MarkdownParser () +bulletListStart :: PandocMonad m => MarkdownParser m () bulletListStart = try $ do optional newline -- if preceded by a Plain block in a list context - startpos <- sourceColumn <$> getPosition skipNonindentSpaces notFollowedBy' (() <$ hrule) -- because hrules start out just like lists satisfy isBulletListMarker - endpos <- sourceColumn <$> getPosition - tabStop <- getOption readerTabStop - lookAhead (newline <|> spaceChar) - () <$ atMostSpaces (tabStop - (endpos - startpos)) + gobbleSpaces 1 <|> () <$ lookAhead newline + try (gobbleAtMostSpaces 3 >> notFollowedBy spaceChar) <|> return () -anyOrderedListStart :: MarkdownParser (Int, ListNumberStyle, ListNumberDelim) -anyOrderedListStart = try $ do +orderedListStart :: PandocMonad m + => Maybe (ListNumberStyle, ListNumberDelim) + -> MarkdownParser m (Int, ListNumberStyle, ListNumberDelim) +orderedListStart mbstydelim = try $ do optional newline -- if preceded by a Plain block in a list context - startpos <- sourceColumn <$> getPosition skipNonindentSpaces notFollowedBy $ string "p." >> spaceChar >> digit -- page number - res <- do guardDisabled Ext_fancy_lists - start <- many1 digit >>= safeRead - char '.' - return (start, DefaultStyle, DefaultDelim) - <|> do (num, style, delim) <- anyOrderedListMarker - -- if it could be an abbreviated first name, - -- insist on more than one space - when (delim == Period && (style == UpperAlpha || - (style == UpperRoman && - num `elem` [1, 5, 10, 50, 100, 500, 1000]))) $ - () <$ spaceChar - return (num, style, delim) - endpos <- sourceColumn <$> getPosition - tabStop <- getOption readerTabStop - lookAhead (newline <|> spaceChar) - atMostSpaces (tabStop - (endpos - startpos)) - return res - -listStart :: MarkdownParser () -listStart = bulletListStart <|> (anyOrderedListStart >> return ()) - -listLine :: MarkdownParser String -listLine = try $ do - notFollowedBy' (do indentSpaces - many spaceChar + (do guardDisabled Ext_fancy_lists + start <- many1 digit >>= safeRead + char '.' + gobbleSpaces 1 <|> () <$ lookAhead newline + optional $ try (gobbleAtMostSpaces 3 >> notFollowedBy spaceChar) + return (start, DefaultStyle, DefaultDelim)) + <|> + (do (num, style, delim) <- maybe + anyOrderedListMarker + (\(sty,delim) -> (\start -> (start,sty,delim)) <$> + orderedListMarker sty delim) + mbstydelim + gobbleSpaces 1 <|> () <$ lookAhead newline + -- if it could be an abbreviated first name, + -- insist on more than one space + when (delim == Period && (style == UpperAlpha || + (style == UpperRoman && + num `elem` [1, 5, 10, 50, 100, 500, 1000]))) $ + () <$ lookAhead (newline <|> spaceChar) + optional $ try (gobbleAtMostSpaces 3 >> notFollowedBy spaceChar) + return (num, style, delim)) + +listStart :: PandocMonad m => MarkdownParser m () +listStart = bulletListStart <|> Control.Monad.void (orderedListStart Nothing) + +listLine :: PandocMonad m => Int -> MarkdownParser m String +listLine continuationIndent = try $ do + notFollowedBy' (do gobbleSpaces continuationIndent + skipMany spaceChar listStart) notFollowedByHtmlCloser - optional (() <$ indentSpaces) + notFollowedByDivCloser + optional (() <$ gobbleSpaces continuationIndent) listLineCommon -listLineCommon :: MarkdownParser String +listLineCommon :: PandocMonad m => MarkdownParser m String listLineCommon = concat <$> manyTill ( many1 (satisfy $ \c -> c /= '\n' && c /= '<') - <|> liftM snd (htmlTag isCommentTag) + <|> fmap snd (htmlTag isCommentTag) <|> count 1 anyChar ) newline -- parse raw text for one list item, excluding start marker and continuations -rawListItem :: MarkdownParser a - -> MarkdownParser String -rawListItem start = try $ do +rawListItem :: PandocMonad m + => Bool -- four space rule + -> MarkdownParser m a + -> MarkdownParser m (String, Int) +rawListItem fourSpaceRule start = try $ do + pos1 <- getPosition start + pos2 <- getPosition + let continuationIndent = if fourSpaceRule + then 4 + else sourceColumn pos2 - sourceColumn pos1 first <- listLineCommon - rest <- many (notFollowedBy listStart >> notFollowedBy blankline >> listLine) + rest <- many (do notFollowedBy listStart + notFollowedBy (() <$ codeBlockFenced) + notFollowedBy blankline + listLine continuationIndent) blanks <- many blankline - return $ unlines (first:rest) ++ blanks + let result = unlines (first:rest) ++ blanks + return (result, continuationIndent) -- continuation of a list item - indented and separated by blankline -- or (in compact lists) endline. -- note: nested lists are parsed as continuations -listContinuation :: MarkdownParser String -listContinuation = try $ do - lookAhead indentSpaces - result <- many1 listContinuationLine +listContinuation :: PandocMonad m => Int -> MarkdownParser m String +listContinuation continuationIndent = try $ do + x <- try $ do + notFollowedBy blankline + notFollowedByHtmlCloser + notFollowedByDivCloser + gobbleSpaces continuationIndent + anyLineNewline + xs <- many $ try $ do + notFollowedBy blankline + notFollowedByHtmlCloser + notFollowedByDivCloser + gobbleSpaces continuationIndent <|> notFollowedBy' listStart + anyLineNewline blanks <- many blankline - return $ concat result ++ blanks + return $ concat (x:xs) ++ blanks + +notFollowedByDivCloser :: PandocMonad m => MarkdownParser m () +notFollowedByDivCloser = + guardDisabled Ext_fenced_divs <|> + do divLevel <- stateFencedDivLevel <$> getState + guard (divLevel < 1) <|> notFollowedBy divFenceEnd -notFollowedByHtmlCloser :: MarkdownParser () +notFollowedByHtmlCloser :: PandocMonad m => MarkdownParser m () notFollowedByHtmlCloser = do inHtmlBlock <- stateInHtmlBlock <$> getState case inHtmlBlock of Just t -> notFollowedBy' $ htmlTag (~== TagClose t) Nothing -> return () -listContinuationLine :: MarkdownParser String -listContinuationLine = try $ do - notFollowedBy blankline - notFollowedBy' listStart - notFollowedByHtmlCloser - optional indentSpaces - result <- anyLine - return $ result ++ "\n" - -listItem :: MarkdownParser a - -> MarkdownParser (F Blocks) -listItem start = try $ do - first <- rawListItem start - continuations <- many listContinuation +listItem :: PandocMonad m + => Bool -- four-space rule + -> MarkdownParser m a + -> MarkdownParser m (F Blocks) +listItem fourSpaceRule start = try $ do + (first, continuationIndent) <- rawListItem fourSpaceRule start + continuations <- many (listContinuation continuationIndent) -- parsing with ListItemState forces markers at beginning of lines to -- count as list item markers, even if not separated by blank space. -- see definition of "endline" @@ -878,39 +937,34 @@ listItem start = try $ do setState $ state {stateParserContext = ListItemState} -- parse the extracted block, which may contain various block elements: let raw = concat (first:continuations) - contents <- parseFromString parseBlocks raw + contents <- parseFromString' parseBlocks raw updateState (\st -> st {stateParserContext = oldContext}) return contents -orderedList :: MarkdownParser (F Blocks) +orderedList :: PandocMonad m => MarkdownParser m (F Blocks) orderedList = try $ do - (start, style, delim) <- lookAhead anyOrderedListStart + (start, style, delim) <- lookAhead (orderedListStart Nothing) unless (style `elem` [DefaultStyle, Decimal, Example] && delim `elem` [DefaultDelim, Period]) $ guardEnabled Ext_fancy_lists when (style == Example) $ guardEnabled Ext_example_lists - items <- fmap sequence $ many1 $ listItem - ( try $ do - optional newline -- if preceded by Plain block in a list - startpos <- sourceColumn <$> getPosition - skipNonindentSpaces - res <- orderedListMarker style delim - endpos <- sourceColumn <$> getPosition - tabStop <- getOption readerTabStop - lookAhead (newline <|> spaceChar) - atMostSpaces (tabStop - (endpos - startpos)) - return res ) - start' <- option 1 $ guardEnabled Ext_startnum >> return start - return $ B.orderedListWith (start', style, delim) <$> fmap compactify' items - -bulletList :: MarkdownParser (F Blocks) + fourSpaceRule <- (True <$ guardEnabled Ext_four_space_rule) + <|> return (style == Example) + items <- fmap sequence $ many1 $ listItem fourSpaceRule + (orderedListStart (Just (style, delim))) + start' <- (start <$ guardEnabled Ext_startnum) <|> return 1 + return $ B.orderedListWith (start', style, delim) <$> fmap compactify items + +bulletList :: PandocMonad m => MarkdownParser m (F Blocks) bulletList = do - items <- fmap sequence $ many1 $ listItem bulletListStart - return $ B.bulletList <$> fmap compactify' items + fourSpaceRule <- (True <$ guardEnabled Ext_four_space_rule) + <|> return False + items <- fmap sequence $ many1 $ listItem fourSpaceRule bulletListStart + return $ B.bulletList <$> fmap compactify items -- definition lists -defListMarker :: MarkdownParser () +defListMarker :: PandocMonad m => MarkdownParser m () defListMarker = do sps <- nonindentSpaces char ':' <|> char '~' @@ -921,52 +975,53 @@ defListMarker = do else mzero return () -definitionListItem :: Bool -> MarkdownParser (F (Inlines, [Blocks])) +definitionListItem :: PandocMonad m => Bool -> MarkdownParser m (F (Inlines, [Blocks])) definitionListItem compact = try $ do rawLine' <- anyLine raw <- many1 $ defRawBlock compact - term <- parseFromString (trimInlinesF . mconcat <$> many inline) rawLine' - contents <- mapM (parseFromString parseBlocks . (++"\n")) raw + term <- parseFromString' (trimInlinesF <$> inlines) rawLine' + contents <- mapM (parseFromString' parseBlocks . (++"\n")) raw optional blanklines return $ liftM2 (,) term (sequence contents) -defRawBlock :: Bool -> MarkdownParser String +defRawBlock :: PandocMonad m => Bool -> MarkdownParser m String defRawBlock compact = try $ do hasBlank <- option False $ blankline >> return True defListMarker - firstline <- anyLine + firstline <- anyLineNewline let dline = try ( do notFollowedBy blankline notFollowedByHtmlCloser + notFollowedByDivCloser if compact -- laziness not compatible with compact then () <$ indentSpaces else (() <$ indentSpaces) <|> notFollowedBy defListMarker anyLine ) rawlines <- many dline - cont <- liftM concat $ many $ try $ do + cont <- fmap concat $ many $ try $ do trailing <- option "" blanklines ln <- indentSpaces >> notFollowedBy blankline >> anyLine lns <- many dline return $ trailing ++ unlines (ln:lns) - return $ trimr (firstline ++ "\n" ++ unlines rawlines ++ cont) ++ + return $ trimr (firstline ++ unlines rawlines ++ cont) ++ if hasBlank || not (null cont) then "\n\n" else "" -definitionList :: MarkdownParser (F Blocks) +definitionList :: PandocMonad m => MarkdownParser m (F Blocks) definitionList = try $ do lookAhead (anyLine >> - optional (blankline >> notFollowedBy (table >> return ())) >> + optional (blankline >> notFollowedBy (Control.Monad.void table)) >> -- don't capture table caption as def list! defListMarker) compactDefinitionList <|> normalDefinitionList -compactDefinitionList :: MarkdownParser (F Blocks) +compactDefinitionList :: PandocMonad m => MarkdownParser m (F Blocks) compactDefinitionList = do guardEnabled Ext_compact_definition_lists items <- fmap sequence $ many1 $ definitionListItem True - return $ B.definitionList <$> fmap compactify'DL items + return $ B.definitionList <$> fmap compactifyDL items -normalDefinitionList :: MarkdownParser (F Blocks) +normalDefinitionList :: PandocMonad m => MarkdownParser m (F Blocks) normalDefinitionList = do guardEnabled Ext_definition_lists items <- fmap sequence $ many1 $ definitionListItem False @@ -976,10 +1031,10 @@ normalDefinitionList = do -- paragraph block -- -para :: MarkdownParser (F Blocks) +para :: PandocMonad m => MarkdownParser m (F Blocks) para = try $ do exts <- getOption readerExtensions - result <- trimInlinesF . mconcat <$> many1 inline + result <- trimInlinesF <$> inlines1 option (B.plain <$> result) $ try $ do newline @@ -997,29 +1052,35 @@ para = try $ do Just "div" -> () <$ lookAhead (htmlTag (~== TagClose "div")) _ -> mzero + <|> do guardEnabled Ext_fenced_divs + divLevel <- stateFencedDivLevel <$> getState + if divLevel > 0 + then lookAhead divFenceEnd + else mzero return $ do result' <- result case B.toList result' of [Image attr alt (src,tit)] - | Ext_implicit_figures `Set.member` exts -> + | not (null alt) && + Ext_implicit_figures `extensionEnabled` exts -> -- the fig: at beginning of title indicates a figure return $ B.para $ B.singleton $ Image attr alt (src,'f':'i':'g':':':tit) _ -> return $ B.para result' -plain :: MarkdownParser (F Blocks) -plain = fmap B.plain . trimInlinesF . mconcat <$> many1 inline +plain :: PandocMonad m => MarkdownParser m (F Blocks) +plain = fmap B.plain . trimInlinesF <$> inlines1 -- -- raw html -- -htmlElement :: MarkdownParser String +htmlElement :: PandocMonad m => MarkdownParser m String htmlElement = rawVerbatimBlock <|> strictHtmlBlock - <|> liftM snd (htmlTag isBlockTag) + <|> fmap snd (htmlTag isBlockTag) -htmlBlock :: MarkdownParser (F Blocks) +htmlBlock :: PandocMonad m => MarkdownParser m (F Blocks) htmlBlock = do guardEnabled Ext_raw_html try (do @@ -1044,43 +1105,59 @@ htmlBlock = do <|> (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks)) <|> htmlBlock' -htmlBlock' :: MarkdownParser (F Blocks) +htmlBlock' :: PandocMonad m => MarkdownParser m (F Blocks) htmlBlock' = try $ do first <- htmlElement skipMany spaceChar optional blanklines - return $ return $ B.rawBlock "html" first + return $ if null first + then mempty + else return $ B.rawBlock "html" first -strictHtmlBlock :: MarkdownParser String +strictHtmlBlock :: PandocMonad m => MarkdownParser m String strictHtmlBlock = htmlInBalanced (not . isInlineTag) -rawVerbatimBlock :: MarkdownParser String +rawVerbatimBlock :: PandocMonad m => MarkdownParser m String rawVerbatimBlock = htmlInBalanced isVerbTag where isVerbTag (TagOpen "pre" _) = True isVerbTag (TagOpen "style" _) = True isVerbTag (TagOpen "script" _) = True isVerbTag _ = False -rawTeXBlock :: MarkdownParser (F Blocks) +rawTeXBlock :: PandocMonad m => MarkdownParser m (F Blocks) rawTeXBlock = do guardEnabled Ext_raw_tex - result <- (B.rawBlock "latex" . concat <$> - rawLaTeXBlock `sepEndBy1` blankline) - <|> (B.rawBlock "context" . concat <$> - rawConTeXtEnvironment `sepEndBy1` blankline) - spaces - return $ return result - -rawHtmlBlocks :: MarkdownParser (F Blocks) + lookAhead $ try $ char '\\' >> letter + result <- (B.rawBlock "context" . trim . concat <$> + many1 ((++) <$> (rawConTeXtEnvironment <|> conTeXtCommand) + <*> spnl')) + <|> (B.rawBlock "latex" . trim . concat <$> + many1 ((++) <$> rawLaTeXBlock <*> spnl')) + return $ case B.toList result of + [RawBlock _ cs] + | all (`elem` [' ','\t','\n']) cs -> return mempty + -- don't create a raw block for suppressed macro defs + _ -> return result + +conTeXtCommand :: PandocMonad m => MarkdownParser m String +conTeXtCommand = oneOfStrings ["\\placeformula"] + +rawHtmlBlocks :: PandocMonad m => MarkdownParser m (F Blocks) rawHtmlBlocks = do (TagOpen tagtype _, raw) <- htmlTag isBlockTag + -- we don't want '<td> text' to be a code block: + skipMany spaceChar + indentlevel <- (blankline >> length <$> many (char ' ')) <|> return 0 -- try to find closing tag -- we set stateInHtmlBlock so that closing tags that can be either block or -- inline will not be parsed as inline tags oldInHtmlBlock <- stateInHtmlBlock <$> getState updateState $ \st -> st{ stateInHtmlBlock = Just tagtype } let closer = htmlTag (\x -> x ~== TagClose tagtype) - contents <- mconcat <$> many (notFollowedBy' closer >> block) + let block' = do notFollowedBy' closer + gobbleAtMostSpaces indentlevel + block + contents <- mconcat <$> many block' result <- (closer >>= \(_, rawcloser) -> return ( return (B.rawBlock "html" $ stripMarkdownAttribute raw) <> @@ -1101,11 +1178,11 @@ stripMarkdownAttribute s = renderTags' $ map filterAttrib $ parseTags s -- line block -- -lineBlock :: MarkdownParser (F Blocks) +lineBlock :: PandocMonad m => MarkdownParser m (F Blocks) lineBlock = try $ do guardEnabled Ext_line_blocks lines' <- lineBlockLines >>= - mapM (parseFromString (trimInlinesF . mconcat <$> many inline)) + mapM (parseFromString' (trimInlinesF <$> inlines)) return $ B.lineBlock <$> sequence lines' -- @@ -1114,8 +1191,9 @@ lineBlock = try $ do -- Parse a dashed line with optional trailing spaces; return its length -- and the length including trailing space. -dashedLine :: Char - -> Parser [Char] st (Int, Int) +dashedLine :: PandocMonad m + => Char + -> ParserT [Char] st m (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many spaceChar @@ -1125,8 +1203,9 @@ dashedLine ch = do -- Parse a table header with dashed lines of '-' preceded by -- one (or zero) line of text. -simpleTableHeader :: Bool -- ^ Headerless table - -> MarkdownParser (F [Blocks], [Alignment], [Int]) +simpleTableHeader :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m (F [Blocks], [Alignment], [Int]) simpleTableHeader headless = try $ do rawContent <- if headless then return "" @@ -1137,17 +1216,17 @@ simpleTableHeader headless = try $ do let (lengths, lines') = unzip dashes let indices = scanl (+) (length initSp) lines' -- If no header, calculate alignment on basis of first row of text - rawHeads <- liftM (tail . splitStringByIndices (init indices)) $ + rawHeads <- fmap (tail . splitStringByIndices (init indices)) $ if headless then lookAhead anyLine else return rawContent - let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths + let aligns = zipWith alignType (map (: []) rawHeads) lengths let rawHeads' = if headless then replicate (length dashes) "" else rawHeads heads <- fmap sequence - $ mapM (parseFromString (mconcat <$> many plain)) - $ map trim rawHeads' + $ + mapM ((parseFromString' (mconcat <$> many plain)).trim) rawHeads' return (heads, aligns, indices) -- Returns an alignment type for a table, based on a list of strings @@ -1161,25 +1240,26 @@ alignType strLst len = let nonempties = filter (not . null) $ map trimr strLst (leftSpace, rightSpace) = case sortBy (comparing length) nonempties of - (x:_) -> (head x `elem` " \t", length x < len) - [] -> (False, False) + (x:_) -> (head x `elem` " \t", length x < len) + [] -> (False, False) in case (leftSpace, rightSpace) of - (True, False) -> AlignRight - (False, True) -> AlignLeft - (True, True) -> AlignCenter - (False, False) -> AlignDefault + (True, False) -> AlignRight + (False, True) -> AlignLeft + (True, True) -> AlignCenter + (False, False) -> AlignDefault -- Parse a table footer - dashed lines followed by blank line. -tableFooter :: MarkdownParser String +tableFooter :: PandocMonad m => MarkdownParser m String tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines -- Parse a table separator - dashed line. -tableSep :: MarkdownParser Char +tableSep :: PandocMonad m => MarkdownParser m Char tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n' -- Parse a raw line and split it into chunks by indices. -rawTableLine :: [Int] - -> MarkdownParser [String] +rawTableLine :: PandocMonad m + => [Int] + -> MarkdownParser m [String] rawTableLine indices = do notFollowedBy' (blanklines <|> tableFooter) line <- many1Till anyChar newline @@ -1187,31 +1267,34 @@ rawTableLine indices = do splitStringByIndices (init indices) line -- Parse a table line and return a list of lists of blocks (columns). -tableLine :: [Int] - -> MarkdownParser (F [Blocks]) +tableLine :: PandocMonad m + => [Int] + -> MarkdownParser m (F [Blocks]) tableLine indices = rawTableLine indices >>= - fmap sequence . mapM (parseFromString (mconcat <$> many plain)) + fmap sequence . mapM (parseFromString' (mconcat <$> many plain)) -- Parse a multiline table row and return a list of blocks (columns). -multilineRow :: [Int] - -> MarkdownParser (F [Blocks]) +multilineRow :: PandocMonad m + => [Int] + -> MarkdownParser m (F [Blocks]) multilineRow indices = do colLines <- many1 (rawTableLine indices) let cols = map unlines $ transpose colLines - fmap sequence $ mapM (parseFromString (mconcat <$> many plain)) cols + fmap sequence $ mapM (parseFromString' (mconcat <$> many plain)) cols -- Parses a table caption: inlines beginning with 'Table:' -- and followed by blank lines. -tableCaption :: MarkdownParser (F Inlines) +tableCaption :: PandocMonad m => MarkdownParser m (F Inlines) tableCaption = try $ do guardEnabled Ext_table_captions skipNonindentSpaces - string ":" <|> string "Table:" - trimInlinesF . mconcat <$> many1 inline <* blanklines + (string ":" <* notFollowedBy (satisfy isPunctuation)) <|> string "Table:" + trimInlinesF <$> inlines1 <* blanklines -- Parse a simple table with '---' header and one line per row. -simpleTable :: Bool -- ^ Headerless table - -> MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) +simpleTable :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) simpleTable headless = do (aligns, _widths, heads', lines') <- tableWith (simpleTableHeader headless) tableLine @@ -1224,13 +1307,15 @@ simpleTable headless = do -- (which may be multiline), then the rows, -- which may be multiline, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). -multilineTable :: Bool -- ^ Headerless table - -> MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) +multilineTable :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) multilineTable headless = tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter -multilineTableHeader :: Bool -- ^ Headerless table - -> MarkdownParser (F [Blocks], [Alignment], [Int]) +multilineTableHeader :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m (F [Blocks], [Alignment], [Int]) multilineTableHeader headless = try $ do unless headless $ tableSep >> notFollowedBy blankline @@ -1243,7 +1328,7 @@ multilineTableHeader headless = try $ do let (lengths, lines') = unzip dashes let indices = scanl (+) (length initSp) lines' rawHeadsList <- if headless - then liftM (map (:[]) . tail . + then fmap (map (:[]) . tail . splitStringByIndices (init indices)) $ lookAhead anyLine else return $ transpose $ map (tail . splitStringByIndices (init indices)) @@ -1253,101 +1338,18 @@ multilineTableHeader headless = try $ do then replicate (length dashes) "" else map (unlines . map trim) rawHeadsList heads <- fmap sequence $ - mapM (parseFromString (mconcat <$> many plain)) $ - map trim rawHeads + mapM ((parseFromString' (mconcat <$> many plain)).trim) rawHeads return (heads, aligns, indices) -- Parse a grid table: starts with row of '-' on top, then header -- (which may be grid), then the rows, -- which may be grid, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). -gridTable :: Bool -- ^ Headerless table - -> MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) -gridTable headless = - tableWith (gridTableHeader headless) gridTableRow - (gridTableSep '-') gridTableFooter - -gridTableSplitLine :: [Int] -> String -> [String] -gridTableSplitLine indices line = map removeFinalBar $ tail $ - splitStringByIndices (init indices) $ trimr line - -gridPart :: Char -> Parser [Char] st ((Int, Int), Alignment) -gridPart ch = do - leftColon <- option False (True <$ char ':') - dashes <- many1 (char ch) - rightColon <- option False (True <$ char ':') - char '+' - let lengthDashes = length dashes + (if leftColon then 1 else 0) + - (if rightColon then 1 else 0) - let alignment = case (leftColon, rightColon) of - (True, True) -> AlignCenter - (True, False) -> AlignLeft - (False, True) -> AlignRight - (False, False) -> AlignDefault - return ((lengthDashes, lengthDashes + 1), alignment) - -gridDashedLines :: Char -> Parser [Char] st [((Int, Int), Alignment)] -gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline - -removeFinalBar :: String -> String -removeFinalBar = - reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse - --- | Separator between rows of grid table. -gridTableSep :: Char -> MarkdownParser Char -gridTableSep ch = try $ gridDashedLines ch >> return '\n' - --- | Parse header for a grid table. -gridTableHeader :: Bool -- ^ Headerless table - -> MarkdownParser (F [Blocks], [Alignment], [Int]) -gridTableHeader headless = try $ do - optional blanklines - dashes <- gridDashedLines '-' - rawContent <- if headless - then return [] - else many1 (try (char '|' >> anyLine)) - underDashes <- if headless - then return dashes - else gridDashedLines '=' - guard $ length dashes == length underDashes - let lines' = map (snd . fst) underDashes - let indices = scanl (+) 0 lines' - let aligns = map snd underDashes - let rawHeads = if headless - then replicate (length underDashes) "" - else map (unlines . map trim) $ transpose - $ map (gridTableSplitLine indices) rawContent - heads <- fmap sequence $ mapM (parseFromString parseBlocks . trim) rawHeads - return (heads, aligns, indices) +gridTable :: PandocMonad m => Bool -- ^ Headerless table + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) +gridTable headless = gridTableWith' parseBlocks headless -gridTableRawLine :: [Int] -> MarkdownParser [String] -gridTableRawLine indices = do - char '|' - line <- anyLine - return (gridTableSplitLine indices line) - --- | Parse row of grid table. -gridTableRow :: [Int] - -> MarkdownParser (F [Blocks]) -gridTableRow indices = do - colLines <- many1 (gridTableRawLine indices) - let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ - transpose colLines - fmap compactify' <$> fmap sequence (mapM (parseFromString parseBlocks) cols) - -removeOneLeadingSpace :: [String] -> [String] -removeOneLeadingSpace xs = - if all startsWithSpace xs - then map (drop 1) xs - else xs - where startsWithSpace "" = True - startsWithSpace (y:_) = y == ' ' - --- | Parse footer for a grid table. -gridTableFooter :: MarkdownParser [Char] -gridTableFooter = blanklines - -pipeBreak :: MarkdownParser ([Alignment], [Int]) +pipeBreak :: PandocMonad m => MarkdownParser m ([Alignment], [Int]) pipeBreak = try $ do nonindentSpaces openPipe <- (True <$ char '|') <|> return False @@ -1359,7 +1361,7 @@ pipeBreak = try $ do blankline return $ unzip (first:rest) -pipeTable :: MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) +pipeTable :: PandocMonad m => MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) pipeTable = try $ do nonindentSpaces lookAhead nonspaceChar @@ -1372,41 +1374,40 @@ pipeTable = try $ do numColumns <- getOption readerColumns let widths = if maxlength > numColumns then map (\len -> - fromIntegral (len + 1) / fromIntegral numColumns) - seplengths + fromIntegral len / fromIntegral (sum seplengths)) + seplengths else replicate (length aligns) 0.0 - return $ (aligns, widths, heads', sequence lines'') + return (aligns, widths, heads', sequence lines'') -sepPipe :: MarkdownParser () +sepPipe :: PandocMonad m => MarkdownParser m () sepPipe = try $ do char '|' <|> char '+' notFollowedBy blankline -- parse a row, also returning probable alignments for org-table cells -pipeTableRow :: MarkdownParser (F [Blocks]) +pipeTableRow :: PandocMonad m => MarkdownParser m (F [Blocks]) pipeTableRow = try $ do scanForPipe skipMany spaceChar openPipe <- (True <$ char '|') <|> return False -- split into cells - let chunk = void (code <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline') + let chunk = void (code <|> math <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline') <|> void (noneOf "|\n\r") let cellContents = ((trim . snd) <$> withRaw (many chunk)) >>= - parseFromString pipeTableCell - cells <- cellContents `sepEndBy1` (char '|') + parseFromString' pipeTableCell + cells <- cellContents `sepEndBy1` char '|' -- surrounding pipes needed for a one-column table: guard $ not (length cells == 1 && not openPipe) blankline return $ sequence cells -pipeTableCell :: MarkdownParser (F Blocks) -pipeTableCell = do - result <- many inline - if null result - then return mempty - else return $ B.plain . mconcat <$> sequence result +pipeTableCell :: PandocMonad m => MarkdownParser m (F Blocks) +pipeTableCell = + (do result <- inlines1 + return $ B.plain <$> result) + <|> return mempty -pipeTableHeaderPart :: Parser [Char] st (Alignment, Int) +pipeTableHeaderPart :: PandocMonad m => ParserT [Char] st m (Alignment, Int) pipeTableHeaderPart = try $ do skipMany spaceChar left <- optionMaybe (char ':') @@ -1414,15 +1415,15 @@ pipeTableHeaderPart = try $ do right <- optionMaybe (char ':') skipMany spaceChar let len = length pipe + maybe 0 (const 1) left + maybe 0 (const 1) right - return $ - ((case (left,right) of - (Nothing,Nothing) -> AlignDefault - (Just _,Nothing) -> AlignLeft - (Nothing,Just _) -> AlignRight - (Just _,Just _) -> AlignCenter), len) + return + (case (left,right) of + (Nothing,Nothing) -> AlignDefault + (Just _,Nothing) -> AlignLeft + (Nothing,Just _) -> AlignRight + (Just _,Just _) -> AlignCenter, len) -- Succeed only if current line contains a pipe. -scanForPipe :: Parser [Char] st () +scanForPipe :: PandocMonad m => ParserT [Char] st m () scanForPipe = do inp <- getInput case break (\c -> c == '\n' || c == '|') inp of @@ -1432,22 +1433,23 @@ scanForPipe = do -- | Parse a table using 'headerParser', 'rowParser', -- 'lineParser', and 'footerParser'. Variant of the version in -- Text.Pandoc.Parsing. -tableWith :: MarkdownParser (F [Blocks], [Alignment], [Int]) - -> ([Int] -> MarkdownParser (F [Blocks])) - -> MarkdownParser sep - -> MarkdownParser end - -> MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) +tableWith :: PandocMonad m + => MarkdownParser m (F [Blocks], [Alignment], [Int]) + -> ([Int] -> MarkdownParser m (F [Blocks])) + -> MarkdownParser m sep + -> MarkdownParser m end + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) tableWith headerParser rowParser lineParser footerParser = try $ do (heads, aligns, indices) <- headerParser lines' <- fmap sequence $ rowParser indices `sepEndBy1` lineParser footerParser numColumns <- getOption readerColumns - let widths = if (indices == []) + let widths = if null indices then replicate (length aligns) 0.0 else widthsFromIndices numColumns indices - return $ (aligns, widths, heads, lines') + return (aligns, widths, heads, lines') -table :: MarkdownParser (F Blocks) +table :: PandocMonad m => MarkdownParser m (F Blocks) table = try $ do frontCaption <- option Nothing (Just <$> tableCaption) (aligns, widths, heads, lns) <- @@ -1462,8 +1464,8 @@ table = try $ do (gridTable False <|> gridTable True)) <?> "table" optional blanklines caption <- case frontCaption of - Nothing -> option (return mempty) tableCaption - Just c -> return c + Nothing -> option (return mempty) tableCaption + Just c -> return c -- renormalize widths if greater than 100%: let totalWidth = sum widths let widths' = if totalWidth < 1 @@ -1479,7 +1481,13 @@ table = try $ do -- inline -- -inline :: MarkdownParser (F Inlines) +inlines :: PandocMonad m => MarkdownParser m (F Inlines) +inlines = mconcat <$> many inline + +inlines1 :: PandocMonad m => MarkdownParser m (F Inlines) +inlines1 = mconcat <$> many1 inline + +inline :: PandocMonad m => MarkdownParser m (F Inlines) inline = choice [ whitespace , bareURL , str @@ -1499,6 +1507,7 @@ inline = choice [ whitespace , autoLink , spanHtml , rawHtmlInline + , escapedNewline , escapedChar , rawLaTeXInline' , exampleRef @@ -1509,32 +1518,36 @@ inline = choice [ whitespace , ltSign ] <?> "inline" -escapedChar' :: MarkdownParser Char +escapedChar' :: PandocMonad m => MarkdownParser m Char escapedChar' = try $ do char '\\' (guardEnabled Ext_all_symbols_escapable >> satisfy (not . isAlphaNum)) <|> (guardEnabled Ext_angle_brackets_escapable >> oneOf "\\`*_{}[]()>#+-.!~\"<>") - <|> (guardEnabled Ext_escaped_line_breaks >> char '\n') <|> oneOf "\\`*_{}[]()>#+-.!~\"" -escapedChar :: MarkdownParser (F Inlines) +escapedNewline :: PandocMonad m => MarkdownParser m (F Inlines) +escapedNewline = try $ do + guardEnabled Ext_escaped_line_breaks + char '\\' + lookAhead (char '\n') -- don't consume the newline (see #3730) + return $ return B.linebreak + +escapedChar :: PandocMonad m => MarkdownParser m (F Inlines) escapedChar = do result <- escapedChar' case result of - ' ' -> return $ return $ B.str "\160" -- "\ " is a nonbreaking space - '\n' -> guardEnabled Ext_escaped_line_breaks >> - return (return B.linebreak) -- "\[newline]" is a linebreak - _ -> return $ return $ B.str [result] + ' ' -> return $ return $ B.str "\160" -- "\ " is a nonbreaking space + _ -> return $ return $ B.str [result] -ltSign :: MarkdownParser (F Inlines) +ltSign :: PandocMonad m => MarkdownParser m (F Inlines) ltSign = do guardDisabled Ext_raw_html <|> (notFollowedByHtmlCloser >> notFollowedBy' (htmlTag isBlockTag)) char '<' return $ return $ B.str "<" -exampleRef :: MarkdownParser (F Inlines) +exampleRef :: PandocMonad m => MarkdownParser m (F Inlines) exampleRef = try $ do guardEnabled Ext_example_lists char '@' @@ -1542,10 +1555,10 @@ exampleRef = try $ do return $ do st <- askF return $ case M.lookup lab (stateExamples st) of - Just n -> B.str (show n) - Nothing -> B.str ('@':lab) + Just n -> B.str (show n) + Nothing -> B.str ('@':lab) -symbol :: MarkdownParser (F Inlines) +symbol :: PandocMonad m => MarkdownParser m (F Inlines) symbol = do result <- noneOf "<\\\n\t " <|> try (do lookAhead $ char '\\' @@ -1554,28 +1567,36 @@ symbol = do return $ return $ B.str [result] -- parses inline code, between n `s and n `s -code :: MarkdownParser (F Inlines) +code :: PandocMonad m => MarkdownParser m (F Inlines) code = try $ do starts <- many1 (char '`') skipSpaces - result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> + result <- (trim . concat) <$> + many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> (char '\n' >> notFollowedBy' blankline >> return " ")) (try (skipSpaces >> count (length starts) (char '`') >> notFollowedBy (char '`'))) - attr <- option ([],[],[]) (try $ guardEnabled Ext_inline_code_attributes - >> attributes) - return $ return $ B.codeWith attr $ trim $ concat result - -math :: MarkdownParser (F Inlines) -math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) - <|> (return . B.math <$> (mathInline >>= applyMacros')) <+?> - ((getOption readerSmart >>= guard) *> (return <$> apostrophe) + rawattr <- + (Left <$> try (guardEnabled Ext_raw_attribute >> rawAttribute)) + <|> + (Right <$> option ("",[],[]) + (try (guardEnabled Ext_inline_code_attributes >> attributes))) + return $ return $ + case rawattr of + Left syn -> B.rawInline syn result + Right attr -> B.codeWith attr result + +math :: PandocMonad m => MarkdownParser m (F Inlines) +math = (return . B.displayMath <$> (mathDisplay >>= applyMacros)) + <|> (return . B.math <$> (mathInline >>= applyMacros)) <+?> + (guardEnabled Ext_smart *> (return <$> apostrophe) <* notFollowedBy (space <|> satisfy isPunctuation)) -- Parses material enclosed in *s, **s, _s, or __s. -- Designed to avoid backtracking. -enclosure :: Char - -> MarkdownParser (F Inlines) +enclosure :: PandocMonad m + => Char + -> MarkdownParser m (F Inlines) enclosure c = do -- we can't start an enclosure with _ if after a string and -- the intraword_underscores extension is enabled: @@ -1584,14 +1605,14 @@ enclosure c = do <|> (guard =<< notAfterString) cs <- many1 (char c) (return (B.str cs) <>) <$> whitespace - <|> do + <|> case length cs of - 3 -> three c - 2 -> two c mempty - 1 -> one c mempty - _ -> return (return $ B.str cs) + 3 -> three c + 2 -> two c mempty + 1 -> one c mempty + _ -> return (return $ B.str cs) -ender :: Char -> Int -> MarkdownParser () +ender :: PandocMonad m => Char -> Int -> MarkdownParser m () ender c n = try $ do count n (char c) guard (c == '*') @@ -1602,104 +1623,96 @@ ender c n = try $ do -- If one c, emit emph and then parse two. -- If two cs, emit strong and then parse one. -- Otherwise, emit ccc then the results. -three :: Char -> MarkdownParser (F Inlines) +three :: PandocMonad m => Char -> MarkdownParser m (F Inlines) three c = do contents <- mconcat <$> many (notFollowedBy (ender c 1) >> inline) - (ender c 3 >> return ((B.strong . B.emph) <$> contents)) - <|> (ender c 2 >> one c (B.strong <$> contents)) - <|> (ender c 1 >> two c (B.emph <$> contents)) + (ender c 3 >> updateLastStrPos >> return ((B.strong . B.emph) <$> contents)) + <|> (ender c 2 >> updateLastStrPos >> one c (B.strong <$> contents)) + <|> (ender c 1 >> updateLastStrPos >> two c (B.emph <$> contents)) <|> return (return (B.str [c,c,c]) <> contents) -- Parse inlines til you hit two c's, and emit strong. -- If you never do hit two cs, emit ** plus inlines parsed. -two :: Char -> F Inlines -> MarkdownParser (F Inlines) +two :: PandocMonad m => Char -> F Inlines -> MarkdownParser m (F Inlines) two c prefix' = do contents <- mconcat <$> many (try $ notFollowedBy (ender c 2) >> inline) - (ender c 2 >> return (B.strong <$> (prefix' <> contents))) + (ender c 2 >> updateLastStrPos >> + return (B.strong <$> (prefix' <> contents))) <|> return (return (B.str [c,c]) <> (prefix' <> contents)) -- Parse inlines til you hit a c, and emit emph. -- If you never hit a c, emit * plus inlines parsed. -one :: Char -> F Inlines -> MarkdownParser (F Inlines) +one :: PandocMonad m => Char -> F Inlines -> MarkdownParser m (F Inlines) one c prefix' = do contents <- mconcat <$> many ( (notFollowedBy (ender c 1) >> inline) <|> try (string [c,c] >> notFollowedBy (ender c 1) >> two c mempty) ) - (ender c 1 >> return (B.emph <$> (prefix' <> contents))) + (ender c 1 >> updateLastStrPos >> return (B.emph <$> (prefix' <> contents))) <|> return (return (B.str [c]) <> (prefix' <> contents)) -strongOrEmph :: MarkdownParser (F Inlines) +strongOrEmph :: PandocMonad m => MarkdownParser m (F Inlines) strongOrEmph = enclosure '*' <|> enclosure '_' -- | Parses a list of inlines between start and end delimiters. -inlinesBetween :: (Show b) - => MarkdownParser a - -> MarkdownParser b - -> MarkdownParser (F Inlines) +inlinesBetween :: PandocMonad m + => (Show b) + => MarkdownParser m a + -> MarkdownParser m b + -> MarkdownParser m (F Inlines) inlinesBetween start end = (trimInlinesF . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) innerSpace = try $ whitespace <* notFollowedBy' end -strikeout :: MarkdownParser (F Inlines) +strikeout :: PandocMonad m => MarkdownParser m (F Inlines) strikeout = fmap B.strikeout <$> (guardEnabled Ext_strikeout >> inlinesBetween strikeStart strikeEnd) where strikeStart = string "~~" >> lookAhead nonspaceChar >> notFollowedBy (char '~') strikeEnd = try $ string "~~" -superscript :: MarkdownParser (F Inlines) +superscript :: PandocMonad m => MarkdownParser m (F Inlines) superscript = fmap B.superscript <$> try (do guardEnabled Ext_superscript char '^' mconcat <$> many1Till (notFollowedBy spaceChar >> inline) (char '^')) -subscript :: MarkdownParser (F Inlines) +subscript :: PandocMonad m => MarkdownParser m (F Inlines) subscript = fmap B.subscript <$> try (do guardEnabled Ext_subscript char '~' mconcat <$> many1Till (notFollowedBy spaceChar >> inline) (char '~')) -whitespace :: MarkdownParser (F Inlines) +whitespace :: PandocMonad m => MarkdownParser m (F Inlines) whitespace = spaceChar >> return <$> (lb <|> regsp) <?> "whitespace" where lb = spaceChar >> skipMany spaceChar >> option B.space (endline >> return B.linebreak) regsp = skipMany spaceChar >> return B.space -nonEndline :: Parser [Char] st Char +nonEndline :: PandocMonad m => ParserT [Char] st m Char nonEndline = satisfy (/='\n') -str :: MarkdownParser (F Inlines) +str :: PandocMonad m => MarkdownParser m (F Inlines) str = do - result <- many1 alphaNum + result <- many1 (alphaNum <|> try (char '.' <* notFollowedBy (char '.'))) updateLastStrPos - let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) - isSmart <- getOption readerSmart - if isSmart - then case likelyAbbrev result of - [] -> return $ return $ B.str result - xs -> choice (map (\x -> - try (string x >> oneOf " \n" >> - lookAhead alphaNum >> - return (return $ B.str - $ result ++ spacesToNbr x ++ "\160"))) xs) - <|> (return $ return $ B.str result) - else return $ return $ B.str result - --- | if the string matches the beginning of an abbreviation (before --- the first period, return strings that would finish the abbreviation. -likelyAbbrev :: String -> [String] -likelyAbbrev x = - let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.", - "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.", - "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", - "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.", - "ch.", "sec.", "cf.", "cp."] - abbrPairs = map (break (=='.')) abbrevs - in map snd $ filter (\(y,_) -> y == x) abbrPairs + (do guardEnabled Ext_smart + abbrevs <- getOption readerAbbreviations + if not (null result) && last result == '.' && result `Set.member` abbrevs + then try (do ils <- whitespace <|> endline + lookAhead alphaNum + return $ do + ils' <- ils + if ils' == B.space + then return (B.str result <> B.str "\160") + else -- linebreak or softbreak + return (ils' <> B.str result <> B.str "\160")) + <|> return (return (B.str result)) + else return (return (B.str result))) + <|> return (return (B.str result)) -- an endline character that can be treated as a space, not a structural break -endline :: MarkdownParser (F Inlines) +endline :: PandocMonad m => MarkdownParser m (F Inlines) endline = try $ do newline notFollowedBy blankline @@ -1711,6 +1724,7 @@ endline = try $ do guardDisabled Ext_backtick_code_blocks <|> notFollowedBy (() <$ (lookAhead (char '`') >> codeBlockFenced)) notFollowedByHtmlCloser + notFollowedByDivCloser (eof >> return mempty) <|> (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) <|> (guardEnabled Ext_ignore_line_breaks >> return mempty) @@ -1721,23 +1735,25 @@ endline = try $ do -- -- a reference label for a link -reference :: MarkdownParser (F Inlines, String) -reference = do notFollowedBy' (string "[^") -- footnote reference - withRaw $ trimInlinesF <$> inlinesInBalancedBrackets +reference :: PandocMonad m => MarkdownParser m (F Inlines, String) +reference = do + guardDisabled Ext_footnotes <|> notFollowedBy' (string "[^") + guardDisabled Ext_citations <|> notFollowedBy' (string "[@") + withRaw $ trimInlinesF <$> inlinesInBalancedBrackets -parenthesizedChars :: MarkdownParser [Char] +parenthesizedChars :: PandocMonad m => MarkdownParser m [Char] parenthesizedChars = do result <- charsInBalanced '(' ')' litChar return $ '(' : result ++ ")" -- source for a link, with optional title -source :: MarkdownParser (String, String) +source :: PandocMonad m => MarkdownParser m (String, String) source = do char '(' skipSpaces let urlChunk = try parenthesizedChars - <|> (notFollowedBy (oneOf " )") >> (count 1 litChar)) + <|> (notFollowedBy (oneOf " )") >> count 1 litChar) <|> try (many1 spaceChar <* notFollowedBy (oneOf "\"')")) let sourceURL = (unwords . words . concat) <$> many urlChunk let betweenAngles = try $ @@ -1748,10 +1764,10 @@ source = do char ')' return (escapeURI $ trimr src, tit) -linkTitle :: MarkdownParser String +linkTitle :: PandocMonad m => MarkdownParser m String linkTitle = quotedTitle '"' <|> quotedTitle '\'' -link :: MarkdownParser (F Inlines) +link :: PandocMonad m => MarkdownParser m (F Inlines) link = try $ do st <- getState guard $ stateAllowLinks st @@ -1760,21 +1776,31 @@ link = try $ do setState $ st{ stateAllowLinks = True } regLink B.linkWith lab <|> referenceLink B.linkWith (lab,raw) -bracketedSpan :: MarkdownParser (F Inlines) +bracketedSpan :: PandocMonad m => MarkdownParser m (F Inlines) bracketedSpan = try $ do guardEnabled Ext_bracketed_spans (lab,_) <- reference attr <- attributes - let (ident,classes,keyvals) = attr - case lookup "style" keyvals of - Just s | null ident && null classes && - map toLower (filter (`notElem` " \t;") s) == - "font-variant:small-caps" - -> return $ B.smallcaps <$> lab - _ -> return $ B.spanWith attr <$> lab - -regLink :: (Attr -> String -> String -> Inlines -> Inlines) - -> F Inlines -> MarkdownParser (F Inlines) + return $ if isSmallCaps attr + then B.smallcaps <$> lab + else B.spanWith attr <$> lab + +-- | We treat a span as SmallCaps if class is "smallcaps" (and +-- no other attributes are set or if style is "font-variant:small-caps" +-- (and no other attributes are set). +isSmallCaps :: Attr -> Bool +isSmallCaps ("",["smallcaps"],[]) = True +isSmallCaps ("",[],kvs) = + case lookup "style" kvs of + Just s -> map toLower (filter (`notElem` " \t;") s) == + "font-variant:small-caps" + Nothing -> False +isSmallCaps _ = False + +regLink :: PandocMonad m + => (Attr -> String -> String -> Inlines -> Inlines) + -> F Inlines + -> MarkdownParser m (F Inlines) regLink constructor lab = try $ do (src, tit) <- source attr <- option nullAttr $ @@ -1782,20 +1808,24 @@ regLink constructor lab = try $ do return $ constructor attr src tit <$> lab -- a link like [this][ref] or [this][] or [this] -referenceLink :: (Attr -> String -> String -> Inlines -> Inlines) - -> (F Inlines, String) -> MarkdownParser (F Inlines) +referenceLink :: PandocMonad m + => (Attr -> String -> String -> Inlines -> Inlines) + -> (F Inlines, String) + -> MarkdownParser m (F Inlines) referenceLink constructor (lab, raw) = do sp <- (True <$ lookAhead (char ' ')) <|> return False (_,raw') <- option (mempty, "") $ - lookAhead (try (guardEnabled Ext_citations >> - spnl >> normalCite >> return (mempty, ""))) + lookAhead (try (do guardEnabled Ext_citations + guardDisabled Ext_spaced_reference_links <|> spnl + normalCite + return (mempty, ""))) <|> - try (spnl >> reference) + try ((guardDisabled Ext_spaced_reference_links <|> spnl) >> reference) when (raw' == "") $ guardEnabled Ext_shortcut_reference_links let labIsRef = raw' == "" || raw' == "[]" let key = toKey $ if labIsRef then raw else raw' - parsedRaw <- parseFromString (mconcat <$> many inline) raw' - fallback <- parseFromString (mconcat <$> many inline) $ dropBrackets raw + parsedRaw <- parseFromString' inlines raw' + fallback <- parseFromString' inlines $ dropBrackets raw implicitHeaderRefs <- option False $ True <$ guardEnabled Ext_implicit_header_references let makeFallback = do @@ -1820,11 +1850,11 @@ referenceLink constructor (lab, raw) = do dropBrackets :: String -> String dropBrackets = reverse . dropRB . reverse . dropLB where dropRB (']':xs) = xs - dropRB xs = xs + dropRB xs = xs dropLB ('[':xs) = xs - dropLB xs = xs + dropLB xs = xs -bareURL :: MarkdownParser (F Inlines) +bareURL :: PandocMonad m => MarkdownParser m (F Inlines) bareURL = try $ do guardEnabled Ext_autolink_bare_uris getState >>= guard . stateAllowLinks @@ -1832,7 +1862,7 @@ bareURL = try $ do notFollowedBy $ try $ spaces >> htmlTag (~== TagClose "a") return $ return $ B.link src "" (B.str orig) -autoLink :: MarkdownParser (F Inlines) +autoLink :: PandocMonad m => MarkdownParser m (F Inlines) autoLink = try $ do getState >>= guard . stateAllowLinks char '<' @@ -1846,7 +1876,7 @@ autoLink = try $ do guardEnabled Ext_link_attributes >> attributes return $ return $ B.linkWith attr (src ++ escapeURI extra) "" (B.str $ orig ++ extra) -image :: MarkdownParser (F Inlines) +image :: PandocMonad m => MarkdownParser m (F Inlines) image = try $ do char '!' (lab,raw) <- reference @@ -1856,54 +1886,55 @@ image = try $ do _ -> B.imageWith attr' src regLink constructor lab <|> referenceLink constructor (lab,raw) -note :: MarkdownParser (F Inlines) +note :: PandocMonad m => MarkdownParser m (F Inlines) note = try $ do guardEnabled Ext_footnotes ref <- noteMarker + updateState $ \st -> st{ stateNoteRefs = Set.insert ref (stateNoteRefs st) } return $ do notes <- asksF stateNotes' - case lookup ref notes of + case M.lookup ref notes of Nothing -> return $ B.str $ "[^" ++ ref ++ "]" - Just contents -> do + Just (_pos, contents) -> do st <- askF -- process the note in a context that doesn't resolve -- notes, to avoid infinite looping with notes inside -- notes: - let contents' = runF contents st{ stateNotes' = [] } + let contents' = runF contents st{ stateNotes' = M.empty } return $ B.note contents' -inlineNote :: MarkdownParser (F Inlines) +inlineNote :: PandocMonad m => MarkdownParser m (F Inlines) inlineNote = try $ do guardEnabled Ext_inline_notes char '^' contents <- inlinesInBalancedBrackets return $ B.note . B.para <$> contents -rawLaTeXInline' :: MarkdownParser (F Inlines) +rawLaTeXInline' :: PandocMonad m => MarkdownParser m (F Inlines) rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex - lookAhead $ char '\\' >> notFollowedBy' (string "start") -- context env - RawInline _ s <- rawLaTeXInline - return $ return $ B.rawInline "tex" s - -- "tex" because it might be context or latex + lookAhead $ try $ char '\\' >> letter + notFollowedBy' rawConTeXtEnvironment + s <- rawLaTeXInline + return $ return $ B.rawInline "tex" s -- "tex" because it might be context -rawConTeXtEnvironment :: Parser [Char] st String +rawConTeXtEnvironment :: PandocMonad m => ParserT [Char] st m String rawConTeXtEnvironment = try $ do string "\\start" completion <- inBrackets (letter <|> digit <|> spaceChar) - <|> (many1 letter) - contents <- manyTill (rawConTeXtEnvironment <|> (count 1 anyChar)) + <|> many1 letter + contents <- manyTill (rawConTeXtEnvironment <|> count 1 anyChar) (try $ string "\\stop" >> string completion) return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion -inBrackets :: (Parser [Char] st Char) -> Parser [Char] st String +inBrackets :: PandocMonad m => ParserT [Char] st m Char -> ParserT [Char] st m String inBrackets parser = do char '[' contents <- many parser char ']' return $ "[" ++ contents ++ "]" -spanHtml :: MarkdownParser (F Inlines) +spanHtml :: PandocMonad m => MarkdownParser m (F Inlines) spanHtml = try $ do guardEnabled Ext_native_spans (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" []) @@ -1911,14 +1942,11 @@ spanHtml = try $ do let ident = fromMaybe "" $ lookup "id" attrs let classes = maybe [] words $ lookup "class" attrs let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - case lookup "style" keyvals of - Just s | null ident && null classes && - map toLower (filter (`notElem` " \t;") s) == - "font-variant:small-caps" - -> return $ B.smallcaps <$> contents - _ -> return $ B.spanWith (ident, classes, keyvals) <$> contents - -divHtml :: MarkdownParser (F Blocks) + return $ if isSmallCaps (ident, classes, keyvals) + then B.smallcaps <$> contents + else B.spanWith (ident, classes, keyvals) <$> contents + +divHtml :: PandocMonad m => MarkdownParser m (F Blocks) divHtml = try $ do guardEnabled Ext_native_divs (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" []) @@ -1940,7 +1968,29 @@ divHtml = try $ do else -- avoid backtracing return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents -rawHtmlInline :: MarkdownParser (F Inlines) +divFenced :: PandocMonad m => MarkdownParser m (F Blocks) +divFenced = try $ do + guardEnabled Ext_fenced_divs + string ":::" + skipMany (char ':') + skipMany spaceChar + attribs <- attributes <|> ((\x -> ("",[x],[])) <$> many1 nonspaceChar) + skipMany spaceChar + skipMany (char ':') + blankline + updateState $ \st -> st{ stateFencedDivLevel = stateFencedDivLevel st + 1 } + bs <- mconcat <$> manyTill block divFenceEnd + updateState $ \st -> st{ stateFencedDivLevel = stateFencedDivLevel st - 1 } + return $ B.divWith attribs <$> bs + +divFenceEnd :: PandocMonad m => MarkdownParser m () +divFenceEnd = try $ do + string ":::" + skipMany (char ':') + blanklines + return () + +rawHtmlInline :: PandocMonad m => MarkdownParser m (F Inlines) rawHtmlInline = do guardEnabled Ext_raw_html inHtmlBlock <- stateInHtmlBlock <$> getState @@ -1962,7 +2012,7 @@ rawHtmlInline = do emojiChars :: [Char] emojiChars = ['a'..'z'] ++ ['0'..'9'] ++ ['_','+','-'] -emoji :: MarkdownParser (F Inlines) +emoji :: PandocMonad m => MarkdownParser m (F Inlines) emoji = try $ do guardEnabled Ext_emoji char ':' @@ -1974,21 +2024,22 @@ emoji = try $ do -- Citations -cite :: MarkdownParser (F Inlines) +cite :: PandocMonad m => MarkdownParser m (F Inlines) cite = do guardEnabled Ext_citations - citations <- textualCite + textualCite <|> do (cs, raw) <- withRaw normalCite - return $ (flip B.cite (B.text raw)) <$> cs - return citations + return $ flip B.cite (B.text raw) <$> cs -textualCite :: MarkdownParser (F Inlines) +textualCite :: PandocMonad m => MarkdownParser m (F Inlines) textualCite = try $ do - (_, key) <- citeKey + (suppressAuthor, key) <- citeKey let first = Citation{ citationId = key , citationPrefix = [] , citationSuffix = [] - , citationMode = AuthorInText + , citationMode = if suppressAuthor + then SuppressAuthor + else AuthorInText , citationNoteNum = 0 , citationHash = 0 } @@ -2003,7 +2054,7 @@ textualCite = try $ do let (spaces',raw') = span isSpace raw spc | null spaces' = mempty | otherwise = B.space - lab <- parseFromString (mconcat <$> many inline) $ dropBrackets raw' + lab <- parseFromString' inlines $ dropBrackets raw' fallback <- referenceLink B.linkWith (lab,raw') return $ do fallback' <- fallback @@ -2017,7 +2068,7 @@ textualCite = try $ do Just n -> B.str (show n) _ -> B.cite [first] $ B.str $ '@':key) -bareloc :: Citation -> MarkdownParser (F [Citation]) +bareloc :: PandocMonad m => Citation -> MarkdownParser m (F [Citation]) bareloc c = try $ do spnl char '[' @@ -2032,7 +2083,7 @@ bareloc c = try $ do rest' <- rest return $ c{ citationSuffix = B.toList suff' } : rest' -normalCite :: MarkdownParser (F [Citation]) +normalCite :: PandocMonad m => MarkdownParser m (F [Citation]) normalCite = try $ do char '[' spnl @@ -2041,7 +2092,7 @@ normalCite = try $ do char ']' return citations -suffix :: MarkdownParser (F Inlines) +suffix :: PandocMonad m => MarkdownParser m (F Inlines) suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) spnl @@ -2050,14 +2101,14 @@ suffix = try $ do then (B.space <>) <$> rest else rest -prefix :: MarkdownParser (F Inlines) +prefix :: PandocMonad m => MarkdownParser m (F Inlines) prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' <|> liftM (const ']') (lookAhead citeKey)) + manyTill inline (char ']' <|> fmap (const ']') (lookAhead citeKey)) -citeList :: MarkdownParser (F [Citation]) +citeList :: PandocMonad m => MarkdownParser m (F [Citation]) citeList = fmap sequence $ sepBy1 citation (try $ char ';' >> spnl) -citation :: MarkdownParser (F Citation) +citation :: PandocMonad m => MarkdownParser m (F Citation) citation = try $ do pref <- prefix (suppress_author, key) <- citeKey @@ -2065,23 +2116,23 @@ citation = try $ do return $ do x <- pref y <- suff - return $ Citation{ citationId = key - , citationPrefix = B.toList x - , citationSuffix = B.toList y - , citationMode = if suppress_author - then SuppressAuthor - else NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } - -smart :: MarkdownParser (F Inlines) + return Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + +smart :: PandocMonad m => MarkdownParser m (F Inlines) smart = do - getOption readerSmart >>= guard + guardEnabled Ext_smart doubleQuoted <|> singleQuoted <|> choice (map (return <$>) [apostrophe, dash, ellipses]) -singleQuoted :: MarkdownParser (F Inlines) +singleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) singleQuoted = try $ do singleQuoteStart withQuoteContext InSingleQuote $ @@ -2091,10 +2142,10 @@ singleQuoted = try $ do -- doubleQuoted will handle regular double-quoted sections, as well -- as dialogues with an open double-quote without a close double-quote -- in the same paragraph. -doubleQuoted :: MarkdownParser (F Inlines) +doubleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) doubleQuoted = try $ do doubleQuoteStart contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) - (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return + withQuoteContext InDoubleQuote (doubleQuoteEnd >> return (fmap B.doubleQuoted . trimInlinesF $ contents)) - <|> (return $ return (B.str "\8220") <> contents) + <|> return (return (B.str "\8220") <> contents) diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index 0dea22c53..c19ef2f46 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -1,7 +1,9 @@ -{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE RelaxedPolyRec #-} +{-# LANGUAGE TypeSynonymInstances #-} -- RelaxedPolyRec needed for inlinesBetween on GHC < 7 {- - Copyright (C) 2012-2015 John MacFarlane <jgm@berkeley.edu> + Copyright (C) 2012-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +22,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.MediaWiki - Copyright : Copyright (C) 2012-2015 John MacFarlane + Copyright : Copyright (C) 2012-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -36,43 +38,50 @@ _ parse templates? -} module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where -import Text.Pandoc.Definition -import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) -import Data.Monoid ((<>)) -import Text.Pandoc.Options -import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isCommentTag ) -import Text.Pandoc.XML ( fromEntities ) -import Text.Pandoc.Parsing hiding ( nested ) -import Text.Pandoc.Walk ( walk ) -import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead, stringify, trim ) import Control.Monad -import Data.List (intersperse, intercalate, isPrefixOf ) -import Text.HTML.TagSoup -import Data.Sequence (viewl, ViewL(..), (<|)) +import Control.Monad.Except (throwError) +import Data.Char (isDigit, isSpace) import qualified Data.Foldable as F +import Data.List (intercalate, intersperse, isPrefixOf) import qualified Data.Map as M +import Data.Maybe (fromMaybe, maybeToList) +import Data.Monoid ((<>)) +import Data.Sequence (ViewL (..), viewl, (<|)) import qualified Data.Set as Set -import Data.Char (isDigit, isSpace) -import Data.Maybe (fromMaybe) -import Text.Printf (printf) -import Debug.Trace (trace) - -import Text.Pandoc.Error +import Data.Text (Text, unpack) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.Definition +import Text.Pandoc.Logging +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (nested) +import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isCommentTag) +import Text.Pandoc.Shared (crFilter, safeRead, stringify, stripTrailingNewlines, + trim) +import Text.Pandoc.Walk (walk) +import Text.Pandoc.XML (fromEntities) -- | Read mediawiki from an input string and return a Pandoc document. -readMediaWiki :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError Pandoc -readMediaWiki opts s = - readWith parseMediaWiki MWState{ mwOptions = opts - , mwMaxNestingLevel = 4 - , mwNextLinkNumber = 1 - , mwCategoryLinks = [] - , mwHeaderMap = M.empty - , mwIdentifierList = Set.empty - } - (s ++ "\n") +readMediaWiki :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readMediaWiki opts s = do + parsed <- readWithM parseMediaWiki MWState{ mwOptions = opts + , mwMaxNestingLevel = 4 + , mwNextLinkNumber = 1 + , mwCategoryLinks = [] + , mwHeaderMap = M.empty + , mwIdentifierList = Set.empty + , mwLogMessages = [] + , mwInTT = False + } + (unpack (crFilter s) ++ "\n") + case parsed of + Right result -> return result + Left e -> throwError e data MWState = MWState { mwOptions :: ReaderOptions , mwMaxNestingLevel :: Int @@ -80,9 +89,11 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwCategoryLinks :: [Inlines] , mwHeaderMap :: M.Map Inlines String , mwIdentifierList :: Set.Set String + , mwLogMessages :: [LogMessage] + , mwInTT :: Bool } -type MWParser = Parser [Char] MWState +type MWParser m = ParserT [Char] MWState m instance HasReaderOptions MWState where extractReaderOptions = mwOptions @@ -95,13 +106,17 @@ instance HasIdentifierList MWState where extractIdentifierList = mwIdentifierList updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st } +instance HasLogMessages MWState where + addLogMessage m s = s{ mwLogMessages = m : mwLogMessages s } + getLogMessages = reverse . mwLogMessages + -- -- auxiliary functions -- -- This is used to prevent exponential blowups for things like: -- ''a'''a''a'''a''a'''a''a'''a -nested :: MWParser a -> MWParser a +nested :: PandocMonad m => MWParser m a -> MWParser m a nested p = do nestlevel <- mwMaxNestingLevel `fmap` getState guard $ nestlevel > 0 @@ -116,7 +131,7 @@ specialChars = "'[]<=&*{}|\":\\" spaceChars :: [Char] spaceChars = " \n\t" -sym :: String -> MWParser () +sym :: PandocMonad m => String -> MWParser m () sym s = () <$ try (string s) newBlockTags :: [String] @@ -131,16 +146,16 @@ isBlockTag' tag = isBlockTag tag isInlineTag' :: Tag String -> Bool isInlineTag' (TagComment _) = True -isInlineTag' t = not (isBlockTag' t) +isInlineTag' t = not (isBlockTag' t) eitherBlockOrInline :: [String] eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", "map", "area", "object"] -htmlComment :: MWParser () +htmlComment :: PandocMonad m => MWParser m () htmlComment = () <$ htmlTag isCommentTag -inlinesInTags :: String -> MWParser Inlines +inlinesInTags :: PandocMonad m => String -> MWParser m Inlines inlinesInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) if '/' `elem` raw -- self-closing tag @@ -148,7 +163,7 @@ inlinesInTags tag = try $ do else trimInlines . mconcat <$> manyTill inline (htmlTag (~== TagClose tag)) -blocksInTags :: String -> MWParser Blocks +blocksInTags :: PandocMonad m => String -> MWParser m Blocks blocksInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) let closer = if tag == "li" @@ -162,7 +177,7 @@ blocksInTags tag = try $ do then return mempty else mconcat <$> manyTill block closer -charsInTags :: String -> MWParser [Char] +charsInTags :: PandocMonad m => String -> MWParser m [Char] charsInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) if '/' `elem` raw -- self-closing tag @@ -173,7 +188,7 @@ charsInTags tag = try $ do -- main parser -- -parseMediaWiki :: MWParser Pandoc +parseMediaWiki :: PandocMonad m => MWParser m Pandoc parseMediaWiki = do bs <- mconcat <$> many block spaces @@ -182,16 +197,15 @@ parseMediaWiki = do let categories = if null categoryLinks then mempty else B.para $ mconcat $ intersperse B.space categoryLinks + reportLogMessages return $ B.doc $ bs <> categories -- -- block parsers -- -block :: MWParser Blocks +block :: PandocMonad m => MWParser m Blocks block = do - tr <- getOption readerTrace - pos <- getPosition res <- mempty <$ skipMany1 blankline <|> table <|> header @@ -204,28 +218,28 @@ block = do <|> blockTag <|> (B.rawBlock "mediawiki" <$> template) <|> para - when tr $ - trace (printf "line %d: %s" (sourceLine pos) - (take 60 $ show $ B.toList res)) (return ()) + trace (take 60 $ show $ B.toList res) return res -para :: MWParser Blocks +para :: PandocMonad m => MWParser m Blocks para = do contents <- trimInlines . mconcat <$> many1 inline if F.all (==Space) contents then return mempty else return $ B.para contents -table :: MWParser Blocks +table :: PandocMonad m => MWParser m Blocks table = do tableStart - styles <- option [] parseAttrs <* blankline + styles <- option [] parseAttrs + skipMany spaceChar + optional blanklines let tableWidth = case lookup "width" styles of Just w -> fromMaybe 1.0 $ parseWidth w Nothing -> 1.0 caption <- option mempty tableCaption optional rowsep - hasheader <- option False $ True <$ (lookAhead (skipSpaces *> char '!')) + hasheader <- option False $ True <$ lookAhead (skipSpaces *> char '!') (cellspecs',hdr) <- unzip <$> tableRow let widths = map ((tableWidth *) . snd) cellspecs' let restwidth = tableWidth - sum widths @@ -244,10 +258,10 @@ table = do else (replicate cols mempty, hdr:rows') return $ B.table caption cellspecs headers rows -parseAttrs :: MWParser [(String,String)] +parseAttrs :: PandocMonad m => MWParser m [(String,String)] parseAttrs = many1 parseAttr -parseAttr :: MWParser (String, String) +parseAttr :: PandocMonad m => MWParser m (String, String) parseAttr = try $ do skipMany spaceChar k <- many1 letter @@ -256,27 +270,23 @@ parseAttr = try $ do <|> many1 (satisfy $ \c -> not (isSpace c) && c /= '|') return (k,v) -tableStart :: MWParser () +tableStart :: PandocMonad m => MWParser m () tableStart = try $ guardColumnOne *> skipSpaces *> sym "{|" -tableEnd :: MWParser () +tableEnd :: PandocMonad m => MWParser m () tableEnd = try $ guardColumnOne *> skipSpaces *> sym "|}" -rowsep :: MWParser () +rowsep :: PandocMonad m => MWParser m () rowsep = try $ guardColumnOne *> skipSpaces *> sym "|-" <* - optional parseAttr <* blanklines - -cellsep :: MWParser () -cellsep = try $ - (guardColumnOne *> skipSpaces <* - ( (char '|' <* notFollowedBy (oneOf "-}+")) - <|> (char '!') - ) - ) - <|> (() <$ try (string "||")) - <|> (() <$ try (string "!!")) - -tableCaption :: MWParser Inlines + many (char '-') <* optional parseAttr <* blanklines + +cellsep :: PandocMonad m => MWParser m () +cellsep = try $ do + skipSpaces + (char '|' *> notFollowedBy (oneOf "-}+") *> optional (char '|')) + <|> (char '!' *> optional (char '!')) + +tableCaption :: PandocMonad m => MWParser m Inlines tableCaption = try $ do guardColumnOne skipSpaces @@ -284,19 +294,21 @@ tableCaption = try $ do optional (try $ parseAttr *> skipSpaces *> char '|' *> skipSpaces) (trimInlines . mconcat) <$> many (notFollowedBy (cellsep <|> rowsep) *> inline) -tableRow :: MWParser [((Alignment, Double), Blocks)] +tableRow :: PandocMonad m => MWParser m [((Alignment, Double), Blocks)] tableRow = try $ skipMany htmlComment *> many tableCell -tableCell :: MWParser ((Alignment, Double), Blocks) +tableCell :: PandocMonad m => MWParser m ((Alignment, Double), Blocks) tableCell = try $ do cellsep skipMany spaceChar attrs <- option [] $ try $ parseAttrs <* skipSpaces <* char '|' <* notFollowedBy (char '|') skipMany spaceChar + pos' <- getPosition ls <- concat <$> many (notFollowedBy (cellsep <|> rowsep <|> tableEnd) *> ((snd <$> withRaw table) <|> count 1 anyChar)) - bs <- parseFromString (mconcat <$> many block) ls + bs <- parseFromString (do setPosition pos' + mconcat <$> many block) ls let align = case lookup "align" attrs of Just "left" -> AlignLeft Just "right" -> AlignRight @@ -311,9 +323,9 @@ parseWidth :: String -> Maybe Double parseWidth s = case reverse s of ('%':ds) | all isDigit ds -> safeRead ('0':'.':reverse ds) - _ -> Nothing + _ -> Nothing -template :: MWParser String +template :: PandocMonad m => MWParser m String template = try $ do string "{{" notFollowedBy (char '{') @@ -322,7 +334,7 @@ template = try $ do contents <- manyTill chunk (try $ string "}}") return $ "{{" ++ concat contents ++ "}}" -blockTag :: MWParser Blocks +blockTag :: PandocMonad m => MWParser m Blocks blockTag = do (tag, _) <- lookAhead $ htmlTag isBlockTag' case tag of @@ -341,27 +353,27 @@ trimCode :: String -> String trimCode ('\n':xs) = stripTrailingNewlines xs trimCode xs = stripTrailingNewlines xs -syntaxhighlight :: String -> [Attribute String] -> MWParser Blocks +syntaxhighlight :: PandocMonad m => String -> [Attribute String] -> MWParser m Blocks syntaxhighlight tag attrs = try $ do let mblang = lookup "lang" attrs let mbstart = lookup "start" attrs let mbline = lookup "line" attrs - let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline + let classes = maybeToList mblang ++ maybe [] (const ["numberLines"]) mbline let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart contents <- charsInTags tag return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents -hrule :: MWParser Blocks +hrule :: PandocMonad m => MWParser m Blocks hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline) -guardColumnOne :: MWParser () +guardColumnOne :: PandocMonad m => MWParser m () guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1) -preformatted :: MWParser Blocks +preformatted :: PandocMonad m => MWParser m Blocks preformatted = try $ do guardColumnOne char ' ' - let endline' = B.linebreak <$ (try $ newline <* char ' ') + let endline' = B.linebreak <$ try (newline <* char ' ') let whitespace' = B.str <$> many1 ('\160' <$ spaceChar) let spToNbsp ' ' = '\160' spToNbsp x = x @@ -370,7 +382,7 @@ preformatted = try $ do (htmlTag (~== TagOpen "nowiki" []) *> manyTill anyChar (htmlTag (~== TagClose "nowiki"))) let inline' = whitespace' <|> endline' <|> nowiki' - <|> (try $ notFollowedBy newline *> inline) + <|> try (notFollowedBy newline *> inline) contents <- mconcat <$> many1 inline' let spacesStr (Str xs) = all isSpace xs spacesStr _ = False @@ -385,10 +397,10 @@ encode = B.fromList . normalizeCode . B.toList . walk strToCode strToCode x = x normalizeCode [] = [] normalizeCode (Code a1 x : Code a2 y : zs) | a1 == a2 = - normalizeCode $ (Code a1 (x ++ y)) : zs + normalizeCode $ Code a1 (x ++ y) : zs normalizeCode (x:xs) = x : normalizeCode xs -header :: MWParser Blocks +header :: PandocMonad m => MWParser m Blocks header = try $ do guardColumnOne eqs <- many1 (char '=') @@ -398,13 +410,13 @@ header = try $ do attr <- registerHeader nullAttr contents return $ B.headerWith attr lev contents -bulletList :: MWParser Blocks +bulletList :: PandocMonad m => MWParser m Blocks bulletList = B.bulletList <$> ( many1 (listItem '*') <|> (htmlTag (~== TagOpen "ul" []) *> spaces *> many (listItem '*' <|> li) <* optional (htmlTag (~== TagClose "ul"))) ) -orderedList :: MWParser Blocks +orderedList :: PandocMonad m => MWParser m Blocks orderedList = (B.orderedList <$> many1 (listItem '#')) <|> try @@ -415,10 +427,10 @@ orderedList = let start = fromMaybe 1 $ safeRead $ fromAttrib "start" tag return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items) -definitionList :: MWParser Blocks +definitionList :: PandocMonad m => MWParser m Blocks definitionList = B.definitionList <$> many1 defListItem -defListItem :: MWParser (Inlines, [Blocks]) +defListItem :: PandocMonad m => MWParser m (Inlines, [Blocks]) defListItem = try $ do terms <- mconcat . intersperse B.linebreak <$> many defListTerm -- we allow dd with no dt, or dt with no dd @@ -429,44 +441,49 @@ defListItem = try $ do else many (listItem ':') return (terms, defs) -defListTerm :: MWParser Inlines -defListTerm = char ';' >> skipMany spaceChar >> anyLine >>= - parseFromString (trimInlines . mconcat <$> many inline) +defListTerm :: PandocMonad m => MWParser m Inlines +defListTerm = do + guardColumnOne + char ';' + skipMany spaceChar + pos' <- getPosition + anyLine >>= parseFromString (do setPosition pos' + trimInlines . mconcat <$> many inline) -listStart :: Char -> MWParser () +listStart :: PandocMonad m => Char -> MWParser m () listStart c = char c *> notFollowedBy listStartChar -listStartChar :: MWParser Char +listStartChar :: PandocMonad m => MWParser m Char listStartChar = oneOf "*#;:" -anyListStart :: MWParser Char -anyListStart = char '*' - <|> char '#' - <|> char ':' - <|> char ';' +anyListStart :: PandocMonad m => MWParser m Char +anyListStart = guardColumnOne >> oneOf "*#:;" -li :: MWParser Blocks +li :: PandocMonad m => MWParser m Blocks li = lookAhead (htmlTag (~== TagOpen "li" [])) *> (firstParaToPlain <$> blocksInTags "li") <* spaces -listItem :: Char -> MWParser Blocks +listItem :: PandocMonad m => Char -> MWParser m Blocks listItem c = try $ do + guardColumnOne extras <- many (try $ char c <* lookAhead listStartChar) if null extras then listItem' c else do skipMany spaceChar + pos' <- getPosition first <- concat <$> manyTill listChunk newline rest <- many (try $ string extras *> lookAhead listStartChar *> (concat <$> manyTill listChunk newline)) - contents <- parseFromString (many1 $ listItem' c) + contents <- parseFromString (do setPosition pos' + many1 $ listItem' c) (unlines (first : rest)) case c of - '*' -> return $ B.bulletList contents - '#' -> return $ B.orderedList contents - ':' -> return $ B.definitionList [(mempty, contents)] - _ -> mzero + '*' -> return $ B.bulletList contents + '#' -> return $ B.orderedList contents + ':' -> return $ B.definitionList [(mempty, contents)] + _ -> mzero -- The point of this is to handle stuff like -- * {{cite book @@ -475,30 +492,32 @@ listItem c = try $ do -- }} -- * next list item -- which seems to be valid mediawiki. -listChunk :: MWParser String +listChunk :: PandocMonad m => MWParser m String listChunk = template <|> count 1 anyChar -listItem' :: Char -> MWParser Blocks +listItem' :: PandocMonad m => Char -> MWParser m Blocks listItem' c = try $ do listStart c skipMany spaceChar + pos' <- getPosition first <- concat <$> manyTill listChunk newline rest <- many (try $ char c *> lookAhead listStartChar *> (concat <$> manyTill listChunk newline)) - parseFromString (firstParaToPlain . mconcat <$> many1 block) + parseFromString (do setPosition pos' + firstParaToPlain . mconcat <$> many1 block) $ unlines $ first : rest firstParaToPlain :: Blocks -> Blocks firstParaToPlain contents = case viewl (B.unMany contents) of - (Para xs) :< ys -> B.Many $ (Plain xs) <| ys - _ -> contents + Para xs :< ys -> B.Many $ Plain xs <| ys + _ -> contents -- -- inline parsers -- -inline :: MWParser Inlines +inline :: PandocMonad m => MWParser m Inlines inline = whitespace <|> url <|> str @@ -516,10 +535,10 @@ inline = whitespace <|> (B.rawInline "mediawiki" <$> template) <|> special -str :: MWParser Inlines +str :: PandocMonad m => MWParser m Inlines str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) -math :: MWParser Inlines +math :: PandocMonad m => MWParser m Inlines math = (B.displayMath . trim <$> try (many1 (char ':') >> charsInTags "math")) <|> (B.math . trim <$> charsInTags "math") <|> (B.displayMath . trim <$> try (dmStart *> manyTill anyChar dmEnd)) @@ -529,13 +548,13 @@ math = (B.displayMath . trim <$> try (many1 (char ':') >> charsInTags "math")) mStart = string "\\(" mEnd = try (string "\\)") -variable :: MWParser String +variable :: PandocMonad m => MWParser m String variable = try $ do string "{{{" contents <- manyTill anyChar (try $ string "}}}") return $ "{{{" ++ contents ++ "}}}" -inlineTag :: MWParser Inlines +inlineTag :: PandocMonad m => MWParser m Inlines inlineTag = do (tag, _) <- lookAhead $ htmlTag isInlineTag' case tag of @@ -553,22 +572,27 @@ inlineTag = do TagOpen "sub" _ -> B.subscript <$> inlinesInTags "sub" TagOpen "sup" _ -> B.superscript <$> inlinesInTags "sup" TagOpen "code" _ -> encode <$> inlinesInTags "code" - TagOpen "tt" _ -> encode <$> inlinesInTags "tt" + TagOpen "tt" _ -> do + inTT <- mwInTT <$> getState + updateState $ \st -> st{ mwInTT = True } + result <- encode <$> inlinesInTags "tt" + updateState $ \st -> st{ mwInTT = inTT } + return result TagOpen "hask" _ -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" _ -> B.rawInline "html" . snd <$> htmlTag (~== tag) -special :: MWParser Inlines +special :: PandocMonad m => MWParser m Inlines special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag') *> oneOf specialChars) -inlineHtml :: MWParser Inlines +inlineHtml :: PandocMonad m => MWParser m Inlines inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag' -whitespace :: MWParser Inlines +whitespace :: PandocMonad m => MWParser m Inlines whitespace = B.space <$ (skipMany1 spaceChar <|> htmlComment) <|> B.softbreak <$ endline -endline :: MWParser () +endline :: PandocMonad m => MWParser m () endline = () <$ try (newline <* notFollowedBy spaceChar <* notFollowedBy newline <* @@ -577,30 +601,30 @@ endline = () <$ try (newline <* notFollowedBy' header <* notFollowedBy anyListStart) -imageIdentifiers :: [MWParser ()] +imageIdentifiers :: PandocMonad m => [MWParser m ()] imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers] where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier", "Bild"] -image :: MWParser Inlines +image :: PandocMonad m => MWParser m Inlines image = try $ do sym "[[" choice imageIdentifiers fname <- addUnderscores <$> many1 (noneOf "|]") _ <- many imageOption - dims <- try (char '|' *> (sepBy (many digit) (char 'x')) <* string "px") + dims <- try (char '|' *> sepBy (many digit) (char 'x') <* string "px") <|> return [] _ <- many imageOption let kvs = case dims of - w:[] -> [("width", w)] - w:(h:[]) -> [("width", w), ("height", h)] - _ -> [] + [w] -> [("width", w)] + [w, h] -> [("width", w), ("height", h)] + _ -> [] let attr = ("", [], kvs) caption <- (B.str fname <$ sym "]]") <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]"))) return $ B.imageWith attr fname ("fig:" ++ stringify caption) caption -imageOption :: MWParser String +imageOption :: PandocMonad m => MWParser m String imageOption = try $ char '|' *> opt where opt = try (oneOfStrings [ "border", "thumbnail", "frameless" @@ -612,14 +636,14 @@ imageOption = try $ char '|' *> opt <|> try (oneOfStrings ["link=","alt=","page=","class="] <* many (noneOf "|]")) collapseUnderscores :: String -> String -collapseUnderscores [] = [] +collapseUnderscores [] = [] collapseUnderscores ('_':'_':xs) = collapseUnderscores ('_':xs) -collapseUnderscores (x:xs) = x : collapseUnderscores xs +collapseUnderscores (x:xs) = x : collapseUnderscores xs addUnderscores :: String -> String addUnderscores = collapseUnderscores . intercalate "_" . words -internalLink :: MWParser Inlines +internalLink :: PandocMonad m => MWParser m Inlines internalLink = try $ do sym "[[" pagename <- unwords . words <$> many (noneOf "|]") @@ -627,7 +651,7 @@ internalLink = try $ do ( (mconcat <$> many1 (notFollowedBy (char ']') *> inline)) -- the "pipe trick" -- [[Help:Contents|] -> "Contents" - <|> (return $ B.text $ drop 1 $ dropWhile (/=':') pagename) ) + <|> return (B.text $ drop 1 $ dropWhile (/=':') pagename) ) sym "]]" linktrail <- B.text <$> many letter let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail) @@ -637,7 +661,7 @@ internalLink = try $ do return mempty else return link -externalLink :: MWParser Inlines +externalLink :: PandocMonad m => MWParser m Inlines externalLink = try $ do char '[' (_, src) <- uri @@ -649,29 +673,33 @@ externalLink = try $ do return $ B.str $ show num return $ B.link src "" lab -url :: MWParser Inlines +url :: PandocMonad m => MWParser m Inlines url = do (orig, src) <- uri return $ B.link src "" (B.str orig) -- | Parses a list of inlines between start and end delimiters. -inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines +inlinesBetween :: (PandocMonad m, Show b) => MWParser m a -> MWParser m b -> MWParser m Inlines inlinesBetween start end = (trimInlines . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) innerSpace = try $ whitespace <* notFollowedBy' end -emph :: MWParser Inlines +emph :: PandocMonad m => MWParser m Inlines emph = B.emph <$> nested (inlinesBetween start end) where start = sym "''" >> lookAhead nonspaceChar end = try $ notFollowedBy' (() <$ strong) >> sym "''" -strong :: MWParser Inlines +strong :: PandocMonad m => MWParser m Inlines strong = B.strong <$> nested (inlinesBetween start end) where start = sym "'''" >> lookAhead nonspaceChar end = try $ sym "'''" -doubleQuotes :: MWParser Inlines -doubleQuotes = B.doubleQuoted <$> nested (inlinesBetween openDoubleQuote closeDoubleQuote) +doubleQuotes :: PandocMonad m => MWParser m Inlines +doubleQuotes = do + guardEnabled Ext_smart + inTT <- mwInTT <$> getState + guard (not inTT) + B.doubleQuoted <$> nested (inlinesBetween openDoubleQuote closeDoubleQuote) where openDoubleQuote = sym "\"" >> lookAhead nonspaceChar closeDoubleQuote = try $ sym "\"" diff --git a/src/Text/Pandoc/Readers/Muse.hs b/src/Text/Pandoc/Readers/Muse.hs new file mode 100644 index 000000000..4a9523e84 --- /dev/null +++ b/src/Text/Pandoc/Readers/Muse.hs @@ -0,0 +1,924 @@ +{-# LANGUAGE FlexibleContexts #-} +{- + Copyright (C) 2017-2018 Alexander Krotov <ilabdsf@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Muse + Copyright : Copyright (C) 2017-2018 Alexander Krotov + License : GNU GPL, version 2 or above + + Maintainer : Alexander Krotov <ilabdsf@gmail.com> + Stability : alpha + Portability : portable + +Conversion of Muse text to 'Pandoc' document. +-} +{- +TODO: +- Page breaks (five "*") +- Org tables +- table.el tables +- Images with attributes (floating and width) +- Citations and <biblio> +- <play> environment +-} +module Text.Pandoc.Readers.Muse (readMuse) where + +import Control.Monad +import Control.Monad.Except (throwError) +import Data.Char (isLetter) +import Data.Default +import Data.List (stripPrefix, intercalate) +import Data.List.Split (splitOn) +import qualified Data.Map as M +import qualified Data.Set as Set +import Data.Maybe (fromMaybe, isNothing) +import Data.Text (Text, unpack) +import System.FilePath (takeExtension) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.Definition +import Text.Pandoc.Logging +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (F) +import Text.Pandoc.Readers.HTML (htmlTag) +import Text.Pandoc.Shared (crFilter, underlineSpan) + +-- | Read Muse from an input string and return a Pandoc document. +readMuse :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readMuse opts s = do + res <- readWithM parseMuse def{ museOptions = opts } (unpack (crFilter s)) + case res of + Left e -> throwError e + Right d -> return d + +type F = Future MuseState + +data MuseState = MuseState { museMeta :: F Meta -- ^ Document metadata + , museOptions :: ReaderOptions + , museHeaders :: M.Map Inlines String -- ^ List of headers and ids (used for implicit ref links) + , museIdentifierList :: Set.Set String + , museLastStrPos :: Maybe SourcePos -- ^ Position after last str parsed + , museLogMessages :: [LogMessage] + , museNotes :: M.Map String (SourcePos, F Blocks) + , museInLink :: Bool + , museInPara :: Bool + } + +instance Default MuseState where + def = defaultMuseState + +defaultMuseState :: MuseState +defaultMuseState = MuseState { museMeta = return nullMeta + , museOptions = def + , museHeaders = M.empty + , museIdentifierList = Set.empty + , museLastStrPos = Nothing + , museLogMessages = [] + , museNotes = M.empty + , museInLink = False + , museInPara = False + } + +type MuseParser = ParserT String MuseState + +instance HasReaderOptions MuseState where + extractReaderOptions = museOptions + +instance HasHeaderMap MuseState where + extractHeaderMap = museHeaders + updateHeaderMap f st = st{ museHeaders = f $ museHeaders st } + +instance HasIdentifierList MuseState where + extractIdentifierList = museIdentifierList + updateIdentifierList f st = st{ museIdentifierList = f $ museIdentifierList st } + +instance HasLastStrPosition MuseState where + setLastStrPos pos st = st{ museLastStrPos = Just pos } + getLastStrPos st = museLastStrPos st + +instance HasLogMessages MuseState where + addLogMessage m s = s{ museLogMessages = m : museLogMessages s } + getLogMessages = reverse . museLogMessages + +-- +-- main parser +-- + +parseMuse :: PandocMonad m => MuseParser m Pandoc +parseMuse = do + many directive + blocks <- parseBlocks + st <- getState + let doc = runF (do Pandoc _ bs <- B.doc <$> blocks + meta <- museMeta st + return $ Pandoc meta bs) st + reportLogMessages + return doc + +-- +-- utility functions +-- + +eol :: Stream s m Char => ParserT s st m () +eol = void newline <|> eof + +htmlElement :: PandocMonad m => String -> MuseParser m (Attr, String) +htmlElement tag = try $ do + (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag []) + content <- manyTill anyChar endtag + return (htmlAttrToPandoc attr, content) + where + endtag = void $ htmlTag (~== TagClose tag) + +htmlBlock :: PandocMonad m => String -> MuseParser m (Attr, String) +htmlBlock tag = try $ do + many spaceChar + res <- htmlElement tag + manyTill spaceChar eol + return res + +htmlAttrToPandoc :: [Attribute String] -> Attr +htmlAttrToPandoc attrs = (ident, classes, keyvals) + where + ident = fromMaybe "" $ lookup "id" attrs + classes = maybe [] words $ lookup "class" attrs + keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + +parseHtmlContent :: PandocMonad m + => String -> MuseParser m (Attr, F Blocks) +parseHtmlContent tag = try $ do + many spaceChar + (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag []) + manyTill spaceChar eol + content <- parseBlocksTill (try $ manyTill spaceChar endtag) + manyTill spaceChar eol -- closing tag must be followed by optional whitespace and newline + return (htmlAttrToPandoc attr, content) + where + endtag = void $ htmlTag (~== TagClose tag) + +commonPrefix :: String -> String -> String +commonPrefix _ [] = [] +commonPrefix [] _ = [] +commonPrefix (x:xs) (y:ys) + | x == y = x : commonPrefix xs ys + | otherwise = [] + +atStart :: PandocMonad m => MuseParser m a -> MuseParser m a +atStart p = do + pos <- getPosition + st <- getState + guard $ museLastStrPos st /= Just pos + p + +someUntil :: (Stream s m t) + => ParserT s u m a + -> ParserT s u m b + -> ParserT s u m ([a], b) +someUntil p end = do + first <- p + (rest, e) <- manyUntil p end + return (first:rest, e) + +-- +-- directive parsers +-- + +-- While not documented, Emacs Muse allows "-" in directive name +parseDirectiveKey :: PandocMonad m => MuseParser m String +parseDirectiveKey = do + char '#' + many (letter <|> char '-') + +parseEmacsDirective :: PandocMonad m => MuseParser m (String, F Inlines) +parseEmacsDirective = do + key <- parseDirectiveKey + spaceChar + value <- trimInlinesF . mconcat <$> manyTill (choice inlineList) eol + return (key, value) + +parseAmuseDirective :: PandocMonad m => MuseParser m (String, F Inlines) +parseAmuseDirective = do + key <- parseDirectiveKey + many1 spaceChar + value <- trimInlinesF . mconcat <$> many1Till inline endOfDirective + many blankline + return (key, value) + where + endOfDirective = lookAhead $ eof <|> try (newline >> (void blankline <|> void parseDirectiveKey)) + +directive :: PandocMonad m => MuseParser m () +directive = do + ext <- getOption readerExtensions + (key, value) <- if extensionEnabled Ext_amuse ext then parseAmuseDirective else parseEmacsDirective + updateState $ \st -> st { museMeta = B.setMeta (translateKey key) <$> value <*> museMeta st } + where translateKey "cover" = "cover-image" + translateKey x = x + +-- +-- block parsers +-- + +parseBlocks :: PandocMonad m + => MuseParser m (F Blocks) +parseBlocks = + try parseEnd <|> + try blockStart <|> + try listStart <|> + try paraStart + where + parseEnd = mempty <$ eof + blockStart = do first <- header <|> blockElements <|> emacsNoteBlock + rest <- parseBlocks + return $ first B.<> rest + listStart = do + updateState (\st -> st { museInPara = False }) + (first, rest) <- anyListUntil parseBlocks <|> amuseNoteBlockUntil parseBlocks + return $ first B.<> rest + paraStart = do + indent <- length <$> many spaceChar + (first, rest) <- paraUntil parseBlocks + let first' = if indent >= 2 && indent < 6 then B.blockQuote <$> first else first + return $ first' B.<> rest + +parseBlocksTill :: PandocMonad m + => MuseParser m a + -> MuseParser m (F Blocks) +parseBlocksTill end = + try parseEnd <|> + try blockStart <|> + try listStart <|> + try paraStart + where + parseEnd = mempty <$ end + blockStart = do first <- blockElements + rest <- continuation + return $ first B.<> rest + listStart = do + updateState (\st -> st { museInPara = False }) + (first, e) <- anyListUntil ((Left <$> end) <|> (Right <$> continuation)) + case e of + Left _ -> return first + Right rest -> return $ first B.<> rest + paraStart = do (first, e) <- paraUntil ((Left <$> end) <|> (Right <$> continuation)) + case e of + Left _ -> return first + Right rest -> return $ first B.<> rest + continuation = parseBlocksTill end + +listItemContentsUntil :: PandocMonad m + => Int + -> MuseParser m a + -> MuseParser m a + -> MuseParser m (F Blocks, a) +listItemContentsUntil col pre end = + try blockStart <|> + try listStart <|> + try paraStart + where + parsePre = do e <- pre + return (mempty, e) + parseEnd = do e <- end + return (mempty, e) + paraStart = do + (first, e) <- paraUntil ((Left <$> pre) <|> (Right <$> continuation) <|> (Left <$> end)) + case e of + Left ee -> return (first, ee) + Right (rest, ee) -> return (first B.<> rest, ee) + blockStart = do first <- blockElements + (rest, e) <- parsePre <|> continuation <|> parseEnd + return (first B.<> rest, e) + listStart = do + updateState (\st -> st { museInPara = False }) + (first, e) <- anyListUntil ((Left <$> pre) <|> (Right <$> continuation) <|> (Left <$> end)) + case e of + Left ee -> return (first, ee) + Right (rest, ee) -> return (first B.<> rest, ee) + continuation = try $ do blank <- optionMaybe blankline + skipMany blankline + indentWith col + updateState (\st -> st { museInPara = museInPara st && isNothing blank }) + listItemContentsUntil col pre end + +parseBlock :: PandocMonad m => MuseParser m (F Blocks) +parseBlock = do + res <- blockElements <|> para + trace (take 60 $ show $ B.toList $ runF res def) + return res + where para = fst <$> paraUntil (try (eof <|> void (lookAhead blockElements))) + +blockElements :: PandocMonad m => MuseParser m (F Blocks) +blockElements = do + updateState (\st -> st { museInPara = False }) + choice [ mempty <$ blankline + , comment + , separator + , example + , exampleTag + , literalTag + , centerTag + , rightTag + , quoteTag + , divTag + , verseTag + , lineBlock + , table + , commentTag + ] + +comment :: PandocMonad m => MuseParser m (F Blocks) +comment = try $ do + char ';' + optional (spaceChar >> many (noneOf "\n")) + eol + return mempty + +separator :: PandocMonad m => MuseParser m (F Blocks) +separator = try $ do + string "----" + many $ char '-' + many spaceChar + eol + return $ return B.horizontalRule + +header :: PandocMonad m => MuseParser m (F Blocks) +header = try $ do + getPosition >>= \pos -> guard (sourceColumn pos == 1) + level <- fmap length $ many1 $ char '*' + guard $ level <= 5 + spaceChar + content <- trimInlinesF . mconcat <$> manyTill inline eol + anchorId <- option "" parseAnchor + attr <- registerHeader (anchorId, [], []) (runF content def) + return $ B.headerWith attr level <$> content + +example :: PandocMonad m => MuseParser m (F Blocks) +example = try $ do + string "{{{" + optional blankline + contents <- manyTill anyChar $ try (optional blankline >> string "}}}") + return $ return $ B.codeBlock contents + +-- Trim up to one newline from the beginning of the string. +lchop :: String -> String +lchop s = case s of + '\n':ss -> ss + _ -> s + +-- Trim up to one newline from the end of the string. +rchop :: String -> String +rchop = reverse . lchop . reverse + +dropSpacePrefix :: [String] -> [String] +dropSpacePrefix lns = + map (drop maxIndent) lns + where flns = filter (not . all (== ' ')) lns + maxIndent = if null flns then maximum (map length lns) else length $ takeWhile (== ' ') $ foldl1 commonPrefix flns + +exampleTag :: PandocMonad m => MuseParser m (F Blocks) +exampleTag = try $ do + (attr, contents) <- htmlBlock "example" + return $ return $ B.codeBlockWith attr $ rchop $ intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop contents + +literalTag :: PandocMonad m => MuseParser m (F Blocks) +literalTag = + (return . rawBlock) <$> htmlBlock "literal" + where + -- FIXME: Emacs Muse inserts <literal> without style into all output formats, but we assume HTML + format (_, _, kvs) = fromMaybe "html" $ lookup "style" kvs + rawBlock (attrs, content) = B.rawBlock (format attrs) $ rchop $ intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop content + +-- <center> tag is ignored +centerTag :: PandocMonad m => MuseParser m (F Blocks) +centerTag = snd <$> parseHtmlContent "center" + +-- <right> tag is ignored +rightTag :: PandocMonad m => MuseParser m (F Blocks) +rightTag = snd <$> parseHtmlContent "right" + +quoteTag :: PandocMonad m => MuseParser m (F Blocks) +quoteTag = fmap B.blockQuote . snd <$> parseHtmlContent "quote" + +-- <div> tag is supported by Emacs Muse, but not Amusewiki 2.025 +divTag :: PandocMonad m => MuseParser m (F Blocks) +divTag = do + (attrs, content) <- parseHtmlContent "div" + return $ B.divWith attrs <$> content + +verseLine :: PandocMonad m => MuseParser m (F Inlines) +verseLine = do + indent <- (B.str <$> many1 (char ' ' >> pure '\160')) <|> pure mempty + rest <- manyTill (choice inlineList) newline + return $ trimInlinesF $ mconcat (pure indent : rest) + +verseLines :: PandocMonad m => MuseParser m (F Blocks) +verseLines = do + lns <- many verseLine + return $ B.lineBlock <$> sequence lns + +verseTag :: PandocMonad m => MuseParser m (F Blocks) +verseTag = do + (_, content) <- htmlBlock "verse" + parseFromString verseLines (intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop content) + +commentTag :: PandocMonad m => MuseParser m (F Blocks) +commentTag = htmlBlock "comment" >> return mempty + +-- Indented paragraph is either center, right or quote +paraUntil :: PandocMonad m + => MuseParser m a + -> MuseParser m (F Blocks, a) +paraUntil end = do + state <- getState + guard $ not $ museInPara state + setState $ state{ museInPara = True } + (l, e) <- someUntil inline $ try (manyTill spaceChar eol >> end) + updateState (\st -> st { museInPara = False }) + return (fmap B.para $ trimInlinesF $ mconcat l, e) + +noteMarker :: PandocMonad m => MuseParser m String +noteMarker = try $ do + char '[' + first <- oneOf "123456789" + rest <- manyTill digit (char ']') + return $ first:rest + +-- Amusewiki version of note +-- Parsing is similar to list item, except that note marker is used instead of list marker +amuseNoteBlockUntil :: PandocMonad m + => MuseParser m a + -> MuseParser m (F Blocks, a) +amuseNoteBlockUntil end = try $ do + guardEnabled Ext_amuse + ref <- noteMarker <* spaceChar + pos <- getPosition + updateState (\st -> st { museInPara = False }) + (content, e) <- listItemContentsUntil (sourceColumn pos - 1) (fail "x") end + oldnotes <- museNotes <$> getState + case M.lookup ref oldnotes of + Just _ -> logMessage $ DuplicateNoteReference ref pos + Nothing -> return () + updateState $ \s -> s{ museNotes = M.insert ref (pos, content) oldnotes } + return (mempty, e) + +-- Emacs version of note +-- Notes are allowed only at the end of text, no indentation is required. +emacsNoteBlock :: PandocMonad m => MuseParser m (F Blocks) +emacsNoteBlock = try $ do + guardDisabled Ext_amuse + pos <- getPosition + ref <- noteMarker <* skipSpaces + content <- mconcat <$> blocksTillNote + oldnotes <- museNotes <$> getState + case M.lookup ref oldnotes of + Just _ -> logMessage $ DuplicateNoteReference ref pos + Nothing -> return () + updateState $ \s -> s{ museNotes = M.insert ref (pos, content) oldnotes } + return mempty + where + blocksTillNote = + many1Till parseBlock (eof <|> () <$ lookAhead noteMarker) + +-- +-- Verse markup +-- + +lineVerseLine :: PandocMonad m => MuseParser m (F Inlines) +lineVerseLine = try $ do + string "> " + indent <- B.str <$> many (char ' ' >> pure '\160') + rest <- manyTill (choice inlineList) eol + return $ trimInlinesF $ mconcat (pure indent : rest) + +blanklineVerseLine :: PandocMonad m => MuseParser m (F Inlines) +blanklineVerseLine = try $ do + char '>' + blankline + pure mempty + +lineBlock :: PandocMonad m => MuseParser m (F Blocks) +lineBlock = try $ do + col <- sourceColumn <$> getPosition + lns <- (blanklineVerseLine <|> lineVerseLine) `sepBy1'` try (indentWith (col - 1)) + return $ B.lineBlock <$> sequence lns + +-- +-- lists +-- + +bulletListItemsUntil :: PandocMonad m + => Int + -> MuseParser m a + -> MuseParser m ([F Blocks], a) +bulletListItemsUntil indent end = try $ do + char '-' + void spaceChar <|> lookAhead eol + updateState (\st -> st { museInPara = False }) + (x, e) <- listItemContentsUntil (indent + 2) (Right <$> try (optional blankline >> indentWith indent >> bulletListItemsUntil indent end)) (Left <$> end) + case e of + Left ee -> return ([x], ee) + Right (xs, ee) -> return (x:xs, ee) + +bulletListUntil :: PandocMonad m + => MuseParser m a + -> MuseParser m (F Blocks, a) +bulletListUntil end = try $ do + many spaceChar + pos <- getPosition + let indent = sourceColumn pos - 1 + guard $ indent /= 0 + (items, e) <- bulletListItemsUntil indent end + return (B.bulletList <$> sequence items, e) + +-- | Parses an ordered list marker and returns list attributes. +anyMuseOrderedListMarker :: PandocMonad m => MuseParser m ListAttributes +anyMuseOrderedListMarker = do + (style, start) <- decimal <|> lowerRoman <|> upperRoman <|> lowerAlpha <|> upperAlpha + char '.' + return (start, style, Period) + +museOrderedListMarker :: PandocMonad m + => ListNumberStyle + -> MuseParser m Int +museOrderedListMarker style = do + (_, start) <- case style of + Decimal -> decimal + UpperRoman -> upperRoman + LowerRoman -> lowerRoman + UpperAlpha -> upperAlpha + LowerAlpha -> lowerAlpha + _ -> fail "Unhandled case" + char '.' + return start + +orderedListItemsUntil :: PandocMonad m + => Int + -> ListNumberStyle + -> MuseParser m a + -> MuseParser m ([F Blocks], a) +orderedListItemsUntil indent style end = + continuation + where + continuation = try $ do + pos <- getPosition + void spaceChar <|> lookAhead eol + updateState (\st -> st { museInPara = False }) + (x, e) <- listItemContentsUntil (sourceColumn pos) (Right <$> try (optionMaybe blankline >> indentWith indent >> museOrderedListMarker style >> continuation)) (Left <$> end) + case e of + Left ee -> return ([x], ee) + Right (xs, ee) -> return (x:xs, ee) + +orderedListUntil :: PandocMonad m + => MuseParser m a + -> MuseParser m (F Blocks, a) +orderedListUntil end = try $ do + many spaceChar + pos <- getPosition + let indent = sourceColumn pos - 1 + guard $ indent /= 0 + p@(_, style, _) <- anyMuseOrderedListMarker + guard $ style `elem` [Decimal, LowerAlpha, UpperAlpha, LowerRoman, UpperRoman] + (items, e) <- orderedListItemsUntil indent style end + return (B.orderedListWith p <$> sequence items, e) + +descriptionsUntil :: PandocMonad m + => Int + -> MuseParser m a + -> MuseParser m ([F Blocks], a) +descriptionsUntil indent end = do + void spaceChar <|> lookAhead eol + updateState (\st -> st { museInPara = False }) + (x, e) <- listItemContentsUntil indent (Right <$> try (optional blankline >> indentWith indent >> manyTill spaceChar (string "::") >> descriptionsUntil indent end)) (Left <$> end) + case e of + Right (xs, ee) -> return (x:xs, ee) + Left ee -> return ([x], ee) + +definitionListItemsUntil :: PandocMonad m + => Int + -> MuseParser m a + -> MuseParser m ([F (Inlines, [Blocks])], a) +definitionListItemsUntil indent end = + continuation + where + continuation = try $ do + pos <- getPosition + term <- trimInlinesF . mconcat <$> manyTill (choice inlineList) (try $ string "::") + (x, e) <- descriptionsUntil (sourceColumn pos) ((Right <$> try (optional blankline >> indentWith indent >> continuation)) <|> (Left <$> end)) + let xx = do + term' <- term + x' <- sequence x + return (term', x') + case e of + Left ee -> return ([xx], ee) + Right (xs, ee) -> return (xx:xs, ee) + +definitionListUntil :: PandocMonad m + => MuseParser m a + -> MuseParser m (F Blocks, a) +definitionListUntil end = try $ do + many spaceChar + pos <- getPosition + let indent = sourceColumn pos - 1 + guardDisabled Ext_amuse <|> guard (indent /= 0) -- Initial space is required by Amusewiki, but not Emacs Muse + (items, e) <- definitionListItemsUntil indent end + return (B.definitionList <$> sequence items, e) + +anyListUntil :: PandocMonad m + => MuseParser m a + -> MuseParser m (F Blocks, a) +anyListUntil end = + bulletListUntil end <|> orderedListUntil end <|> definitionListUntil end + +-- +-- tables +-- + +data MuseTable = MuseTable + { museTableCaption :: Inlines + , museTableHeaders :: [[Blocks]] + , museTableRows :: [[Blocks]] + , museTableFooters :: [[Blocks]] + } + +data MuseTableElement = MuseHeaderRow (F [Blocks]) + | MuseBodyRow (F [Blocks]) + | MuseFooterRow (F [Blocks]) + | MuseCaption (F Inlines) + +museToPandocTable :: MuseTable -> Blocks +museToPandocTable (MuseTable caption headers body footers) = + B.table caption attrs headRow rows + where ncol = maximum (0 : map length (headers ++ body ++ footers)) + attrs = replicate ncol (AlignDefault, 0.0) + headRow = if null headers then [] else head headers + rows = (if null headers then [] else tail headers) ++ body ++ footers + +museAppendElement :: MuseTable + -> MuseTableElement + -> F MuseTable +museAppendElement tbl element = + case element of + MuseHeaderRow row -> do + row' <- row + return tbl{ museTableHeaders = museTableHeaders tbl ++ [row'] } + MuseBodyRow row -> do + row' <- row + return tbl{ museTableRows = museTableRows tbl ++ [row'] } + MuseFooterRow row-> do + row' <- row + return tbl{ museTableFooters = museTableFooters tbl ++ [row'] } + MuseCaption inlines -> do + inlines' <- inlines + return tbl{ museTableCaption = inlines' } + +tableCell :: PandocMonad m => MuseParser m (F Blocks) +tableCell = try $ fmap B.plain . trimInlinesF . mconcat <$> manyTill inline (lookAhead cellEnd) + where cellEnd = try $ void (many1 spaceChar >> char '|') <|> eol + +tableElements :: PandocMonad m => MuseParser m [MuseTableElement] +tableElements = tableParseElement `sepEndBy1` eol + +elementsToTable :: [MuseTableElement] -> F MuseTable +elementsToTable = foldM museAppendElement emptyTable + where emptyTable = MuseTable mempty mempty mempty mempty + +table :: PandocMonad m => MuseParser m (F Blocks) +table = try $ do + rows <- tableElements + let tbl = elementsToTable rows + let pandocTbl = museToPandocTable <$> tbl :: F Blocks + return pandocTbl + +tableParseElement :: PandocMonad m => MuseParser m MuseTableElement +tableParseElement = tableParseHeader + <|> tableParseBody + <|> tableParseFooter + <|> tableParseCaption + +tableParseRow :: PandocMonad m => Int -> MuseParser m (F [Blocks]) +tableParseRow n = try $ do + fields <- tableCell `sepBy2` fieldSep + return $ sequence fields + where p `sepBy2` sep = (:) <$> p <*> many1 (sep >> p) + fieldSep = many1 spaceChar >> count n (char '|') >> (void (many1 spaceChar) <|> void (lookAhead newline)) + +tableParseHeader :: PandocMonad m => MuseParser m MuseTableElement +tableParseHeader = MuseHeaderRow <$> tableParseRow 2 + +tableParseBody :: PandocMonad m => MuseParser m MuseTableElement +tableParseBody = MuseBodyRow <$> tableParseRow 1 + +tableParseFooter :: PandocMonad m => MuseParser m MuseTableElement +tableParseFooter = MuseFooterRow <$> tableParseRow 3 + +tableParseCaption :: PandocMonad m => MuseParser m MuseTableElement +tableParseCaption = try $ do + many spaceChar + string "|+" + MuseCaption <$> (trimInlinesF . mconcat <$> many1Till inline (string "+|")) + +-- +-- inline parsers +-- + +inlineList :: PandocMonad m => [MuseParser m (F Inlines)] +inlineList = [ whitespace + , br + , anchor + , footnote + , strong + , strongTag + , emph + , emphTag + , underlined + , superscriptTag + , subscriptTag + , strikeoutTag + , verbatimTag + , nbsp + , link + , code + , codeTag + , inlineLiteralTag + , str + , symbol + ] + +inline :: PandocMonad m => MuseParser m (F Inlines) +inline = endline <|> choice inlineList <?> "inline" + +endline :: PandocMonad m => MuseParser m (F Inlines) +endline = try $ do + newline + notFollowedBy blankline + returnF B.softbreak + +parseAnchor :: PandocMonad m => MuseParser m String +parseAnchor = try $ do + getPosition >>= \pos -> guard (sourceColumn pos == 1) + char '#' + first <- letter + rest <- many (letter <|> digit) + skipMany spaceChar <|> void newline + return $ first:rest + +anchor :: PandocMonad m => MuseParser m (F Inlines) +anchor = try $ do + anchorId <- parseAnchor + return $ return $ B.spanWith (anchorId, [], []) mempty + +footnote :: PandocMonad m => MuseParser m (F Inlines) +footnote = try $ do + ref <- noteMarker + return $ do + notes <- asksF museNotes + case M.lookup ref notes of + Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Just (_pos, contents) -> do + st <- askF + let contents' = runF contents st { museNotes = M.empty } + return $ B.note contents' + +whitespace :: PandocMonad m => MuseParser m (F Inlines) +whitespace = try $ do + skipMany1 spaceChar + return $ return B.space + +br :: PandocMonad m => MuseParser m (F Inlines) +br = try $ do + string "<br>" + return $ return B.linebreak + +emphasisBetween :: (PandocMonad m, Show a) => MuseParser m a -> MuseParser m (F Inlines) +emphasisBetween c = try $ enclosedInlines c c + +enclosedInlines :: (PandocMonad m, Show a, Show b) + => MuseParser m a + -> MuseParser m b + -> MuseParser m (F Inlines) +enclosedInlines start end = try $ + trimInlinesF . mconcat <$> (enclosed (atStart start) end inline <* notFollowedBy (satisfy isLetter)) + +inlineTag :: PandocMonad m + => (Inlines -> Inlines) + -> String + -> MuseParser m (F Inlines) +inlineTag f tag = try $ do + htmlTag (~== TagOpen tag []) + res <- manyTill inline (void $ htmlTag (~== TagClose tag)) + return $ f <$> mconcat res + +strongTag :: PandocMonad m => MuseParser m (F Inlines) +strongTag = inlineTag B.strong "strong" + +strong :: PandocMonad m => MuseParser m (F Inlines) +strong = fmap B.strong <$> emphasisBetween (string "**") + +emph :: PandocMonad m => MuseParser m (F Inlines) +emph = fmap B.emph <$> emphasisBetween (char '*') + +underlined :: PandocMonad m => MuseParser m (F Inlines) +underlined = do + guardDisabled Ext_amuse -- Supported only by Emacs Muse + fmap underlineSpan <$> emphasisBetween (char '_') + +emphTag :: PandocMonad m => MuseParser m (F Inlines) +emphTag = inlineTag B.emph "em" + +superscriptTag :: PandocMonad m => MuseParser m (F Inlines) +superscriptTag = inlineTag B.superscript "sup" + +subscriptTag :: PandocMonad m => MuseParser m (F Inlines) +subscriptTag = inlineTag B.subscript "sub" + +strikeoutTag :: PandocMonad m => MuseParser m (F Inlines) +strikeoutTag = inlineTag B.strikeout "del" + +verbatimTag :: PandocMonad m => MuseParser m (F Inlines) +verbatimTag = return . B.text . snd <$> htmlElement "verbatim" + +nbsp :: PandocMonad m => MuseParser m (F Inlines) +nbsp = try $ do + string "~~" + return $ return $ B.str "\160" + +code :: PandocMonad m => MuseParser m (F Inlines) +code = try $ do + atStart $ char '=' + contents <- many1Till (noneOf "\n\r" <|> (newline <* notFollowedBy newline)) $ char '=' + guard $ not $ null contents + guard $ head contents `notElem` " \t\n" + guard $ last contents `notElem` " \t\n" + notFollowedBy $ satisfy isLetter + return $ return $ B.code contents + +codeTag :: PandocMonad m => MuseParser m (F Inlines) +codeTag = do + (attrs, content) <- htmlElement "code" + return $ return $ B.codeWith attrs content + +inlineLiteralTag :: PandocMonad m => MuseParser m (F Inlines) +inlineLiteralTag = + (return . rawInline) <$> htmlElement "literal" + where + -- FIXME: Emacs Muse inserts <literal> without style into all output formats, but we assume HTML + format (_, _, kvs) = fromMaybe "html" $ lookup "style" kvs + rawInline (attrs, content) = B.rawInline (format attrs) content + +str :: PandocMonad m => MuseParser m (F Inlines) +str = do + result <- many1 alphaNum + updateLastStrPos + return $ return $ B.str result + +symbol :: PandocMonad m => MuseParser m (F Inlines) +symbol = return . B.str <$> count 1 nonspaceChar + +link :: PandocMonad m => MuseParser m (F Inlines) +link = try $ do + st <- getState + guard $ not $ museInLink st + setState $ st{ museInLink = True } + (url, title, content) <- linkText + updateState (\state -> state { museInLink = False }) + return $ case stripPrefix "URL:" url of + Nothing -> if isImageUrl url + then B.image url title <$> fromMaybe (return mempty) content + else B.link url title <$> fromMaybe (return $ B.str url) content + Just url' -> B.link url' title <$> fromMaybe (return $ B.str url') content + where -- Taken from muse-image-regexp defined in Emacs Muse file lisp/muse-regexps.el + imageExtensions = [".eps", ".gif", ".jpg", ".jpeg", ".pbm", ".png", ".tiff", ".xbm", ".xpm"] + isImageUrl = (`elem` imageExtensions) . takeExtension + +linkContent :: PandocMonad m => MuseParser m (F Inlines) +linkContent = do + char '[' + trimInlinesF . mconcat <$> many1Till inline (string "]") + +linkText :: PandocMonad m => MuseParser m (String, String, Maybe (F Inlines)) +linkText = do + string "[[" + url <- many1Till anyChar $ char ']' + content <- optionMaybe linkContent + char ']' + return (url, "", content) diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs index 4ec164e19..88f6bfe8f 100644 --- a/src/Text/Pandoc/Readers/Native.hs +++ b/src/Text/Pandoc/Readers/Native.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2011-2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2011-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Native - Copyright : Copyright (C) 2011-2015 John MacFarlane + Copyright : Copyright (C) 2011-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -31,8 +31,12 @@ Conversion of a string representation of a pandoc type (@Pandoc@, module Text.Pandoc.Readers.Native ( readNative ) where import Text.Pandoc.Definition +import Text.Pandoc.Options (ReaderOptions) import Text.Pandoc.Shared (safeRead) +import Control.Monad.Except (throwError) +import Data.Text (Text, unpack) +import Text.Pandoc.Class import Text.Pandoc.Error -- | Read native formatted text and return a Pandoc document. @@ -45,19 +49,23 @@ import Text.Pandoc.Error -- -- > Pandoc nullMeta [Plain [Str "hi"]] -- -readNative :: String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError Pandoc -readNative s = maybe (Pandoc nullMeta <$> readBlocks s) Right (safeRead s) +readNative :: PandocMonad m + => ReaderOptions + -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readNative _ s = + case maybe (Pandoc nullMeta <$> readBlocks s) Right (safeRead (unpack s)) of + Right doc -> return doc + Left _ -> throwError $ PandocParseError "couldn't read native" -readBlocks :: String -> Either PandocError [Block] -readBlocks s = maybe ((:[]) <$> readBlock s) Right (safeRead s) +readBlocks :: Text -> Either PandocError [Block] +readBlocks s = maybe ((:[]) <$> readBlock s) Right (safeRead (unpack s)) -readBlock :: String -> Either PandocError Block -readBlock s = maybe (Plain <$> readInlines s) Right (safeRead s) +readBlock :: Text -> Either PandocError Block +readBlock s = maybe (Plain <$> readInlines s) Right (safeRead (unpack s)) -readInlines :: String -> Either PandocError [Inline] -readInlines s = maybe ((:[]) <$> readInline s) Right (safeRead s) - -readInline :: String -> Either PandocError Inline -readInline s = maybe (Left . ParseFailure $ "Could not read: " ++ s) Right (safeRead s) +readInlines :: Text -> Either PandocError [Inline] +readInlines s = maybe ((:[]) <$> readInline s) Right (safeRead (unpack s)) +readInline :: Text -> Either PandocError Inline +readInline s = maybe (Left . PandocParseError $ "Could not read: " ++ unpack s) Right (safeRead (unpack s)) diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index 4dcf5e5a0..82266748f 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -1,26 +1,28 @@ {-# LANGUAGE FlexibleContexts #-} module Text.Pandoc.Readers.OPML ( readOPML ) where +import Control.Monad.State.Strict import Data.Char (toUpper) -import Text.Pandoc.Options -import Text.Pandoc.Definition +import Data.Default +import Data.Generics +import Data.Maybe (fromMaybe) +import Data.Text (Text, pack, unpack) +import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Builder +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Options import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) +import Text.Pandoc.Shared (crFilter, blocksToInlines') import Text.XML.Light -import Text.HTML.TagSoup.Entity (lookupEntity) -import Data.Generics -import Control.Monad.State -import Data.Default -import Control.Monad.Except -import Text.Pandoc.Error -type OPML = ExceptT PandocError (State OPMLState) +type OPML m = StateT OPMLState m data OPMLState = OPMLState{ opmlSectionLevel :: Int , opmlDocTitle :: Inlines , opmlDocAuthors :: [Inlines] , opmlDocDate :: Inlines + , opmlOptions :: ReaderOptions } deriving Show instance Default OPMLState where @@ -28,14 +30,19 @@ instance Default OPMLState where , opmlDocTitle = mempty , opmlDocAuthors = [] , opmlDocDate = mempty - } + , opmlOptions = def + } -readOPML :: ReaderOptions -> String -> Either PandocError Pandoc -readOPML _ inp = setTitle (opmlDocTitle st') - . setAuthors (opmlDocAuthors st') - . setDate (opmlDocDate st') - . doc . mconcat <$> bs - where (bs, st') = flip runState def . runExceptT $ (mapM parseBlock $ normalizeTree $ parseXML inp) +readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readOPML opts inp = do + (bs, st') <- runStateT + (mapM parseBlock $ normalizeTree $ + parseXML (unpack (crFilter inp))) def{ opmlOptions = opts } + return $ + setTitle (opmlDocTitle st') $ + setAuthors (opmlDocAuthors st') $ + setDate (opmlDocDate st') $ + doc $ mconcat bs -- normalize input, consolidating adjacent Text and CRef elements normalizeTree :: [Content] -> [Content] @@ -53,30 +60,32 @@ normalizeTree = everywhere (mkT go) go xs = xs convertEntity :: String -> String -convertEntity e = maybe (map toUpper e) id (lookupEntity e) +convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) -- convenience function to get an attribute value, defaulting to "" attrValue :: String -> Element -> String attrValue attr elt = - case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of - Just z -> z - Nothing -> "" + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -exceptT :: Either PandocError a -> OPML a -exceptT = either throwError return +-- exceptT :: PandocMonad m => Either PandocError a -> OPML m a +-- exceptT = either throwError return -asHtml :: String -> OPML Inlines -asHtml s = (\(Pandoc _ bs) -> case bs of - [Plain ils] -> fromList ils - _ -> mempty) <$> exceptT (readHtml def s) +asHtml :: PandocMonad m => String -> OPML m Inlines +asHtml s = do + opts <- gets opmlOptions + Pandoc _ bs <- readHtml def{ readerExtensions = readerExtensions opts } (pack s) + return $ blocksToInlines' bs -asMarkdown :: String -> OPML Blocks -asMarkdown s = (\(Pandoc _ bs) -> fromList bs) <$> exceptT (readMarkdown def s) +asMarkdown :: PandocMonad m => String -> OPML m Blocks +asMarkdown s = do + opts <- gets opmlOptions + Pandoc _ bs <- readMarkdown def{ readerExtensions = readerExtensions opts } (pack s) + return $ fromList bs -getBlocks :: Element -> OPML Blocks -getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) +getBlocks :: PandocMonad m => Element -> OPML m Blocks +getBlocks e = mconcat <$> mapM parseBlock (elContent e) -parseBlock :: Content -> OPML Blocks +parseBlock :: PandocMonad m => Content -> OPML m Blocks parseBlock (Elem e) = case qName (elName e) of "ownerName" -> mempty <$ modify (\st -> diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 046fb4d6d..875c18a85 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -32,31 +32,45 @@ Entry point to the odt reader. module Text.Pandoc.Readers.Odt ( readOdt ) where -import Codec.Archive.Zip -import qualified Text.XML.Light as XML +import Codec.Archive.Zip +import qualified Text.XML.Light as XML -import qualified Data.ByteString.Lazy as B +import qualified Data.ByteString.Lazy as B -import System.FilePath +import System.FilePath -import Text.Pandoc.Definition -import Text.Pandoc.Error -import Text.Pandoc.Options -import Text.Pandoc.MediaBag -import qualified Text.Pandoc.UTF8 as UTF8 +import Control.Monad.Except (throwError) -import Text.Pandoc.Readers.Odt.ContentReader -import Text.Pandoc.Readers.Odt.StyleReader +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P +import Text.Pandoc.Definition +import Text.Pandoc.Error +import Text.Pandoc.MediaBag +import Text.Pandoc.Options +import qualified Text.Pandoc.UTF8 as UTF8 -import Text.Pandoc.Readers.Odt.Generic.XMLConverter -import Text.Pandoc.Readers.Odt.Generic.Fallible -import Text.Pandoc.Shared (filteredFilesFromArchive) +import Text.Pandoc.Readers.Odt.ContentReader +import Text.Pandoc.Readers.Odt.StyleReader --- -readOdt :: ReaderOptions +import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Readers.Odt.Generic.XMLConverter +import Text.Pandoc.Shared (filteredFilesFromArchive) + +readOdt :: PandocMonad m + => ReaderOptions -> B.ByteString - -> Either PandocError (Pandoc, MediaBag) -readOdt _ bytes = bytesToOdt bytes-- of + -> m Pandoc +readOdt opts bytes = case readOdt' opts bytes of + Right (doc, mb) -> do + P.setMediaBag mb + return doc + Left e -> throwError e + +-- +readOdt' :: ReaderOptions + -> B.ByteString + -> Either PandocError (Pandoc, MediaBag) +readOdt' _ bytes = bytesToOdt bytes-- of -- Right (pandoc, mediaBag) -> Right (pandoc , mediaBag) -- Left err -> Left err @@ -64,7 +78,7 @@ readOdt _ bytes = bytesToOdt bytes-- of bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag) bytesToOdt bytes = case toArchiveOrFail bytes of Right archive -> archiveToOdt archive - Left _ -> Left $ ParseFailure "Couldn't parse odt file." + Left _ -> Left $ PandocParseError "Couldn't parse odt file." -- archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag) @@ -85,7 +99,7 @@ archiveToOdt archive | otherwise -- Not very detailed, but I don't think more information would be helpful - = Left $ ParseFailure "Couldn't parse odt file." + = Left $ PandocParseError "Couldn't parse odt file." where filePathIsOdtMedia :: FilePath -> Bool filePathIsOdtMedia fp = diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs index b056f1ecc..73bed545e 100644 --- a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs +++ b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs @@ -1,6 +1,6 @@ -{-# LANGUAGE Arrows #-} -{-# LANGUAGE TupleSections #-} + {-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE TupleSections #-} {- Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> @@ -38,17 +38,17 @@ faster and easier to implement this way. module Text.Pandoc.Readers.Odt.Arrows.State where -import Prelude hiding ( foldr, foldl ) +import Prelude hiding (foldl, foldr) -import qualified Control.Category as Cat -import Control.Arrow -import Control.Monad +import Control.Arrow +import qualified Control.Category as Cat +import Control.Monad -import Data.Foldable -import Data.Monoid +import Data.Foldable +import Data.Monoid -import Text.Pandoc.Readers.Odt.Arrows.Utils -import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Readers.Odt.Arrows.Utils +import Text.Pandoc.Readers.Odt.Generic.Fallible newtype ArrowState state a b = ArrowState @@ -59,10 +59,6 @@ withState :: (state -> a -> (state, b)) -> ArrowState state a b withState = ArrowState . uncurry -- | Constructor -withState' :: ((state, a) -> (state, b)) -> ArrowState state a b -withState' = ArrowState - --- | Constructor modifyState :: (state -> state ) -> ArrowState state a a modifyState = ArrowState . first @@ -79,10 +75,6 @@ extractFromState :: (state -> b ) -> ArrowState state x b extractFromState f = ArrowState $ \(state,_) -> (state, f state) -- | Constructor -withUnchangedState :: (state -> a -> b ) -> ArrowState state a b -withUnchangedState f = ArrowState $ \(state,a) -> (state, f state a) - --- | Constructor tryModifyState :: (state -> Either f state) -> ArrowState state a (Either f a) tryModifyState f = ArrowState $ \(state,a) @@ -90,7 +82,7 @@ tryModifyState f = ArrowState $ \(state,a) instance Cat.Category (ArrowState s) where id = ArrowState id - arrow2 . arrow1 = ArrowState $ (runArrowState arrow2).(runArrowState arrow1) + arrow2 . arrow1 = ArrowState $ runArrowState arrow2 . runArrowState arrow1 instance Arrow (ArrowState state) where arr = ignoringState @@ -107,43 +99,9 @@ instance ArrowChoice (ArrowState state) where Left l -> (s, Left l) Right r -> second Right $ runArrowState a (s,r) -instance ArrowLoop (ArrowState state) where - loop a = ArrowState $ \(s, x) - -> let (s', (x', _d)) = runArrowState a (s, (x, _d)) - in (s', x') - instance ArrowApply (ArrowState state) where app = ArrowState $ \(s, (f,b)) -> runArrowState f (s,b) - --- | Embedding of a state arrow in a state arrow with a different state type. -switchState :: (s -> s') -> (s' -> s) -> ArrowState s' x y -> ArrowState s x y -switchState there back a = ArrowState $ first there - >>> runArrowState a - >>> first back - --- | Lift a state arrow to modify the state of an arrow --- with a different state type. -liftToState :: (s -> s') -> ArrowState s' s s -> ArrowState s x x -liftToState unlift a = modifyState $ unlift &&& id - >>> runArrowState a - >>> snd - --- | Switches the type of the state temporarily. --- Drops the intermediate result state, behaving like the identity arrow, --- save for side effects in the state. -withSubState :: ArrowState s x s2 -> ArrowState s2 s s -> ArrowState s x x -withSubState unlift a = keepingTheValue (withSubState unlift a) >>^ fst - --- | Switches the type of the state temporarily. --- Returns the resulting sub-state. -withSubState' :: ArrowState s x s' -> ArrowState s' s s -> ArrowState s x s' -withSubState' unlift a = ArrowState $ runArrowState unlift - >>> switch - >>> runArrowState a - >>> switch - where switch (x,y) = (y,x) - -- | Switches the type of the state temporarily. -- Drops the intermediate result state, behaving like a fallible -- identity arrow, save for side effects in the state. @@ -175,49 +133,13 @@ foldS :: (Foldable f, Monoid m) => ArrowState s x m -> ArrowState s (f x) m foldS a = ArrowState $ \(s,f) -> foldr a' (s,mempty) f where a' x (s',m) = second (m <>) $ runArrowState a (s',x) --- | Fold a state arrow through something 'Foldable'. Collect the results --- in a 'Monoid'. --- Intermediate form of a fold between one with "only" a 'Monoid' --- and one with any function. -foldSL :: (Foldable f, Monoid m) => ArrowState s x m -> ArrowState s (f x) m -foldSL a = ArrowState $ \(s,f) -> foldl a' (s,mempty) f - where a' (s',m) x = second (m <>) $ runArrowState a (s',x) - --- | Fold a fallible state arrow through something 'Foldable'. Collect the --- results in a 'Monoid'. --- Intermediate form of a fold between one with "only" a 'Monoid' --- and one with any function. --- If the iteration fails, the state will be reset to the initial one. -foldS' :: (Foldable f, Monoid m) - => ArrowState s x (Either e m) - -> ArrowState s (f x) (Either e m) -foldS' a = ArrowState $ \(s,f) -> foldr (a' s) (s,Right mempty) f - where a' s x (s',Right m) = case runArrowState a (s',x) of - (s'',Right m') -> (s'', Right (m <> m')) - (_ ,Left e ) -> (s , Left e) - a' _ _ e = e - --- | Fold a fallible state arrow through something 'Foldable'. Collect the --- results in a 'Monoid'. --- Intermediate form of a fold between one with "only" a 'Monoid' --- and one with any function. --- If the iteration fails, the state will be reset to the initial one. -foldSL' :: (Foldable f, Monoid m) - => ArrowState s x (Either e m) - -> ArrowState s (f x) (Either e m) -foldSL' a = ArrowState $ \(s,f) -> foldl (a' s) (s,Right mempty) f - where a' s (s',Right m) x = case runArrowState a (s',x) of - (s'',Right m') -> (s'', Right (m <> m')) - (_ ,Left e ) -> (s , Left e) - a' _ e _ = e - -- | Fold a state arrow through something 'Foldable'. Collect the results in a -- 'MonadPlus'. iterateS :: (Foldable f, MonadPlus m) => ArrowState s x y -> ArrowState s (f x) (m y) iterateS a = ArrowState $ \(s,f) -> foldr a' (s,mzero) f - where a' x (s',m) = second ((mplus m).return) $ runArrowState a (s',x) + where a' x (s',m) = second (mplus m.return) $ runArrowState a (s',x) -- | Fold a state arrow through something 'Foldable'. Collect the results in a -- 'MonadPlus'. @@ -225,7 +147,7 @@ iterateSL :: (Foldable f, MonadPlus m) => ArrowState s x y -> ArrowState s (f x) (m y) iterateSL a = ArrowState $ \(s,f) -> foldl a' (s,mzero) f - where a' (s',m) x = second ((mplus m).return) $ runArrowState a (s',x) + where a' (s',m) x = second (mplus m.return) $ runArrowState a (s',x) -- | Fold a fallible state arrow through something 'Foldable'. @@ -239,15 +161,3 @@ iterateS' a = ArrowState $ \(s,f) -> foldr (a' s) (s,Right mzero) f (s'',Right m') -> (s'',Right $ mplus m $ return m') (_ ,Left e ) -> (s ,Left e ) a' _ _ e = e - --- | Fold a fallible state arrow through something 'Foldable'. --- Collect the results in a 'MonadPlus'. --- If the iteration fails, the state will be reset to the initial one. -iterateSL' :: (Foldable f, MonadPlus m) - => ArrowState s x (Either e y ) - -> ArrowState s (f x) (Either e (m y)) -iterateSL' a = ArrowState $ \(s,f) -> foldl (a' s) (s,Right mzero) f - where a' s (s',Right m) x = case runArrowState a (s',x) of - (s'',Right m') -> (s'',Right $ mplus m $ return m') - (_ ,Left e ) -> (s ,Left e ) - a' _ e _ = e diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/Utils.hs b/src/Text/Pandoc/Readers/Odt/Arrows/Utils.hs index 218a85661..ef8b2d18a 100644 --- a/src/Text/Pandoc/Readers/Odt/Arrows/Utils.hs +++ b/src/Text/Pandoc/Readers/Odt/Arrows/Utils.hs @@ -39,14 +39,11 @@ with an equivalent return value. -- We export everything module Text.Pandoc.Readers.Odt.Arrows.Utils where -import Control.Arrow -import Control.Monad ( join, MonadPlus(..) ) +import Control.Arrow +import Control.Monad (join) -import qualified Data.Foldable as F -import Data.Monoid - -import Text.Pandoc.Readers.Odt.Generic.Fallible -import Text.Pandoc.Readers.Odt.Generic.Utils +import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Readers.Odt.Generic.Utils and2 :: (Arrow a) => a b c -> a b c' -> a b (c,c') and2 = (&&&) @@ -63,12 +60,6 @@ and5 :: (Arrow a) and6 :: (Arrow a) => a b c0->a b c1->a b c2->a b c3->a b c4->a b c5 -> a b (c0,c1,c2,c3,c4,c5 ) -and7 :: (Arrow a) - => a b c0->a b c1->a b c2->a b c3->a b c4->a b c5->a b c6 - -> a b (c0,c1,c2,c3,c4,c5,c6 ) -and8 :: (Arrow a) - => a b c0->a b c1->a b c2->a b c3->a b c4->a b c5->a b c6->a b c7 - -> a b (c0,c1,c2,c3,c4,c5,c6,c7) and3 a b c = (and2 a b ) &&& c >>^ \((z,y ) , x) -> (z,y,x ) @@ -78,10 +69,6 @@ and5 a b c d e = (and4 a b c d ) &&& e >>^ \((z,y,x,w ) , v) -> (z,y,x,w,v ) and6 a b c d e f = (and5 a b c d e ) &&& f >>^ \((z,y,x,w,v ) , u) -> (z,y,x,w,v,u ) -and7 a b c d e f g = (and6 a b c d e f ) &&& g - >>^ \((z,y,x,w,v,u ) , t) -> (z,y,x,w,v,u,t ) -and8 a b c d e f g h = (and7 a b c d e f g) &&& h - >>^ \((z,y,x,w,v,u,t) , s) -> (z,y,x,w,v,u,t,s) liftA2 :: (Arrow a) => (x -> y -> z) -> a b x -> a b y -> a b z liftA2 f a b = a &&& b >>^ uncurry f @@ -98,19 +85,11 @@ liftA5 :: (Arrow a) => (z->y->x->w->v -> r) liftA6 :: (Arrow a) => (z->y->x->w->v->u -> r) -> a b z->a b y->a b x->a b w->a b v->a b u -> a b r -liftA7 :: (Arrow a) => (z->y->x->w->v->u->t -> r) - -> a b z->a b y->a b x->a b w->a b v->a b u->a b t - -> a b r -liftA8 :: (Arrow a) => (z->y->x->w->v->u->t->s -> r) - -> a b z->a b y->a b x->a b w->a b v->a b u->a b t->a b s - -> a b r liftA3 fun a b c = and3 a b c >>^ uncurry3 fun liftA4 fun a b c d = and4 a b c d >>^ uncurry4 fun liftA5 fun a b c d e = and5 a b c d e >>^ uncurry5 fun liftA6 fun a b c d e f = and6 a b c d e f >>^ uncurry6 fun -liftA7 fun a b c d e f g = and7 a b c d e f g >>^ uncurry7 fun -liftA8 fun a b c d e f g h = and8 a b c d e f g h >>^ uncurry8 fun liftA :: (Arrow a) => (y -> z) -> a b y -> a b z liftA fun a = a >>^ fun @@ -124,28 +103,12 @@ liftA fun a = a >>^ fun duplicate :: (Arrow a) => a b (b,b) duplicate = arr $ join (,) --- | Lifts the combination of two values into an arrow. -joinOn :: (Arrow a) => (x -> y -> z) -> a (x,y) z -joinOn = arr.uncurry - -- | Applies a function to the uncurried result-pair of an arrow-application. -- (The %-symbol was chosen to evoke an association with pairs.) (>>%) :: (Arrow a) => a x (b,c) -> (b -> c -> d) -> a x d a >>% f = a >>^ uncurry f --- | '(>>%)' with its arguments flipped -(%<<) :: (Arrow a) => (b -> c -> d) -> a x (b,c) -> a x d -(%<<) = flip (>>%) - --- | Precomposition with an uncurried function -(%>>) :: (Arrow a) => (b -> c -> d) -> a d r -> a (b,c) r -f %>> a = uncurry f ^>> a - --- | Precomposition with an uncurried function (right to left variant) -(<<%) :: (Arrow a) => a d r -> (b -> c -> d) -> a (b,c) r -(<<%) = flip (%>>) - -infixr 2 >>%, %<<, %>>, <<% +infixr 2 >>% -- | Duplicate a value and apply an arrow to the second instance. @@ -156,56 +119,6 @@ infixr 2 >>%, %<<, %>>, <<% keepingTheValue :: (Arrow a) => a b c -> a b (b,c) keepingTheValue a = returnA &&& a --- | Duplicate a value and apply an arrow to the first instance. --- Aequivalent to --- > \a -> duplicate >>> first a --- or --- > \a -> a &&& returnA -keepingTheValue' :: (Arrow a) => a b c -> a b (c,b) -keepingTheValue' a = a &&& returnA - --- | 'bind' from the "Maybe"-Monad lifted into an 'ArrowChoice'. --- Actually, it's the more complex '(>=>)', because 'bind' alone does not --- combine as nicely in arrow form. --- The current implementation is not the most efficient one, because it can --- not return directly if a 'Nothing' is encountered. That in turn follows --- from the type system, as 'Nothing' has an "invisible" type parameter that --- can not be dropped early. --- --- Also, there probably is a way to generalize this to other monads --- or applicatives, but I'm leaving that as an exercise to the reader. --- I have a feeling there is a new Arrow-typeclass to be found that is less --- restrictive than 'ArrowApply'. If it is already out there, --- I have not seen it yet. ('ArrowPlus' for example is not general enough.) -(>>>=) :: (ArrowChoice a) => a x (Maybe b) -> a b (Maybe c) -> a x (Maybe c) -a1 >>>= a2 = a1 >>> maybeToChoice >>> right a2 >>> choiceToMaybe >>^ join - -infixr 2 >>>= - --- | 'mplus' Lifted into an arrow. No 'ArrowPlus' required. --- (But still different from a true bind) -(>++<) :: (Arrow a, MonadPlus m) => a x (m b) -> a x (m b) -> a x (m b) -(>++<) = liftA2 mplus - --- | Left-compose with a pure function -leftLift :: (ArrowChoice a) => (l -> l') -> a (Either l r) (Either l' r) -leftLift = left.arr - --- | Right-compose with a pure function -rightLift :: (ArrowChoice a) => (r -> r') -> a (Either l r) (Either l r') -rightLift = right.arr - - -( ^+++ ) :: (ArrowChoice a) => (b -> c) -> a b' c' -> a (Either b b') (Either c c') -( +++^ ) :: (ArrowChoice a) => a b c -> (b' -> c') -> a (Either b b') (Either c c') -( ^+++^ ) :: (ArrowChoice a) => (b -> c) -> (b' -> c') -> a (Either b b') (Either c c') - -l ^+++ r = leftLift l >>> right r -l +++^ r = left l >>> rightLift r -l ^+++^ r = leftLift l >>> rightLift r - -infixr 2 ^+++, +++^, ^+++^ - ( ^||| ) :: (ArrowChoice a) => (b -> d) -> a c d -> a (Either b c) d ( |||^ ) :: (ArrowChoice a) => a b d -> (c -> d) -> a (Either b c) d ( ^|||^ ) :: (ArrowChoice a) => (b -> d) -> (c -> d) -> a (Either b c) d @@ -218,33 +131,12 @@ infixr 2 ^||| , |||^, ^|||^ ( ^&&& ) :: (Arrow a) => (b -> c) -> a b c' -> a b (c,c') ( &&&^ ) :: (Arrow a) => a b c -> (b -> c') -> a b (c,c') -( ^&&&^ ) :: (Arrow a) => (b -> c) -> (b -> c') -> a b (c,c') l ^&&& r = arr l &&& r l &&&^ r = l &&& arr r -l ^&&&^ r = arr l &&& arr r - -infixr 3 ^&&&, &&&^, ^&&&^ -( ^*** ) :: (Arrow a) => (b -> c) -> a b' c' -> a (b,b') (c,c') -( ***^ ) :: (Arrow a) => a b c -> (b' -> c') -> a (b,b') (c,c') -( ^***^ ) :: (Arrow a) => (b -> c) -> (b' -> c') -> a (b,b') (c,c') +infixr 3 ^&&&, &&&^ -l ^*** r = arr l *** r -l ***^ r = l *** arr r -l ^***^ r = arr l *** arr r - -infixr 3 ^***, ***^, ^***^ - --- | A version of --- --- >>> \p -> arr (\x -> if p x the Right x else Left x) --- --- but with p being an arrow -choose :: (ArrowChoice a) => a b Bool -> a b (Either b b) -choose checkValue = keepingTheValue checkValue >>^ select - where select (x,True ) = Right x - select (x,False ) = Left x -- | Converts @Right a@ into @Just a@ and @Left _@ into @Nothing@. choiceToMaybe :: (ArrowChoice a) => a (Either l r) (Maybe r) @@ -258,130 +150,15 @@ maybeToChoice = arr maybeToEither returnV :: (Arrow a) => c -> a x c returnV = arr.const --- | 'returnA' dropping everything -returnA_ :: (Arrow a) => a _b () -returnA_ = returnV () - --- | Wrapper for an arrow that can be evaluated im parallel. All --- Arrows can be evaluated in parallel, as long as they return a --- monoid. -newtype ParallelArrow a b c = CoEval { evalParallelArrow :: a b c } - deriving (Eq, Ord, Show) - -instance (Arrow a, Monoid m) => Monoid (ParallelArrow a b m) where - mempty = CoEval $ returnV mempty - (CoEval a) `mappend` (CoEval ~b) = CoEval $ a &&& b >>% mappend - --- | Evaluates a collection of arrows in a parallel fashion. --- --- This is in essence a fold of '(&&&)' over the collection, --- so the actual execution order and parallelity depends on the --- implementation of '(&&&)' in the arrow in question. --- The default implementation of '(&&&)' for example keeps the --- order as given in the collection. --- --- This function can be seen as a generalization of --- 'Control.Applicative.sequenceA' to arrows or as an alternative to --- a fold with 'Control.Applicative.WrappedArrow', which --- substitutes the monoid with function application. --- -coEval :: (Arrow a, F.Foldable f, Monoid m) => f (a b m) -> a b m -coEval = evalParallelArrow . (F.foldMap CoEval) - -- | Defines Left as failure, Right as success type FallibleArrow a input failure success = a input (Either failure success) -type ReFallibleArrow a failure success success' - = FallibleArrow a (Either failure success) failure success' - --- | Wrapper for fallible arrows. Fallible arrows are all arrows that return --- an Either value where left is a faliure and right is a success value. -newtype AlternativeArrow a input failure success - = TryArrow { evalAlternativeArrow :: FallibleArrow a input failure success } - - -instance (ArrowChoice a, Monoid failure) - => Monoid (AlternativeArrow a input failure success) where - mempty = TryArrow $ returnV $ Left mempty - (TryArrow a) `mappend` (TryArrow b) - = TryArrow $ a &&& b - >>^ \(a',~b') - -> ( (\a'' -> left (mappend a'') b') ||| Right ) - a' - --- | Evaluates a collection of fallible arrows, trying each one in succession. --- Left values are interpreted as failures, right values as successes. --- --- The evaluation is stopped once an arrow succeeds. --- Up to that point, all failures are collected in the failure-monoid. --- Note that '()' is a monoid, and thus can serve as a failure-collector if --- you are uninterested in the exact failures. --- --- This is in essence a fold of '(&&&)' over the collection, enhanced with a --- little bit of repackaging, so the actual execution order depends on the --- implementation of '(&&&)' in the arrow in question. --- The default implementation of '(&&&)' for example keeps the --- order as given in the collection. --- -tryArrows :: (ArrowChoice a, F.Foldable f, Monoid failure) - => f (FallibleArrow a b failure success) - -> FallibleArrow a b failure success -tryArrows = evalAlternativeArrow . (F.foldMap TryArrow) - --- -liftSuccess :: (ArrowChoice a) - => (success -> success') - -> ReFallibleArrow a failure success success' -liftSuccess = rightLift - -- liftAsSuccess :: (ArrowChoice a) => a x success -> FallibleArrow a x failure success liftAsSuccess a = a >>^ Right --- -asFallibleArrow :: (ArrowChoice a) - => a x success - -> FallibleArrow a x failure success -asFallibleArrow a = a >>^ Right - --- | Raises an error into a 'ReFallibleArrow' if the arrow is already in --- "error mode" -liftError :: (ArrowChoice a, Monoid failure) - => failure - -> ReFallibleArrow a failure success success -liftError e = leftLift (e <>) - --- | Raises an error into a 'FallibleArrow', droping both the arrow input --- and any previously stored error value. -_raiseA :: (ArrowChoice a) - => failure - -> FallibleArrow a x failure success -_raiseA e = returnV (Left e) - --- | Raises an empty error into a 'FallibleArrow', droping both the arrow input --- and any previously stored error value. -_raiseAEmpty :: (ArrowChoice a, Monoid failure) - => FallibleArrow a x failure success -_raiseAEmpty = _raiseA mempty - --- | Raises an error into a 'ReFallibleArrow', possibly appending the new error --- to an existing one -raiseA :: (ArrowChoice a, Monoid failure) - => failure - -> ReFallibleArrow a failure success success -raiseA e = arr $ Left.(either (<> e) (const e)) - --- | Raises an empty error into a 'ReFallibleArrow'. If there already is an --- error, nothing changes. --- (Note that this function is only aequivalent to @raiseA mempty@ iff the --- failure monoid follows the monoid laws.) -raiseAEmpty :: (ArrowChoice a, Monoid failure) - => ReFallibleArrow a failure success success -raiseAEmpty = arr (fromRight (const mempty) >>> Left) - - -- | Execute the second arrow if the first succeeds (>>?) :: (ArrowChoice a) => FallibleArrow a x failure success @@ -410,20 +187,6 @@ a >>?^? b = a >>> Left ^|||^ b -> FallibleArrow a x failure success' a ^>>? b = a ^>> Left ^||| b --- | Execute the lifted second arrow if the lifted first arrow succeeds -(^>>?^) :: (ArrowChoice a) - => (x -> Either failure success) - -> (success -> success') - -> FallibleArrow a x failure success' -a ^>>?^ f = arr $ a >>> right f - --- | Execute the lifted second arrow if the lifted first arrow succeeds -(^>>?^?) :: (ArrowChoice a) - => (x -> Either failure success) - -> (success -> Either failure success') - -> FallibleArrow a x failure success' -a ^>>?^? f = a ^>> Left ^|||^ f - -- | Execute the second, non-fallible arrow if the first arrow succeeds (>>?!) :: (ArrowChoice a) => FallibleArrow a x failure success @@ -448,38 +211,14 @@ a ^>>?% f = arr a >>?^ (uncurry f) --- (>>?%?) :: (ArrowChoice a) => FallibleArrow a x f (b,b') - -> (b -> b' -> (Either f c)) + -> (b -> b' -> Either f c) -> FallibleArrow a x f c -a >>?%? f = a >>?^? (uncurry f) +a >>?%? f = a >>?^? uncurry f infixr 1 >>?, >>?^, >>?^? -infixr 1 ^>>?, ^>>?^, ^>>?^?, >>?! +infixr 1 ^>>?, >>?! infixr 1 >>?%, ^>>?%, >>?%? --- | Keep values that are Right, replace Left values by a constant. -ifFailedUse :: (ArrowChoice a) => v -> a (Either f v) v -ifFailedUse v = arr $ either (const v) id - --- | '(&&)' lifted into an arrow -(<&&>) :: (Arrow a) => a x Bool -> a x Bool -> a x Bool -(<&&>) = liftA2 (&&) - --- | '(||)' lifted into an arrow -(<||>) :: (Arrow a) => a x Bool -> a x Bool -> a x Bool -(<||>) = liftA2 (||) - --- | An equivalent of '(&&)' in a fallible arrow -(>&&<) :: (ArrowChoice a, Monoid f) => FallibleArrow a x f s - -> FallibleArrow a x f s' - -> FallibleArrow a x f (s,s') -(>&&<) = liftA2 chooseMin - --- | An equivalent of '(||)' in some forms of fallible arrows -(>||<) :: (ArrowChoice a, Monoid f, Monoid s) => FallibleArrow a x f s - -> FallibleArrow a x f s - -> FallibleArrow a x f s -(>||<) = liftA2 chooseMax - -- | An arrow version of a short-circuit (<|>) ifFailedDo :: (ArrowChoice a) => FallibleArrow a x f y @@ -489,7 +228,4 @@ ifFailedDo a b = keepingTheValue a >>> repackage ^>> (b |||^ Right) where repackage (x , Left _) = Left x repackage (_ , Right y) = Right y -infixr 4 <&&>, <||>, >&&<, >||< infixr 1 `ifFailedDo` - - diff --git a/src/Text/Pandoc/Readers/Odt/Base.hs b/src/Text/Pandoc/Readers/Odt/Base.hs index 1f095bade..51c2da788 100644 --- a/src/Text/Pandoc/Readers/Odt/Base.hs +++ b/src/Text/Pandoc/Readers/Odt/Base.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE PatternGuards #-} + {- Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> @@ -32,12 +32,11 @@ Core types of the odt reader. module Text.Pandoc.Readers.Odt.Base where -import Text.Pandoc.Readers.Odt.Generic.XMLConverter -import Text.Pandoc.Readers.Odt.Namespaces +import Text.Pandoc.Readers.Odt.Generic.XMLConverter +import Text.Pandoc.Readers.Odt.Namespaces type OdtConverterState s = XMLConverterState Namespace s type XMLReader s a b = FallibleXMLConverter Namespace s a b type XMLReaderSafe s a b = XMLConverter Namespace s a b - diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 2672b01ef..380f16c66 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -1,8 +1,8 @@ {-# LANGUAGE Arrows #-} -{-# LANGUAGE TupleSections #-} {-# LANGUAGE PatternGuards #-} -{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE ViewPatterns #-} {- Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> @@ -39,29 +39,28 @@ module Text.Pandoc.Readers.Odt.ContentReader , read_body ) where -import Control.Arrow -import Control.Applicative hiding ( liftA, liftA2, liftA3 ) +import Control.Applicative hiding (liftA, liftA2, liftA3) +import Control.Arrow -import qualified Data.ByteString.Lazy as B -import qualified Data.Map as M -import Data.List ( find, intercalate ) -import Data.Maybe +import qualified Data.ByteString.Lazy as B +import Data.List (find, intercalate) +import qualified Data.Map as M +import Data.Maybe -import qualified Text.XML.Light as XML +import qualified Text.XML.Light as XML -import Text.Pandoc.Definition -import Text.Pandoc.Builder -import Text.Pandoc.MediaBag (insertMedia, MediaBag) -import Text.Pandoc.Shared +import Text.Pandoc.Builder +import Text.Pandoc.MediaBag (MediaBag, insertMedia) +import Text.Pandoc.Shared -import Text.Pandoc.Readers.Odt.Base -import Text.Pandoc.Readers.Odt.Namespaces -import Text.Pandoc.Readers.Odt.StyleReader +import Text.Pandoc.Readers.Odt.Base +import Text.Pandoc.Readers.Odt.Namespaces +import Text.Pandoc.Readers.Odt.StyleReader -import Text.Pandoc.Readers.Odt.Arrows.Utils -import Text.Pandoc.Readers.Odt.Generic.XMLConverter -import Text.Pandoc.Readers.Odt.Generic.Fallible -import Text.Pandoc.Readers.Odt.Generic.Utils +import Text.Pandoc.Readers.Odt.Arrows.Utils +import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Readers.Odt.Generic.Utils +import Text.Pandoc.Readers.Odt.Generic.XMLConverter import qualified Data.Set as Set @@ -94,8 +93,6 @@ data ReaderState , envMedia :: Media -- | Hold binary resources used in the document , odtMediaBag :: MediaBag --- , sequences --- , trackedChangeIDs } deriving ( Show ) @@ -250,7 +247,7 @@ getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do modifyExtraState (putPrettyAnchor uglyAnchor newPretty) -<< newPretty -- | Input: basis for a new header anchor --- Ouput: saved new anchor +-- Output: saved new anchor getHeaderAnchor :: OdtReaderSafe Inlines Anchor getHeaderAnchor = proc title -> do state <- getExtraState -< () @@ -296,7 +293,7 @@ withNewStyle a = proc x -> do modifier <- arr modifierFromStyleDiff -< triple fShouldTrace <- isStyleToTrace -< style case fShouldTrace of - Right shouldTrace -> do + Right shouldTrace -> if shouldTrace then do pushStyle -< style @@ -325,7 +322,7 @@ type InlineModifier = Inlines -> Inlines modifierFromStyleDiff :: PropertyTriple -> InlineModifier modifierFromStyleDiff propertyTriple = composition $ - (getVPosModifier propertyTriple) + getVPosModifier propertyTriple : map (first ($ propertyTriple) >>> ifThen_else ignore) [ (hasEmphChanged , emph ) , (hasChanged isStrong , strong ) @@ -344,9 +341,9 @@ modifierFromStyleDiff propertyTriple = Just oldVPos -> getVPosModifier' (oldVPos, verticalPosition textProps) getVPosModifier' (oldVPos , newVPos ) | oldVPos == newVPos = ignore - getVPosModifier' ( _ , VPosSub ) = subscript - getVPosModifier' ( _ , VPosSuper ) = superscript - getVPosModifier' ( _ , _ ) = ignore + getVPosModifier' ( _ , VPosSub ) = subscript + getVPosModifier' ( _ , VPosSuper ) = superscript + getVPosModifier' ( _ , _ ) = ignore hasEmphChanged :: PropertyTriple -> Bool hasEmphChanged = swing any [ hasChanged isEmphasised @@ -355,17 +352,17 @@ modifierFromStyleDiff propertyTriple = ] hasChanged property triple@(_, property -> newProperty, _) = - maybe True (/=newProperty) (lookupPreviousValue property triple) + (/= Just newProperty) (lookupPreviousValue property triple) hasChangedM property triple@(_, textProps,_) = fromMaybe False $ (/=) <$> property textProps <*> lookupPreviousValueM property triple - lookupPreviousValue f = lookupPreviousStyleValue ((fmap f).textProperties) + lookupPreviousValue f = lookupPreviousStyleValue (fmap f . textProperties) lookupPreviousValueM f = lookupPreviousStyleValue ((f =<<).textProperties) lookupPreviousStyleValue f (ReaderState{..},_,mFamily) - = ( findBy f $ extendedStylePropertyChain styleTrace styleSet ) + = findBy f (extendedStylePropertyChain styleTrace styleSet) <|> ( f =<< fmap (lookupDefaultStyle' styleSet) mFamily ) @@ -569,7 +566,7 @@ read_text_seq = matchingElement NsText "sequence" -- specifically. I honor that, although the current implementation of '(<>)' --- for 'Inlines' in "Text.Pandoc.Builder" will collaps them agein. +-- for 'Inlines' in "Text.Pandoc.Builder" will collapse them again. -- The rational is to be prepared for future modifications. read_spaces :: InlineMatcher read_spaces = matchingElement NsText "s" ( @@ -663,7 +660,7 @@ read_list = matchingElement NsText "list" -- read_list_item :: ElementMatcher [Blocks] read_list_item = matchingElement NsText "list-item" - $ liftA (compactify'.(:[])) + $ liftA (compactify.(:[])) ( matchChildContent' [ read_paragraph , read_header , read_list @@ -749,7 +746,7 @@ read_table_row = matchingElement NsTable "table-row" -- read_table_cell :: ElementMatcher [Blocks] read_table_cell = matchingElement NsTable "table-cell" - $ liftA (compactify'.(:[])) + $ liftA (compactify.(:[])) $ matchChildContent' [ read_paragraph ] @@ -796,8 +793,7 @@ read_image_src = matchingElement NsDraw "image" Left _ -> returnV "" -< () read_frame_title :: InlineMatcher -read_frame_title = matchingElement NsSVG "title" - $ (matchChildContent [] read_plain_text) +read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) read_frame_text_box :: InlineMatcher read_frame_text_box = matchingElement NsDraw "text-box" @@ -806,12 +802,12 @@ read_frame_text_box = matchingElement NsDraw "text-box" arr read_img_with_caption -< toList paragraphs read_img_with_caption :: [Block] -> Inlines -read_img_with_caption ((Para ((Image attr alt (src,title)) : [])) : _) = +read_img_with_caption (Para [Image attr alt (src,title)] : _) = singleton (Image attr alt (src, 'f':'i':'g':':':title)) -- no text, default caption -read_img_with_caption ((Para ((Image attr _ (src,title)) : txt)) : _) = +read_img_with_caption (Para (Image attr _ (src,title) : txt) : _) = singleton (Image attr txt (src, 'f':'i':'g':':':title) ) -- override caption with the text that follows -read_img_with_caption ( (Para (_ : xs)) : ys) = - read_img_with_caption ((Para xs) : ys) +read_img_with_caption ( Para (_ : xs) : ys) = + read_img_with_caption (Para xs : ys) read_img_with_caption _ = mempty @@ -899,9 +895,6 @@ read_reference_ref = matchingElement NsText "reference-ref" -- Entry point ---------------------- ---read_plain_content :: OdtReaderSafe _x Inlines ---read_plain_content = strContent >>^ text - read_text :: OdtReaderSafe _x Pandoc read_text = matchChildContent' [ read_header , read_paragraph @@ -915,8 +908,8 @@ post_process (Pandoc m blocks) = Pandoc m (post_process' blocks) post_process' :: [Block] -> [Block] -post_process' ((Table _ a w h r) : (Div ("", ["caption"], _) [Para inlines] ) : xs) = - (Table inlines a w h r) : ( post_process' xs ) +post_process' (Table _ a w h r : Div ("", ["caption"], _) [Para inlines] : xs) = + Table inlines a w h r : post_process' xs post_process' bs = bs read_body :: OdtReader _x (Pandoc, MediaBag) diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Fallible.hs b/src/Text/Pandoc/Readers/Odt/Generic/Fallible.hs index 877443543..f8ea5c605 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Fallible.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Fallible.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving #-} + {- Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> @@ -39,11 +39,7 @@ compatible instances of "ArrowChoice". -- We export everything module Text.Pandoc.Readers.Odt.Generic.Fallible where -import Control.Applicative -import Control.Monad - -import qualified Data.Foldable as F -import Data.Monoid ((<>)) +import Data.Monoid ((<>)) -- | Default for now. Will probably become a class at some point. type Failure = () @@ -51,16 +47,6 @@ type Failure = () type Fallible a = Either Failure a --- | False -> Left (), True -> Right () -boolToEither :: Bool -> Fallible () -boolToEither False = Left () -boolToEither True = Right () - --- | False -> Left (), True -> Right () -boolToChoice :: Bool -> Fallible () -boolToChoice False = Left () -boolToChoice True = Right () - -- maybeToEither :: Maybe a -> Fallible a maybeToEither (Just a) = Right a @@ -71,21 +57,11 @@ eitherToMaybe :: Either _l a -> Maybe a eitherToMaybe (Left _) = Nothing eitherToMaybe (Right a) = Just a --- | > untagEither === either id id -untagEither :: Either a a -> a -untagEither (Left a) = a -untagEither (Right a) = a - -- | > fromLeft f === either f id fromLeft :: (a -> b) -> Either a b -> b fromLeft f (Left a) = f a fromLeft _ (Right b) = b --- | > fromRight f === either id f -fromRight :: (a -> b) -> Either b a -> b -fromRight _ (Left b) = b -fromRight f (Right a) = f a - -- | > recover a === fromLeft (const a) === either (const a) id recover :: a -> Either _f a -> a recover a (Left _) = a @@ -110,24 +86,6 @@ collapseEither (Left f ) = Left f collapseEither (Right (Left f)) = Left f collapseEither (Right (Right x)) = Right x --- | If either of the values represents an error, the result is a --- (possibly combined) error. If both values represent a success, --- both are returned. -chooseMin :: (Monoid a) => Either a b -> Either a b' -> Either a (b,b') -chooseMin = chooseMinWith (,) - --- | If either of the values represents an error, the result is a --- (possibly combined) error. If both values represent a success, --- a combination is returned. -chooseMinWith :: (Monoid a) => (b -> b' -> c) - -> Either a b - -> Either a b' - -> Either a c -chooseMinWith (><) (Right a) (Right b) = Right $ a >< b -chooseMinWith _ (Left a) (Left b) = Left $ a <> b -chooseMinWith _ (Left a) _ = Left a -chooseMinWith _ _ (Left b) = Left b - -- | If either of the values represents a non-error, the result is a -- (possibly combined) non-error. If both values represent an error, an error -- is returned. @@ -152,109 +110,17 @@ chooseMaxWith _ _ (Right b) = Right b class ChoiceVector v where spreadChoice :: v (Either f a) -> Either f (v a) --- Let's do a few examples first - -instance ChoiceVector Maybe where - spreadChoice (Just (Left f)) = Left f - spreadChoice (Just (Right x)) = Right (Just x) - spreadChoice Nothing = Right Nothing - -instance ChoiceVector (Either l) where - spreadChoice (Right (Left f)) = Left f - spreadChoice (Right (Right x)) = Right (Right x) - spreadChoice (Left x ) = Right (Left x) - instance ChoiceVector ((,) a) where spreadChoice (_, Left f) = Left f spreadChoice (x, Right y) = Right (x,y) -- Wasn't there a newtype somewhere with the elements flipped? --- --- More instances later, first some discussion. --- --- I'll have to freshen up on type system details to see how (or if) to do --- something like --- --- > instance (ChoiceVector a, ChoiceVector b) => ChoiceVector (a b) where --- > : --- --- But maybe it would be even better to use something like --- --- > class ChoiceVector v v' f | v -> v' f where --- > spreadChoice :: v -> Either f v' --- --- That way, more places in @v@ could spread the cheer, e.g.: --- --- As before: --- -- ( a , Either f b) (a , b) f --- > instance ChoiceVector ((,) a (Either f b)) ((,) a b) f where --- > spreadChoice (_, Left f) = Left f --- > spreadChoice (a, Right b) = Right (a,b) --- --- But also: --- -- ( Either f a , b) (a , b) f --- > instance ChoiceVector ((,) (Either f a) b) ((,) a b) f where --- > spreadChoice (Right a,b) = Right (a,b) --- > spreadChoice (Left f,_) = Left f --- --- And maybe even: --- -- ( Either f a , Either f b) (a , b) f --- > instance ChoiceVector ((,) (Either f a) (Either f b)) ((,) a b) f where --- > spreadChoice (Right a , Right b) = Right (a,b) --- > spreadChoice (Left f , _ ) = Left f --- > spreadChoice ( _ , Left f) = Left f --- --- Of course that would lead to a lot of overlapping instances... --- But I can't think of a different way. A selector function might help, --- but not even a "Data.Traversable" is powerful enough for that. --- But maybe someone has already solved all this with a lens library. --- --- Well, it's an interesting academic question. But for practical purposes, --- I have more than enough right now. - -instance ChoiceVector ((,,) a b) where - spreadChoice (_,_, Left f) = Left f - spreadChoice (a,b, Right x) = Right (a,b,x) - -instance ChoiceVector ((,,,) a b c) where - spreadChoice (_,_,_, Left f) = Left f - spreadChoice (a,b,c, Right x) = Right (a,b,c,x) - -instance ChoiceVector ((,,,,) a b c d) where - spreadChoice (_,_,_,_, Left f) = Left f - spreadChoice (a,b,c,d, Right x) = Right (a,b,c,d,x) - -instance ChoiceVector (Const a) where - spreadChoice (Const c) = Right (Const c) -- need to repackage because of implicit types - --- | Fails on the first error -instance ChoiceVector [] where - spreadChoice = sequence -- using the monad instance of Either. - -- Could be generalized to "Data.Traversable" - but why play - -- with UndecidableInstances unless this is really needed. - -- | Wrapper for a list. While the normal list instance of 'ChoiceVector' -- fails whenever it can, this type will never fail. newtype SuccessList a = SuccessList { collectNonFailing :: [a] } deriving ( Eq, Ord, Show ) instance ChoiceVector SuccessList where - spreadChoice = Right . SuccessList . (foldr unTagRight []) . collectNonFailing + spreadChoice = Right . SuccessList . foldr unTagRight [] . collectNonFailing where unTagRight (Right x) = (x:) unTagRight _ = id - --- | Like 'catMaybes', but for 'Either'. -collectRights :: [Either _l r] -> [r] -collectRights = collectNonFailing . untag . spreadChoice . SuccessList - where untag = fromLeft (error "Unexpected Left") - --- | A version of 'collectRights' generalized to other containers. The --- container must be both "reducible" and "buildable". Most general containers --- should fullfill these requirements, but there is no single typeclass --- (that I know of) for that. --- Therefore, they are split between 'Foldable' and 'MonadPlus'. --- (Note that 'Data.Traversable.Traversable' alone would not be enough, either.) -collectRightsF :: (F.Foldable c, MonadPlus c) => c (Either _l r) -> c r -collectRightsF = F.foldr unTagRight mzero - where unTagRight (Right x) = mplus $ return x - unTagRight _ = id diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs index 6c10ed61d..556517259 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs @@ -1,6 +1,6 @@ + + {-# LANGUAGE TypeOperators #-} -{-# LANGUAGE TupleSections #-} -{-# LANGUAGE PatternGuards #-} {-# LANGUAGE ViewPatterns #-} {- @@ -38,8 +38,6 @@ module Text.Pandoc.Readers.Odt.Generic.Utils , uncurry4 , uncurry5 , uncurry6 -, uncurry7 -, uncurry8 , swap , reverseComposition , bool @@ -53,12 +51,12 @@ module Text.Pandoc.Readers.Odt.Generic.Utils , composition ) where -import Control.Category ( Category, (>>>), (<<<) ) -import qualified Control.Category as Cat ( id ) -import Control.Monad ( msum ) +import Control.Category (Category, (<<<), (>>>)) +import qualified Control.Category as Cat (id) +import Control.Monad (msum) -import qualified Data.Foldable as F ( Foldable, foldr ) -import Data.Maybe +import qualified Data.Foldable as F (Foldable, foldr) +import Data.Maybe -- | Aequivalent to @@ -148,15 +146,11 @@ uncurry3 :: (a->b->c -> z) -> (a,b,c ) -> z uncurry4 :: (a->b->c->d -> z) -> (a,b,c,d ) -> z uncurry5 :: (a->b->c->d->e -> z) -> (a,b,c,d,e ) -> z uncurry6 :: (a->b->c->d->e->f -> z) -> (a,b,c,d,e,f ) -> z -uncurry7 :: (a->b->c->d->e->f->g -> z) -> (a,b,c,d,e,f,g ) -> z -uncurry8 :: (a->b->c->d->e->f->g->h -> z) -> (a,b,c,d,e,f,g,h) -> z uncurry3 fun (a,b,c ) = fun a b c uncurry4 fun (a,b,c,d ) = fun a b c d uncurry5 fun (a,b,c,d,e ) = fun a b c d e uncurry6 fun (a,b,c,d,e,f ) = fun a b c d e f -uncurry7 fun (a,b,c,d,e,f,g ) = fun a b c d e f g -uncurry8 fun (a,b,c,d,e,f,g,h) = fun a b c d e f g h swap :: (a,b) -> (b,a) swap (a,b) = (b,a) @@ -168,4 +162,3 @@ findBy :: (a -> Maybe b) -> [a] -> Maybe b findBy _ [] = Nothing findBy f ((f -> Just x):_ ) = Just x findBy f ( _:xs) = findBy f xs - diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs index 8c03d1a09..428048427 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -41,50 +41,17 @@ module Text.Pandoc.Readers.Odt.Generic.XMLConverter , XMLConverterState , XMLConverter , FallibleXMLConverter -, swapPosition -, runConverter -, runConverter'' , runConverter' -, runConverterF' -, runConverterF -, getCurrentElement , getExtraState , setExtraState , modifyExtraState -, convertingExtraState , producingExtraState -, lookupNSiri -, lookupNSprefix -, readNSattributes -, elemName -, elemNameIs -, strContent -, elContent -, currentElem -, currentElemIs -, expectElement -, elChildren -, findChildren -, filterChildren -, filterChildrenName , findChild' -, findChild -, filterChild' -, filterChild -, filterChildName' -, filterChildName -, isSet , isSet' , isSetWithDefault -, hasAttrValueOf' -, failIfNotAttrValueOf -, isThatTheAttrValue -, searchAttrIn -, searchAttrWith , searchAttr , lookupAttr , lookupAttr' -, lookupAttrWithDefault , lookupDefaultingAttr , findAttr' , findAttr @@ -93,25 +60,9 @@ module Text.Pandoc.Readers.Odt.Generic.XMLConverter , readAttr' , readAttrWithDefault , getAttr --- , (>/<) --- , (?>/<) , executeIn -, collectEvery , withEveryL -, withEvery , tryAll -, tryAll' -, IdXMLConverter -, MaybeEConverter -, ElementMatchConverter -, MaybeCConverter -, ContentMatchConverter -, makeMatcherE -, makeMatcherC -, prepareMatchersE -, prepareMatchersC -, matchChildren -, matchContent'' , matchContent' , matchContent ) where @@ -120,8 +71,8 @@ import Control.Applicative hiding ( liftA, liftA2 ) import Control.Monad ( MonadPlus ) import Control.Arrow +import Data.Either ( rights ) import qualified Data.Map as M -import qualified Data.Foldable as F import Data.Default import Data.Maybe @@ -210,17 +161,6 @@ currentElement state = head (parentElements state) -- | Replace the current position by another, modifying the extra state -- in the process -swapPosition :: (extraState -> extraState') - -> [XML.Element] - -> XMLConverterState nsID extraState - -> XMLConverterState nsID extraState' -swapPosition f stack state - = state { parentElements = stack - , moreState = f (moreState state) - } - --- | Replace the current position by another, modifying the extra state --- in the process swapStack' :: XMLConverterState nsID extraState -> [XML.Element] -> ( XMLConverterState nsID extraState , [XML.Element] ) @@ -264,14 +204,6 @@ runConverter :: XMLConverter nsID extraState input output -> output runConverter converter state input = snd $ runArrowState converter (state,input) --- -runConverter'' :: (NameSpaceID nsID) - => XMLConverter nsID extraState (Fallible ()) output - -> extraState - -> XML.Element - -> output -runConverter'' converter extraState element = runConverter (readNSattributes >>> converter) (createStartState element extraState) () - runConverter' :: (NameSpaceID nsID) => FallibleXMLConverter nsID extraState () success -> extraState @@ -280,20 +212,6 @@ runConverter' :: (NameSpaceID nsID) runConverter' converter extraState element = runConverter (readNSattributes >>? converter) (createStartState element extraState) () -- -runConverterF' :: FallibleXMLConverter nsID extraState x y - -> XMLConverterState nsID extraState - -> Fallible x -> Fallible y -runConverterF' a s e = runConverter (returnV e >>? a) s e - --- -runConverterF :: (NameSpaceID nsID) - => FallibleXMLConverter nsID extraState XML.Element x - -> extraState - -> Fallible XML.Element -> Fallible x -runConverterF a s = either failWith - (\e -> runConverter a (createStartState e s) e) - --- getCurrentElement :: XMLConverter nsID extraState x XML.Element getCurrentElement = extractFromState currentElement @@ -430,57 +348,15 @@ elemNameIs nsID name = keepingTheValue (lookupNSiri nsID) >>% hasThatName -------------------------------------------------------------------------------- -- -strContent :: XMLConverter nsID extraState x String -strContent = getCurrentElement - >>^ XML.strContent - --- elContent :: XMLConverter nsID extraState x [XML.Content] elContent = getCurrentElement >>^ XML.elContent -------------------------------------------------------------------------------- --- Current element --------------------------------------------------------------------------------- - --- -currentElem :: XMLConverter nsID extraState x (XML.QName) -currentElem = getCurrentElement - >>^ XML.elName - -currentElemIs :: (NameSpaceID nsID) - => nsID -> ElementName - -> XMLConverter nsID extraState x Bool -currentElemIs nsID name = getCurrentElement - >>> elemNameIs nsID name - - - -{- -currentElemIs'' nsID name = ( (getCurrentElement >>^ XML.elName >>> - (XML.qName >>^ (&&).(== name) ) - ^&&&^ - (XML.qIRI >>^ (==) ) - ) >>% (.) - ) &&& lookupNSiri nsID >>% ($) --} - --- -expectElement :: (NameSpaceID nsID) - => nsID -> ElementName - -> FallibleXMLConverter nsID extraState x () -expectElement nsID name = currentElemIs nsID name - >>^ boolToChoice - --------------------------------------------------------------------------------- -- Chilren -------------------------------------------------------------------------------- -- -elChildren :: XMLConverter nsID extraState x [XML.Element] -elChildren = getCurrentElement - >>^ XML.elChildren - -- findChildren :: (NameSpaceID nsID) => nsID -> ElementName @@ -490,18 +366,6 @@ findChildren nsID name = elemName nsID name >>% XML.findChildren -- -filterChildren :: (XML.Element -> Bool) - -> XMLConverter nsID extraState x [XML.Element] -filterChildren p = getCurrentElement - >>^ XML.filterChildren p - --- -filterChildrenName :: (XML.QName -> Bool) - -> XMLConverter nsID extraState x [XML.Element] -filterChildrenName p = getCurrentElement - >>^ XML.filterChildrenName p - --- findChild' :: (NameSpaceID nsID) => nsID -> ElementName @@ -517,45 +381,12 @@ findChild :: (NameSpaceID nsID) findChild nsID name = findChild' nsID name >>> maybeToChoice --- -filterChild' :: (XML.Element -> Bool) - -> XMLConverter nsID extraState x (Maybe XML.Element) -filterChild' p = getCurrentElement - >>^ XML.filterChild p - --- -filterChild :: (XML.Element -> Bool) - -> FallibleXMLConverter nsID extraState x XML.Element -filterChild p = filterChild' p - >>> maybeToChoice - --- -filterChildName' :: (XML.QName -> Bool) - -> XMLConverter nsID extraState x (Maybe XML.Element) -filterChildName' p = getCurrentElement - >>^ XML.filterChildName p - --- -filterChildName :: (XML.QName -> Bool) - -> FallibleXMLConverter nsID extraState x XML.Element -filterChildName p = filterChildName' p - >>> maybeToChoice - -------------------------------------------------------------------------------- -- Attributes -------------------------------------------------------------------------------- -- -isSet :: (NameSpaceID nsID) - => nsID -> AttributeName - -> (Either Failure Bool) - -> FallibleXMLConverter nsID extraState x Bool -isSet nsID attrName deflt - = findAttr' nsID attrName - >>^ maybe deflt stringToBool - --- isSet' :: (NameSpaceID nsID) => nsID -> AttributeName -> XMLConverter nsID extraState x (Maybe Bool) @@ -570,34 +401,6 @@ isSetWithDefault nsID attrName def' = isSet' nsID attrName >>^ fromMaybe def' --- -hasAttrValueOf' :: (NameSpaceID nsID) - => nsID -> AttributeName - -> AttributeValue - -> XMLConverter nsID extraState x Bool -hasAttrValueOf' nsID attrName attrValue - = findAttr nsID attrName - >>> ( const False ^|||^ (==attrValue)) - --- -failIfNotAttrValueOf :: (NameSpaceID nsID) - => nsID -> AttributeName - -> AttributeValue - -> FallibleXMLConverter nsID extraState x () -failIfNotAttrValueOf nsID attrName attrValue - = hasAttrValueOf' nsID attrName attrValue - >>^ boolToChoice - --- | Is the value that is currently transported in the arrow the value of --- the specified attribute? -isThatTheAttrValue :: (NameSpaceID nsID) - => nsID -> AttributeName - -> FallibleXMLConverter nsID extraState AttributeValue Bool -isThatTheAttrValue nsID attrName - = keepingTheValue - (findAttr nsID attrName) - >>% right.(==) - -- | Lookup value in a dictionary, fail if no attribute found or value -- not in dictionary searchAttrIn :: (NameSpaceID nsID) @@ -608,18 +411,6 @@ searchAttrIn nsID attrName dict = findAttr nsID attrName >>?^? maybeToChoice.(`lookup` dict ) - --- | Lookup value in a dictionary. Fail if no attribute found. If value not in --- dictionary, return default value -searchAttrWith :: (NameSpaceID nsID) - => nsID -> AttributeName - -> a - -> [(AttributeValue,a)] - -> FallibleXMLConverter nsID extraState x a -searchAttrWith nsID attrName defV dict - = findAttr nsID attrName - >>?^ (fromMaybe defV).(`lookup` dict ) - -- | Lookup value in a dictionary. If attribute or value not found, -- return default value searchAttr :: (NameSpaceID nsID) @@ -789,16 +580,6 @@ prepareIteration nsID name = keepingTheValue (findChildren nsID name) >>% distributeValue --- | Applies a converter to every child element of a specific type. --- Collects results in a 'Monoid'. --- Fails completely if any conversion fails. -collectEvery :: (NameSpaceID nsID, Monoid m) - => nsID -> ElementName - -> FallibleXMLConverter nsID extraState a m - -> FallibleXMLConverter nsID extraState a m -collectEvery nsID name a = prepareIteration nsID name - >>> foldS' (switchingTheStack a) - -- withEveryL :: (NameSpaceID nsID) => nsID -> ElementName @@ -824,17 +605,7 @@ tryAll :: (NameSpaceID nsID) -> XMLConverter nsID extraState b [a] tryAll nsID name a = prepareIteration nsID name >>> iterateS (switchingTheStack a) - >>^ collectRights - --- | Applies a converter to every child element of a specific type. --- Collects all successful results. -tryAll' :: (NameSpaceID nsID, F.Foldable c, MonadPlus c) - => nsID -> ElementName - -> FallibleXMLConverter nsID extraState b a - -> XMLConverter nsID extraState b (c a) -tryAll' nsID name a = prepareIteration nsID name - >>> iterateS (switchingTheStack a) - >>^ collectRightsF + >>^ rights -------------------------------------------------------------------------------- -- Matching children @@ -843,15 +614,6 @@ tryAll' nsID name a = prepareIteration nsID name type IdXMLConverter nsID moreState x = XMLConverter nsID moreState x x -type MaybeEConverter nsID moreState x - = Maybe (IdXMLConverter nsID moreState (x, XML.Element)) - --- Chainable converter that helps deciding which converter to actually use. -type ElementMatchConverter nsID extraState x - = IdXMLConverter nsID - extraState - (MaybeEConverter nsID extraState x, XML.Element) - type MaybeCConverter nsID moreState x = Maybe (IdXMLConverter nsID moreState (x, XML.Content)) @@ -862,26 +624,6 @@ type ContentMatchConverter nsID extraState x (MaybeCConverter nsID extraState x, XML.Content) -- Helper function: The @c@ is actually a converter that is to be selected by --- matching XML elements to the first two parameters. --- The fold used to match elements however is very simple, so to use it, --- this function wraps the converter in another converter that unifies --- the accumulator. Think of a lot of converters with the resulting type --- chained together. The accumulator not only transports the element --- unchanged to the next matcher, it also does the actual selecting by --- combining the intermediate results with '(<|>)'. -makeMatcherE :: (NameSpaceID nsID) - => nsID -> ElementName - -> FallibleXMLConverter nsID extraState a a - -> ElementMatchConverter nsID extraState a -makeMatcherE nsID name c = ( second ( - elemNameIs nsID name - >>^ bool Nothing (Just tryC) - ) - >>% (<|>) - ) &&&^ snd - where tryC = (fst ^&&& executeThere c >>% recover) &&&^ snd - --- Helper function: The @c@ is actually a converter that is to be selected by -- matching XML content to the first two parameters. -- The fold used to match elements however is very simple, so to use it, -- this function wraps the converter in another converter that unifies @@ -914,13 +656,6 @@ makeMatcherC nsID name c = ( second ( contentToElem _ -> failEmpty -- Creates and chains a bunch of matchers -prepareMatchersE :: (NameSpaceID nsID) - => [(nsID, ElementName, FallibleXMLConverter nsID extraState x x)] - -> ElementMatchConverter nsID extraState x ---prepareMatchersE = foldSs . (map $ uncurry3 makeMatcherE) -prepareMatchersE = reverseComposition . (map $ uncurry3 makeMatcherE) - --- Creates and chains a bunch of matchers prepareMatchersC :: (NameSpaceID nsID) => [(nsID, ElementName, FallibleXMLConverter nsID extraState x x)] -> ContentMatchConverter nsID extraState x @@ -928,52 +663,6 @@ prepareMatchersC :: (NameSpaceID nsID) prepareMatchersC = reverseComposition . (map $ uncurry3 makeMatcherC) -- | Takes a list of element-data - converter groups and --- * Finds all children of the current element --- * Matches each group to each child in order (at most one group per child) --- * Filters non-matched children --- * Chains all found converters in child-order --- * Applies the chain to the input element -matchChildren :: (NameSpaceID nsID) - => [(nsID, ElementName, FallibleXMLConverter nsID extraState a a)] - -> XMLConverter nsID extraState a a -matchChildren lookups = let matcher = prepareMatchersE lookups - in keepingTheValue ( - elChildren - >>> map (Nothing,) - ^>> iterateSL matcher - >>^ catMaybes.map (\(m,e) -> fmap (swallowElem e) m) - -- >>> foldSs - >>> reverseComposition - ) - >>> swap - ^>> app - where - -- let the converter swallow the element and drop the element - -- in the return value - swallowElem element converter = (,element) ^>> converter >>^ fst - --- -matchContent'' :: (NameSpaceID nsID) - => [(nsID, ElementName, FallibleXMLConverter nsID extraState a a)] - -> XMLConverter nsID extraState a a -matchContent'' lookups = let matcher = prepareMatchersC lookups - in keepingTheValue ( - elContent - >>> map (Nothing,) - ^>> iterateSL matcher - >>^ catMaybes.map (\(m,c) -> fmap (swallowContent c) m) - -- >>> foldSs - >>> reverseComposition - ) - >>> swap - ^>> app - where - -- let the converter swallow the content and drop the content - -- in the return value - swallowContent content converter = (,content) ^>> converter >>^ fst - - --- | Takes a list of element-data - converter groups and -- * Finds all content of the current element -- * Matches each group to each piece of content in order -- (at most one group per piece of content) @@ -1018,14 +707,6 @@ matchContent lookups fallback -- Internals -------------------------------------------------------------------------------- -stringToBool :: (Monoid failure) => String -> Either failure Bool -stringToBool val -- stringToBool' val >>> maybeToChoice - | val `elem` trueValues = succeedWith True - | val `elem` falseValues = succeedWith False - | otherwise = failEmpty - where trueValues = ["true" ,"on" ,"1"] - falseValues = ["false","off","0"] - stringToBool' :: String -> Maybe Bool stringToBool' val | val `elem` trueValues = Just True | val `elem` falseValues = Just False diff --git a/src/Text/Pandoc/Readers/Odt/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Namespaces.hs index deb009998..92e12931d 100644 --- a/src/Text/Pandoc/Readers/Odt/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Namespaces.hs @@ -31,11 +31,11 @@ Namespaces used in odt files. module Text.Pandoc.Readers.Odt.Namespaces ( Namespace (..) ) where -import Data.List ( isPrefixOf ) -import Data.Maybe ( fromMaybe, listToMaybe ) -import qualified Data.Map as M ( empty, insert ) +import Data.List (isPrefixOf) +import qualified Data.Map as M (empty, insert) +import Data.Maybe (fromMaybe, listToMaybe) -import Text.Pandoc.Readers.Odt.Generic.Namespaces +import Text.Pandoc.Readers.Odt.Generic.Namespaces instance NameSpaceID Namespace where @@ -48,7 +48,7 @@ instance NameSpaceID Namespace where findID :: NameSpaceIRI -> Maybe Namespace -findID iri = listToMaybe [nsID | (iri',~nsID) <- nsIDs, iri' `isPrefixOf` iri] +findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `isPrefixOf` iri] nsIDmap :: NameSpaceIRIs Namespace nsIDmap = foldr (uncurry $ flip M.insert) M.empty nsIDs diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 26ba6df82..58be8e4a3 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -1,9 +1,8 @@ -{-# LANGUAGE TupleSections #-} -{-# LANGUAGE PatternGuards #-} -{-# LANGUAGE ViewPatterns #-} -{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE Arrows #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE TupleSections #-} + {- Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> @@ -50,49 +49,36 @@ module Text.Pandoc.Readers.Odt.StyleReader , ListLevelType (..) , LengthOrPercent (..) , lookupStyle -, getTextProperty -, getTextProperty' -, getParaProperty -, getListStyle , getListLevelStyle , getStyleFamily -, lookupDefaultStyle , lookupDefaultStyle' , lookupListStyleByName -, getPropertyChain -, textPropertyChain -, stylePropertyChain -, stylePropertyChain' -, getStylePropertyChain , extendedStylePropertyChain -, extendedStylePropertyChain' -, liftStyles , readStylesAt ) where -import Control.Arrow -import Control.Applicative hiding ( liftA, liftA2, liftA3 ) +import Control.Applicative hiding (liftA, liftA2, liftA3) +import Control.Arrow -import qualified Data.Foldable as F -import qualified Data.Map as M -import qualified Data.Set as S -import Data.Char ( isDigit ) -import Data.Default -import Data.List ( unfoldr ) -import Data.Maybe +import Data.Char (isDigit) +import Data.Default +import qualified Data.Foldable as F +import Data.List (unfoldr) +import qualified Data.Map as M +import Data.Maybe +import qualified Data.Set as S -import qualified Text.XML.Light as XML +import qualified Text.XML.Light as XML -import Text.Pandoc.Readers.Odt.Arrows.State -import Text.Pandoc.Readers.Odt.Arrows.Utils +import Text.Pandoc.Readers.Odt.Arrows.Utils -import Text.Pandoc.Readers.Odt.Generic.Utils -import qualified Text.Pandoc.Readers.Odt.Generic.SetMap as SM -import Text.Pandoc.Readers.Odt.Generic.Fallible -import Text.Pandoc.Readers.Odt.Generic.XMLConverter +import Text.Pandoc.Readers.Odt.Generic.Fallible +import qualified Text.Pandoc.Readers.Odt.Generic.SetMap as SM +import Text.Pandoc.Readers.Odt.Generic.Utils +import Text.Pandoc.Readers.Odt.Generic.XMLConverter -import Text.Pandoc.Readers.Odt.Namespaces -import Text.Pandoc.Readers.Odt.Base +import Text.Pandoc.Readers.Odt.Base +import Text.Pandoc.Readers.Odt.Namespaces readStylesAt :: XML.Element -> Fallible Styles @@ -118,7 +104,7 @@ instance Lookupable FontPitch where instance Default FontPitch where def = PitchVariable --- The font pitch can be specifed in a style directly. Normally, however, +-- The font pitch can be specified in a style directly. Normally, however, -- it is defined in the font. That is also the specs' recommendation. -- -- Thus, we want @@ -145,13 +131,12 @@ type StyleReaderSafe a b = XMLReaderSafe FontPitches a b -- | A reader for font pitches fontPitchReader :: XMLReader _s _x FontPitches fontPitchReader = executeIn NsOffice "font-face-decls" ( - ( withEveryL NsStyle "font-face" $ liftAsSuccess ( + withEveryL NsStyle "font-face" (liftAsSuccess ( findAttr' NsStyle "name" &&& lookupDefaultingAttr NsStyle "font-pitch" - ) - ) - >>?^ ( M.fromList . (foldl accumLegalPitches []) ) + )) + >>?^ ( M.fromList . foldl accumLegalPitches [] ) ) where accumLegalPitches ls (Nothing,_) = ls accumLegalPitches ls (Just n,p) = (n,p):ls @@ -230,15 +215,15 @@ instance Lookupable StyleFamily where ] -- | A named style -data Style = Style { styleFamily :: Maybe StyleFamily - , styleParentName :: Maybe StyleName - , listStyle :: Maybe StyleName - , styleProperties :: StyleProperties +data Style = Style { styleFamily :: Maybe StyleFamily + , styleParentName :: Maybe StyleName + , listStyle :: Maybe StyleName + , styleProperties :: StyleProperties } deriving ( Eq, Show ) -data StyleProperties = SProps { textProperties :: Maybe TextProperties - , paraProperties :: Maybe ParaProperties +data StyleProperties = SProps { textProperties :: Maybe TextProperties + , paraProperties :: Maybe ParaProperties -- , tableColProperties :: Maybe TColProperties -- , tableRowProperties :: Maybe TRowProperties -- , tableCellProperties :: Maybe TCellProperties @@ -354,8 +339,8 @@ instance Read XslUnit where readsPrec _ "em" = [(XslUnitEM , "")] readsPrec _ _ = [] --- | Rough conversion of measures into millimeters. --- Pixels and em's are actually implemetation dependant/relative measures, +-- | Rough conversion of measures into millimetres. +-- Pixels and em's are actually implementation dependant/relative measures, -- so I could not really easily calculate anything exact here even if I wanted. -- But I do not care about exactness right now, as I only use measures -- to determine if a paragraph is "indented" or not. @@ -397,11 +382,11 @@ data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType instance Show ListLevelStyle where show ListLevelStyle{..} = "<LLS|" - ++ (show listLevelType) + ++ show listLevelType ++ "|" - ++ (maybeToString listItemPrefix) - ++ (show listItemFormat) - ++ (maybeToString listItemSuffix) + ++ maybeToString listItemPrefix + ++ show listItemFormat + ++ maybeToString listItemSuffix ++ ">" where maybeToString = fromMaybe "" @@ -439,7 +424,7 @@ instance Read ListItemNumberFormat where -------------------------------------------------------------------------------- -- Readers -- --- ...it seems like a whole lot of this should be automatically deriveable +-- ...it seems like a whole lot of this should be automatically derivable -- or at least moveable into a class. Most of this is data concealed in -- code. -------------------------------------------------------------------------------- @@ -497,14 +482,14 @@ readTextProperties = ( liftA6 PropT ( searchAttr NsXSL_FO "font-style" False isFontEmphasised ) ( searchAttr NsXSL_FO "font-weight" False isFontBold ) - ( findPitch ) + findPitch ( getAttr NsStyle "text-position" ) - ( readUnderlineMode ) - ( readStrikeThroughMode ) + readUnderlineMode + readStrikeThroughMode ) where isFontEmphasised = [("normal",False),("italic",True),("oblique",True)] isFontBold = ("normal",False):("bold",True) - :(map ((,True).show) ([100,200..900]::[Int])) + :map ((,True).show) ([100,200..900]::[Int]) readUnderlineMode :: StyleReaderSafe _x (Maybe UnderlineMode) readUnderlineMode = readLineMode "text-underline-mode" @@ -524,7 +509,7 @@ readLineMode modeAttr styleAttr = proc x -> do Nothing -> returnA -< Just UnderlineModeNormal else returnA -< Nothing where - isLinePresent = [("none",False)] ++ map (,True) + isLinePresent = ("none",False) : map (,True) [ "dash" , "dot-dash" , "dot-dot-dash" , "dotted" , "long-dash" , "solid" , "wave" ] @@ -561,20 +546,18 @@ readListStyle = findAttr NsStyle "name" >>?! keepingTheValue ( liftA ListStyle - $ ( liftA3 SM.union3 + $ liftA3 SM.union3 ( readListLevelStyles NsText "list-level-style-number" LltNumbered ) ( readListLevelStyles NsText "list-level-style-bullet" LltBullet ) - ( readListLevelStyles NsText "list-level-style-image" LltImage ) - ) >>^ M.mapMaybe chooseMostSpecificListLevelStyle + ( readListLevelStyles NsText "list-level-style-image" LltImage ) >>^ M.mapMaybe chooseMostSpecificListLevelStyle ) -- readListLevelStyles :: Namespace -> ElementName -> ListLevelType -> StyleReaderSafe _x (SM.SetMap Int ListLevelStyle) readListLevelStyles namespace elementName levelType = - ( tryAll namespace elementName (readListLevelStyle levelType) + tryAll namespace elementName (readListLevelStyle levelType) >>^ SM.fromList - ) -- readListLevelStyle :: ListLevelType -> StyleReader _x (Int, ListLevelStyle) @@ -624,20 +607,11 @@ lookupStyle :: StyleName -> Styles -> Maybe Style lookupStyle name Styles{..} = M.lookup name stylesByName -- -lookupDefaultStyle :: StyleFamily -> Styles -> StyleProperties -lookupDefaultStyle family Styles{..} = fromMaybe def - (M.lookup family defaultStyleMap) - --- lookupDefaultStyle' :: Styles -> StyleFamily -> StyleProperties lookupDefaultStyle' Styles{..} family = fromMaybe def (M.lookup family defaultStyleMap) -- -getListStyle :: Style -> Styles -> Maybe ListStyle -getListStyle Style{..} styles = listStyle >>= (`lookupListStyleByName` styles) - --- lookupListStyleByName :: StyleName -> Styles -> Maybe ListStyle lookupListStyleByName name Styles{..} = M.lookup name listStylesByName @@ -655,7 +629,7 @@ parents style styles = unfoldr findNextParent style -- Ha! getStyleFamily :: Style -> Styles -> Maybe StyleFamily getStyleFamily style@Style{..} styles = styleFamily - <|> (F.asum $ map (`getStyleFamily` styles) $ parents style styles) + <|> F.asum (map (`getStyleFamily` styles) $ parents style styles) -- | Each 'Style' has certain 'StyleProperties'. But sometimes not all property -- values are specified. Instead, a value might be inherited from a @@ -677,68 +651,7 @@ stylePropertyChain style styles -- extendedStylePropertyChain :: [Style] -> Styles -> [StyleProperties] extendedStylePropertyChain [] _ = [] -extendedStylePropertyChain [style] styles = (stylePropertyChain style styles) - ++ (maybeToList (fmap (lookupDefaultStyle' styles) (getStyleFamily style styles))) -extendedStylePropertyChain (style:trace) styles = (stylePropertyChain style styles) - ++ (extendedStylePropertyChain trace styles) --- Optimizable with Data.Sequence - --- -extendedStylePropertyChain' :: [Style] -> Styles -> Maybe [StyleProperties] -extendedStylePropertyChain' [] _ = Nothing -extendedStylePropertyChain' [style] styles = Just ( - (stylePropertyChain style styles) - ++ (maybeToList (fmap (lookupDefaultStyle' styles) (getStyleFamily style styles))) - ) -extendedStylePropertyChain' (style:trace) styles = fmap ((stylePropertyChain style styles) ++) - (extendedStylePropertyChain' trace styles) - --- -stylePropertyChain' :: Styles -> Style -> [StyleProperties] -stylePropertyChain' = flip stylePropertyChain - --- -getStylePropertyChain :: StyleName -> Styles -> [StyleProperties] -getStylePropertyChain name styles = maybe [] - (`stylePropertyChain` styles) - (lookupStyle name styles) - --- -getPropertyChain :: (StyleProperties -> Maybe a) -> Style -> Styles -> [a] -getPropertyChain extract style styles = catMaybes - $ map extract - $ stylePropertyChain style styles - --- -textPropertyChain :: Style -> Styles -> [TextProperties] -textPropertyChain = getPropertyChain textProperties - --- -paraPropertyChain :: Style -> Styles -> [ParaProperties] -paraPropertyChain = getPropertyChain paraProperties - --- -getTextProperty :: (TextProperties -> a) -> Style -> Styles -> Maybe a -getTextProperty extract style styles = fmap extract - $ listToMaybe - $ textPropertyChain style styles - --- -getTextProperty' :: (TextProperties -> Maybe a) -> Style -> Styles -> Maybe a -getTextProperty' extract style styles = F.asum - $ map extract - $ textPropertyChain style styles - --- -getParaProperty :: (ParaProperties -> a) -> Style -> Styles -> Maybe a -getParaProperty extract style styles = fmap extract - $ listToMaybe - $ paraPropertyChain style styles - --- | Lifts the reader into another readers' state. -liftStyles :: (OdtConverterState s -> OdtConverterState Styles) - -> (OdtConverterState Styles -> OdtConverterState s ) - -> XMLReader s x x -liftStyles extract inject = switchState extract inject - $ convertingExtraState M.empty readAllStyles - +extendedStylePropertyChain [style] styles = stylePropertyChain style styles + ++ maybeToList (fmap (lookupDefaultStyle' styles) (getStyleFamily style styles)) +extendedStylePropertyChain (style:trace) styles = stylePropertyChain style styles + ++ extendedStylePropertyChain trace styles diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 4e1c926da..292830bd2 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Org - Copyright : Copyright (C) 2014-2016 Albert Krewinkel + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -27,29 +27,42 @@ Conversion of org-mode formatted plain text to 'Pandoc' document. -} module Text.Pandoc.Readers.Org ( readOrg ) where -import Text.Pandoc.Readers.Org.Blocks ( blockList, meta ) -import Text.Pandoc.Readers.Org.Parsing ( OrgParser, readWithM ) -import Text.Pandoc.Readers.Org.ParserState ( optionsToParserState ) +import Text.Pandoc.Readers.Org.Blocks (blockList, meta) +import Text.Pandoc.Readers.Org.ParserState (optionsToParserState) +import Text.Pandoc.Readers.Org.Parsing (OrgParser, readWithM) -import Text.Pandoc.Definition -import Text.Pandoc.Error -import Text.Pandoc.Options +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Error +import Text.Pandoc.Options +import Text.Pandoc.Parsing (reportLogMessages) +import Text.Pandoc.Shared (crFilter) -import Control.Monad.Reader ( runReader ) +import Control.Monad.Except (throwError) +import Control.Monad.Reader (runReaderT) +import Data.Text (Text) +import qualified Data.Text as T -- | Parse org-mode string and return a Pandoc document. -readOrg :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError Pandoc -readOrg opts s = flip runReader def $ - readWithM parseOrg (optionsToParserState opts) (s ++ "\n\n") +readOrg :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readOrg opts s = do + parsed <- flip runReaderT def $ + readWithM parseOrg (optionsToParserState opts) + (T.unpack (crFilter s) ++ "\n\n") + case parsed of + Right result -> return result + Left _ -> throwError $ PandocParseError "problem parsing org" -- -- Parser -- -parseOrg :: OrgParser Pandoc +parseOrg :: PandocMonad m => OrgParser m Pandoc parseOrg = do blocks' <- blockList meta' <- meta + reportLogMessages return $ Pandoc meta' blocks' diff --git a/src/Text/Pandoc/Readers/Org/BlockStarts.hs b/src/Text/Pandoc/Readers/Org/BlockStarts.hs index b1004dda6..424102cb0 100644 --- a/src/Text/Pandoc/Readers/Org/BlockStarts.hs +++ b/src/Text/Pandoc/Readers/Org/BlockStarts.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,8 +17,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Org.Options - Copyright : Copyright (C) 2014-2016 Albert Krewinkel + Module : Text.Pandoc.Readers.Org.BlockStarts + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -40,11 +40,11 @@ module Text.Pandoc.Readers.Org.BlockStarts , endOfBlock ) where -import Control.Monad ( void ) +import Control.Monad (void) import Text.Pandoc.Readers.Org.Parsing -- | Horizontal Line (five -- dashes or more) -hline :: OrgParser () +hline :: Monad m => OrgParser m () hline = try $ do skipSpaces string "-----" @@ -54,58 +54,66 @@ hline = try $ do return () -- | Read the start of a header line, return the header level -headerStart :: OrgParser Int +headerStart :: Monad m => OrgParser m Int headerStart = try $ (length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos -tableStart :: OrgParser Char +tableStart :: Monad m => OrgParser m Char tableStart = try $ skipSpaces *> char '|' -latexEnvStart :: OrgParser String -latexEnvStart = try $ do +gridTableStart :: Monad m => OrgParser m () +gridTableStart = try $ skipSpaces <* char '+' <* char '-' + + +latexEnvStart :: Monad m => OrgParser m String +latexEnvStart = try $ skipSpaces *> string "\\begin{" *> latexEnvName <* string "}" <* blankline where - latexEnvName :: OrgParser String + latexEnvName :: Monad m => OrgParser m String latexEnvName = try $ mappend <$> many1 alphaNum <*> option "" (string "*") - --- | Parses bullet list marker. -bulletListStart :: OrgParser () -bulletListStart = try $ - choice - [ () <$ skipSpaces <* oneOf "+-" <* skipSpaces1 - , () <$ skipSpaces1 <* char '*' <* skipSpaces1 - ] - -genericListStart :: OrgParser String - -> OrgParser Int -genericListStart listMarker = try $ - (+) <$> (length <$> many spaceChar) - <*> (length <$> listMarker <* many1 spaceChar) - -orderedListStart :: OrgParser Int +bulletListStart :: Monad m => OrgParser m Int +bulletListStart = try $ do + ind <- length <$> many spaceChar + -- Unindented lists cannot use '*' bullets. + oneOf (if ind == 0 then "+-" else "*+-") + skipSpaces1 <|> lookAhead eol + return (ind + 1) + +genericListStart :: Monad m + => OrgParser m String + -> OrgParser m Int +genericListStart listMarker = try $ do + ind <- length <$> many spaceChar + void listMarker + skipSpaces1 <|> lookAhead eol + return (ind + 1) + +eol :: Monad m => OrgParser m () +eol = void (char '\n') + +orderedListStart :: Monad m => OrgParser m Int orderedListStart = genericListStart orderedListMarker -- Ordered list markers allowed in org-mode where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") -drawerStart :: OrgParser String -drawerStart = try $ - skipSpaces *> drawerName <* skipSpaces <* newline +drawerStart :: Monad m => OrgParser m String +drawerStart = try $ skipSpaces *> drawerName <* skipSpaces <* newline where drawerName = char ':' *> manyTill nonspaceChar (char ':') -metaLineStart :: OrgParser () +metaLineStart :: Monad m => OrgParser m () metaLineStart = try $ skipSpaces <* string "#+" -commentLineStart :: OrgParser () +commentLineStart :: Monad m => OrgParser m () commentLineStart = try $ skipSpaces <* string "# " -exampleLineStart :: OrgParser () +exampleLineStart :: Monad m => OrgParser m () exampleLineStart = () <$ try (skipSpaces *> string ": ") -noteMarker :: OrgParser String +noteMarker :: Monad m => OrgParser m String noteMarker = try $ do char '[' choice [ many1Till digit (char ']') @@ -114,17 +122,18 @@ noteMarker = try $ do ] -- | Succeeds if the parser is at the end of a block. -endOfBlock :: OrgParser () -endOfBlock = lookAhead . try $ do - void blankline <|> anyBlockStart +endOfBlock :: Monad m => OrgParser m () +endOfBlock = lookAhead . try $ + void blankline <|> anyBlockStart where -- Succeeds if there is a new block starting at this position. - anyBlockStart :: OrgParser () + anyBlockStart :: Monad m => OrgParser m () anyBlockStart = try . choice $ [ exampleLineStart , hline , metaLineStart , commentLineStart + , gridTableStart , void noteMarker , void tableStart , void drawerStart @@ -133,4 +142,3 @@ endOfBlock = lookAhead . try $ do , void bulletListStart , void orderedListStart ] - diff --git a/src/Text/Pandoc/Readers/Org/Blocks.hs b/src/Text/Pandoc/Readers/Org/Blocks.hs index 484d97482..fa016283c 100644 --- a/src/Text/Pandoc/Readers/Org/Blocks.hs +++ b/src/Text/Pandoc/Readers/Org/Blocks.hs @@ -1,8 +1,5 @@ -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE RecordWildCards #-} -{-# LANGUAGE ViewPatterns #-} {- -Copyright (C) 2014-2017 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,10 +15,11 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} - +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE RecordWildCards #-} {- | - Module : Text.Pandoc.Readers.Org.Options - Copyright : Copyright (C) 2014-2017 Albert Krewinkel + Module : Text.Pandoc.Readers.Org.Blocks + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -33,247 +31,61 @@ module Text.Pandoc.Readers.Org.Blocks , meta ) where -import Text.Pandoc.Readers.Org.BlockStarts -import Text.Pandoc.Readers.Org.Inlines -import Text.Pandoc.Readers.Org.Meta ( metaExport, metaKey, metaLine ) -import Text.Pandoc.Readers.Org.ParserState -import Text.Pandoc.Readers.Org.Parsing -import Text.Pandoc.Readers.Org.Shared - ( cleanLinkString, isImageFilename, rundocBlockClass - , toRundocAttrib, translateLang ) +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.DocumentTree (documentTree, headlineToBlocks) +import Text.Pandoc.Readers.Org.Inlines +import Text.Pandoc.Readers.Org.Meta (metaExport, metaKey, metaLine) +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Readers.Org.Shared (cleanLinkString, isImageFilename, + originalLang, translateLang) + +import Text.Pandoc.Builder (Blocks, Inlines) +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared (compactify, compactifyDL, safeRead) + +import Control.Monad (foldM, guard, mzero, void) +import Data.Char (isSpace, toLower, toUpper) +import Data.Default (Default) +import Data.List (foldl', isPrefixOf) +import Data.Maybe (fromMaybe, isJust, isNothing) +import Data.Monoid ((<>)) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder ( Inlines, Blocks ) -import Text.Pandoc.Definition -import Text.Pandoc.Options -import Text.Pandoc.Shared ( compactify', compactify'DL, safeRead ) - -import Control.Monad ( foldM, guard, mzero, void ) -import Data.Char ( isSpace, toLower, toUpper) -import Data.Default ( Default ) -import Data.List ( foldl', isPrefixOf ) -import Data.Maybe ( fromMaybe, isNothing ) -import Data.Monoid ((<>)) - --- --- Org headers --- -newtype Tag = Tag { fromTag :: String } - deriving (Show, Eq) - --- | Create a tag containing the given string. -toTag :: String -> Tag -toTag = Tag - --- | The key (also called name or type) of a property. -newtype PropertyKey = PropertyKey { fromKey :: String } - deriving (Show, Eq, Ord) - --- | Create a property key containing the given string. Org mode keys are --- case insensitive and are hence converted to lower case. -toPropertyKey :: String -> PropertyKey -toPropertyKey = PropertyKey . map toLower - --- | The value assigned to a property. -newtype PropertyValue = PropertyValue { fromValue :: String } - --- | Create a property value containing the given string. -toPropertyValue :: String -> PropertyValue -toPropertyValue = PropertyValue - --- | Check whether the property value is non-nil (i.e. truish). -isNonNil :: PropertyValue -> Bool -isNonNil p = map toLower (fromValue p) `notElem` ["()", "{}", "nil"] - --- | Key/value pairs from a PROPERTIES drawer -type Properties = [(PropertyKey, PropertyValue)] - --- | Org mode headline (i.e. a document subtree). -data Headline = Headline - { headlineLevel :: Int - , headlineTodoMarker :: Maybe TodoMarker - , headlineText :: Inlines - , headlineTags :: [Tag] - , headlineProperties :: Properties - , headlineContents :: Blocks - , headlineChildren :: [Headline] - } - --- --- Parsing headlines and subtrees --- - --- | Read an Org mode headline and its contents (i.e. a document subtree). --- @lvl@ gives the minimum acceptable level of the tree. -headline :: Int -> OrgParser (F Headline) -headline lvl = try $ do - level <- headerStart - guard (lvl <= level) - todoKw <- optionMaybe todoKeyword - title <- trimInlinesF . mconcat <$> manyTill inline endOfTitle - tags <- option [] headerTags - newline - properties <- option mempty propertiesDrawer - contents <- blocks - children <- many (headline (level + 1)) - return $ do - title' <- title - contents' <- contents - children' <- sequence children - return $ Headline - { headlineLevel = level - , headlineTodoMarker = todoKw - , headlineText = title' - , headlineTags = tags - , headlineProperties = properties - , headlineContents = contents' - , headlineChildren = children' - } - where - endOfTitle :: OrgParser () - endOfTitle = void . lookAhead $ optional headerTags *> newline - - headerTags :: OrgParser [Tag] - headerTags = try $ - let tag = many1 (alphaNum <|> oneOf "@%#_") <* char ':' - in map toTag <$> (skipSpaces *> char ':' *> many1 tag <* skipSpaces) - --- | Convert an Org mode headline (i.e. a document tree) into pandoc's Blocks -headlineToBlocks :: Headline -> OrgParser Blocks -headlineToBlocks hdln@(Headline {..}) = do - maxHeadlineLevels <- getExportSetting exportHeadlineLevels - case () of - _ | any isNoExportTag headlineTags -> return mempty - _ | any isArchiveTag headlineTags -> archivedHeadlineToBlocks hdln - _ | isCommentTitle headlineText -> return mempty - _ | headlineLevel >= maxHeadlineLevels -> headlineToHeaderWithList hdln - _ | otherwise -> headlineToHeaderWithContents hdln - -isNoExportTag :: Tag -> Bool -isNoExportTag = (== toTag "noexport") - -isArchiveTag :: Tag -> Bool -isArchiveTag = (== toTag "ARCHIVE") - --- | Check if the title starts with COMMENT. --- FIXME: This accesses builder internals not intended for use in situations --- like these. Replace once keyword parsing is supported. -isCommentTitle :: Inlines -> Bool -isCommentTitle (B.toList -> (Str "COMMENT":_)) = True -isCommentTitle _ = False - -archivedHeadlineToBlocks :: Headline -> OrgParser Blocks -archivedHeadlineToBlocks hdln = do - archivedTreesOption <- getExportSetting exportArchivedTrees - case archivedTreesOption of - ArchivedTreesNoExport -> return mempty - ArchivedTreesExport -> headlineToHeaderWithContents hdln - ArchivedTreesHeadlineOnly -> headlineToHeader hdln - -headlineToHeaderWithList :: Headline -> OrgParser Blocks -headlineToHeaderWithList hdln@(Headline {..}) = do - maxHeadlineLevels <- getExportSetting exportHeadlineLevels - header <- headlineToHeader hdln - listElements <- sequence (map headlineToBlocks headlineChildren) - let listBlock = if null listElements - then mempty - else B.orderedList listElements - let headerText = if maxHeadlineLevels == headlineLevel - then header - else flattenHeader header - return $ headerText <> headlineContents <> listBlock - where - flattenHeader :: Blocks -> Blocks - flattenHeader blks = - case B.toList blks of - (Header _ _ inlns:_) -> B.para (B.fromList inlns) - _ -> mempty - -headlineToHeaderWithContents :: Headline -> OrgParser Blocks -headlineToHeaderWithContents hdln@(Headline {..}) = do - header <- headlineToHeader hdln - childrenBlocks <- mconcat <$> sequence (map headlineToBlocks headlineChildren) - return $ header <> headlineContents <> childrenBlocks - -headlineToHeader :: Headline -> OrgParser Blocks -headlineToHeader (Headline {..}) = do - exportTodoKeyword <- getExportSetting exportWithTodoKeywords - let todoText = if exportTodoKeyword - then case headlineTodoMarker of - Just kw -> todoKeywordToInlines kw <> B.space - Nothing -> mempty - else mempty - let text = tagTitle (todoText <> headlineText) headlineTags - let propAttr = propertiesToAttr headlineProperties - attr <- registerHeader propAttr headlineText - return $ B.headerWith attr headlineLevel text - -todoKeyword :: OrgParser TodoMarker -todoKeyword = try $ do - taskStates <- activeTodoMarkers <$> getState - let kwParser tdm = try $ (tdm <$ string (todoMarkerName tdm) <* spaceChar) - choice (map kwParser taskStates) - -todoKeywordToInlines :: TodoMarker -> Inlines -todoKeywordToInlines tdm = - let todoText = todoMarkerName tdm - todoState = map toLower . show $ todoMarkerState tdm - classes = [todoState, todoText] - in B.spanWith (mempty, classes, mempty) (B.str todoText) - -propertiesToAttr :: Properties -> Attr -propertiesToAttr properties = - let - toStringPair prop = (fromKey (fst prop), fromValue (snd prop)) - customIdKey = toPropertyKey "custom_id" - classKey = toPropertyKey "class" - unnumberedKey = toPropertyKey "unnumbered" - specialProperties = [customIdKey, classKey, unnumberedKey] - id' = fromMaybe mempty . fmap fromValue . lookup customIdKey $ properties - cls = fromMaybe mempty . fmap fromValue . lookup classKey $ properties - kvs' = map toStringPair . filter ((`notElem` specialProperties) . fst) - $ properties - isUnnumbered = - fromMaybe False . fmap isNonNil . lookup unnumberedKey $ properties - in - (id', words cls ++ (if isUnnumbered then ["unnumbered"] else []), kvs') - -tagTitle :: Inlines -> [Tag] -> Inlines -tagTitle title tags = title <> (mconcat $ map tagToInline tags) - -tagToInline :: Tag -> Inlines -tagToInline t = B.spanWith ("", ["tag"], [("data-tag-name", fromTag t)]) mempty - +import qualified Text.Pandoc.Walk as Walk -- -- parsing blocks -- -- | Get a list of blocks. -blockList :: OrgParser [Block] +blockList :: PandocMonad m => OrgParser m [Block] blockList = do - initialBlocks <- blocks - headlines <- sequence <$> manyTill (headline 1) eof + headlines <- documentTree blocks inline st <- getState - headlineBlocks <- fmap mconcat . sequence . map headlineToBlocks $ runF headlines st - return . B.toList $ (runF initialBlocks st) <> headlineBlocks + headlineBlocks <- headlineToBlocks $ runF headlines st + -- ignore first headline, it's the document's title + return . drop 1 . B.toList $ headlineBlocks --- | Get the meta information safed in the state. -meta :: OrgParser Meta +-- | Get the meta information saved in the state. +meta :: Monad m => OrgParser m Meta meta = do meta' <- metaExport runF meta' <$> getState -blocks :: OrgParser (F Blocks) +blocks :: PandocMonad m => OrgParser m (F Blocks) blocks = mconcat <$> manyTill block (void (lookAhead headerStart) <|> eof) -block :: OrgParser (F Blocks) +block :: PandocMonad m => OrgParser m (F Blocks) block = choice [ mempty <$ blanklines , table , orgBlock , figure , example , genericDrawer + , include , specialLine , horizontalRule , list @@ -283,6 +95,11 @@ block = choice [ mempty <$ blanklines ] <?> "block" +-- | Parse a horizontal rule into a block element +horizontalRule :: Monad m => OrgParser m (F Blocks) +horizontalRule = return B.horizontalRule <$ try hline + + -- -- Block Attributes -- @@ -297,7 +114,7 @@ data BlockAttributes = BlockAttributes -- | Convert BlockAttributes into pandoc Attr attrFromBlockAttributes :: BlockAttributes -> Attr -attrFromBlockAttributes (BlockAttributes{..}) = +attrFromBlockAttributes BlockAttributes{..} = let ident = fromMaybe mempty $ lookup "id" blockAttrKeyValues classes = case lookup "class" blockAttrKeyValues of @@ -306,18 +123,18 @@ attrFromBlockAttributes (BlockAttributes{..}) = kv = filter ((`notElem` ["id", "class"]) . fst) blockAttrKeyValues in (ident, classes, kv) -stringyMetaAttribute :: (String -> Bool) -> OrgParser (String, String) -stringyMetaAttribute attrCheck = try $ do +stringyMetaAttribute :: Monad m => OrgParser m (String, String) +stringyMetaAttribute = try $ do metaLineStart attrName <- map toUpper <$> many1Till nonspaceChar (char ':') - guard $ attrCheck attrName skipSpaces - attrValue <- anyLine + attrValue <- anyLine <|> ("" <$ newline) return (attrName, attrValue) -blockAttributes :: OrgParser BlockAttributes +blockAttributes :: PandocMonad m => OrgParser m BlockAttributes blockAttributes = try $ do - kv <- many (stringyMetaAttribute attrCheck) + kv <- many stringyMetaAttribute + guard $ all (attrCheck . fst) kv let caption = foldl' (appendValues "CAPTION") Nothing kv let kvAttrs = foldl' (appendValues "ATTR_HTML") Nothing kv let name = lookup "NAME" kv @@ -326,7 +143,7 @@ blockAttributes = try $ do Nothing -> return Nothing Just s -> Just <$> parseFromString inlines (s ++ "\n") kvAttrs' <- parseFromString keyValues . (++ "\n") $ fromMaybe mempty kvAttrs - return $ BlockAttributes + return BlockAttributes { blockAttrName = name , blockAttrLabel = label , blockAttrCaption = caption' @@ -334,13 +151,7 @@ blockAttributes = try $ do } where attrCheck :: String -> Bool - attrCheck attr = - case attr of - "NAME" -> True - "LABEL" -> True - "CAPTION" -> True - "ATTR_HTML" -> True - _ -> False + attrCheck x = x `elem` ["NAME", "LABEL", "CAPTION", "ATTR_HTML", "RESULTS"] appendValues :: String -> Maybe String -> (String, String) -> Maybe String appendValues attrName accValue (key, value) = @@ -350,17 +161,18 @@ blockAttributes = try $ do Just acc -> Just $ acc ++ ' ':value Nothing -> Just value -keyValues :: OrgParser [(String, String)] +-- | Parse key-value pairs for HTML attributes +keyValues :: Monad m => OrgParser m [(String, String)] keyValues = try $ manyTill ((,) <$> key <*> value) newline where - key :: OrgParser String + key :: Monad m => OrgParser m String key = try $ skipSpaces *> char ':' *> many1 nonspaceChar - value :: OrgParser String + value :: Monad m => OrgParser m String value = skipSpaces *> manyTill anyChar endOfValue - endOfValue :: OrgParser () + endOfValue :: Monad m => OrgParser m () endOfValue = lookAhead $ (() <$ try (many1 spaceChar <* key)) <|> () <$ newline @@ -371,12 +183,12 @@ keyValues = try $ -- -- | Read an org-mode block delimited by #+BEGIN_TYPE and #+END_TYPE. -orgBlock :: OrgParser (F Blocks) +orgBlock :: PandocMonad m => OrgParser m (F Blocks) orgBlock = try $ do blockAttrs <- blockAttributes blkType <- blockHeaderStart ($ blkType) $ - case (map toLower blkType) of + case map toLower blkType of "export" -> exportBlock "comment" -> rawBlockLines (const mempty) "html" -> rawBlockLines (return . B.rawBlock (lowercase blkType)) @@ -390,25 +202,25 @@ orgBlock = try $ do let (ident, classes, kv) = attrFromBlockAttributes blockAttrs in fmap $ B.divWith (ident, classes ++ [blkType], kv) where - blockHeaderStart :: OrgParser String + blockHeaderStart :: Monad m => OrgParser m String blockHeaderStart = try $ skipSpaces *> stringAnyCase "#+begin_" *> orgArgWord lowercase :: String -> String lowercase = map toLower -rawBlockLines :: (String -> F Blocks) -> String -> OrgParser (F Blocks) -rawBlockLines f blockType = (ignHeaders *> (f <$> rawBlockContent blockType)) +rawBlockLines :: Monad m => (String -> F Blocks) -> String -> OrgParser m (F Blocks) +rawBlockLines f blockType = ignHeaders *> (f <$> rawBlockContent blockType) -parseBlockLines :: (F Blocks -> F Blocks) -> String -> OrgParser (F Blocks) -parseBlockLines f blockType = (ignHeaders *> (f <$> parsedBlockContent)) +parseBlockLines :: PandocMonad m => (F Blocks -> F Blocks) -> String -> OrgParser m (F Blocks) +parseBlockLines f blockType = ignHeaders *> (f <$> parsedBlockContent) where - parsedBlockContent :: OrgParser (F Blocks) + parsedBlockContent :: PandocMonad m => OrgParser m (F Blocks) parsedBlockContent = try $ do raw <- rawBlockContent blockType parseFromString blocks (raw ++ "\n") -- | Read the raw string content of a block -rawBlockContent :: String -> OrgParser String +rawBlockContent :: Monad m => String -> OrgParser m String rawBlockContent blockType = try $ do blkLines <- manyTill rawLine blockEnder tabLen <- getOption readerTabStop @@ -418,18 +230,17 @@ rawBlockContent blockType = try $ do . map (tabsToSpaces tabLen . commaEscaped) $ blkLines where - rawLine :: OrgParser String + rawLine :: Monad m => OrgParser m String rawLine = try $ ("" <$ blankline) <|> anyLine - blockEnder :: OrgParser () + blockEnder :: Monad m => OrgParser m () blockEnder = try $ skipSpaces <* stringAnyCase ("#+end_" <> blockType) stripIndent :: [String] -> [String] stripIndent strs = map (drop (shortestIndent strs)) strs shortestIndent :: [String] -> Int - shortestIndent = foldr min maxBound - . map (length . takeWhile isSpace) + shortestIndent = foldr (min . length . takeWhile isSpace) maxBound . filter (not . null) tabsToSpaces :: Int -> String -> String @@ -437,7 +248,7 @@ rawBlockContent blockType = try $ do tabsToSpaces tabLen cs'@(c:cs) = case c of ' ' -> ' ':tabsToSpaces tabLen cs - '\t' -> (take tabLen $ repeat ' ') ++ tabsToSpaces tabLen cs + '\t' -> replicate tabLen ' ' ++ tabsToSpaces tabLen cs _ -> cs' commaEscaped :: String -> String @@ -448,18 +259,18 @@ rawBlockContent blockType = try $ do commaEscaped cs = cs -- | Read but ignore all remaining block headers. -ignHeaders :: OrgParser () +ignHeaders :: Monad m => OrgParser m () ignHeaders = (() <$ newline) <|> (() <$ anyLine) -- | Read a block containing code intended for export in specific backends -- only. -exportBlock :: String -> OrgParser (F Blocks) +exportBlock :: Monad m => String -> OrgParser m (F Blocks) exportBlock blockType = try $ do exportType <- skipSpaces *> orgArgWord <* ignHeaders contents <- rawBlockContent blockType returnF (B.rawBlock (map toLower exportType) contents) -verseBlock :: String -> OrgParser (F Blocks) +verseBlock :: PandocMonad m => String -> OrgParser m (F Blocks) verseBlock blockType = try $ do ignHeaders content <- rawBlockContent blockType @@ -468,7 +279,7 @@ verseBlock blockType = try $ do where -- replace initial spaces with nonbreaking spaces to preserve -- indentation, parse the rest as normal inline - parseVerseLine :: String -> OrgParser (F Inlines) + parseVerseLine :: PandocMonad m => String -> OrgParser m (F Inlines) parseVerseLine cs = do let (initialSpaces, indentedLine) = span isSpace cs let nbspIndent = if null initialSpaces @@ -480,23 +291,20 @@ verseBlock blockType = try $ do -- | Read a code block and the associated results block if present. Which of -- boths blocks is included in the output is determined using the "exports" -- argument in the block header. -codeBlock :: BlockAttributes -> String -> OrgParser (F Blocks) +codeBlock :: PandocMonad m => BlockAttributes -> String -> OrgParser m (F Blocks) codeBlock blockAttrs blockType = do skipSpaces (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) content <- rawBlockContent blockType - resultsContent <- trailingResultsBlock + resultsContent <- option mempty babelResultsBlock let id' = fromMaybe mempty $ blockAttrName blockAttrs - let includeCode = exportsCode kv - let includeResults = exportsResults kv let codeBlck = B.codeBlockWith ( id', classes, kv ) content let labelledBlck = maybe (pure codeBlck) (labelDiv codeBlck) (blockAttrCaption blockAttrs) - let resultBlck = fromMaybe mempty resultsContent return $ - (if includeCode then labelledBlck else mempty) <> - (if includeResults then resultBlck else mempty) + (if exportsCode kv then labelledBlck else mempty) <> + (if exportsResults kv then resultsContent else mempty) where labelDiv :: Blocks -> F Inlines -> F Blocks labelDiv blk value = @@ -505,60 +313,97 @@ codeBlock blockAttrs blockType = do labelledBlock :: F Inlines -> F Blocks labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) -exportsCode :: [(String, String)] -> Bool -exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs - || ("rundoc-exports", "results") `elem` attrs) + exportsCode :: [(String, String)] -> Bool + exportsCode = maybe True (`elem` ["code", "both"]) . lookup "exports" -exportsResults :: [(String, String)] -> Bool -exportsResults attrs = ("rundoc-exports", "results") `elem` attrs - || ("rundoc-exports", "both") `elem` attrs + exportsResults :: [(String, String)] -> Bool + exportsResults = maybe False (`elem` ["results", "both"]) . lookup "exports" -trailingResultsBlock :: OrgParser (Maybe (F Blocks)) -trailingResultsBlock = optionMaybe . try $ do +-- | Parse the result of an evaluated babel code block. +babelResultsBlock :: PandocMonad m => OrgParser m (F Blocks) +babelResultsBlock = try $ do blanklines - stringAnyCase "#+RESULTS:" - blankline + resultsMarker <|> + (lookAhead . void . try $ + manyTill (metaLineStart *> anyLineNewline) resultsMarker) block + where + resultsMarker = try . void $ stringAnyCase "#+RESULTS:" *> blankline -- | Parse code block arguments --- TODO: We currently don't handle switches. -codeHeaderArgs :: OrgParser ([String], [(String, String)]) +codeHeaderArgs :: Monad m => OrgParser m ([String], [(String, String)]) codeHeaderArgs = try $ do language <- skipSpaces *> orgArgWord - _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar)) + (switchClasses, switchKv) <- switchesAsAttributes parameters <- manyTill blockOption newline - let pandocLang = translateLang language - return $ - if hasRundocParameters parameters - then ( [ pandocLang, rundocBlockClass ] - , map toRundocAttrib (("language", language) : parameters) + return ( translateLang language : switchClasses + , originalLang language <> switchKv <> parameters ) - else ([ pandocLang ], parameters) - where - hasRundocParameters = not . null -switch :: OrgParser (Char, Maybe String) -switch = try $ simpleSwitch <|> lineNumbersSwitch +switchesAsAttributes :: Monad m => OrgParser m ([String], [(String, String)]) +switchesAsAttributes = try $ do + switches <- skipSpaces *> try (switch `sepBy` many1 spaceChar) + return $ foldr addToAttr ([], []) switches where - simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter) - lineNumbersSwitch = (\ls -> ('l', Just ls)) <$> - (string "-l \"" *> many1Till nonspaceChar (char '"')) + addToAttr :: (Char, Maybe String, SwitchPolarity) + -> ([String], [(String, String)]) + -> ([String], [(String, String)]) + addToAttr ('n', lineNum, pol) (cls, kv) = + let kv' = case lineNum of + Just num -> ("startFrom", num):kv + Nothing -> kv + cls' = case pol of + SwitchPlus -> "continuedSourceBlock":cls + SwitchMinus -> cls + in ("numberLines":cls', kv') + addToAttr _ x = x + +-- | Whether a switch flag is specified with @+@ or @-@. +data SwitchPolarity = SwitchPlus | SwitchMinus + deriving (Show, Eq) + +-- | Parses a switch's polarity. +switchPolarity :: Monad m => OrgParser m SwitchPolarity +switchPolarity = (SwitchMinus <$ char '-') <|> (SwitchPlus <$ char '+') -blockOption :: OrgParser (String, String) +-- | Parses a source block switch option. +switch :: Monad m => OrgParser m (Char, Maybe String, SwitchPolarity) +switch = try $ lineNumberSwitch <|> labelSwitch <|> simpleSwitch + where + simpleSwitch = (\pol c -> (c, Nothing, pol)) <$> switchPolarity <*> letter + labelSwitch = genericSwitch 'l' $ + char '"' *> many1Till nonspaceChar (char '"') + +-- | Generic source block switch-option parser. +genericSwitch :: Monad m + => Char + -> OrgParser m String + -> OrgParser m (Char, Maybe String, SwitchPolarity) +genericSwitch c p = try $ do + polarity <- switchPolarity <* char c <* skipSpaces + arg <- optionMaybe p + return (c, arg, polarity) + +-- | Reads a line number switch option. The line number switch can be used with +-- example and source blocks. +lineNumberSwitch :: Monad m => OrgParser m (Char, Maybe String, SwitchPolarity) +lineNumberSwitch = genericSwitch 'n' (many digit) + +blockOption :: Monad m => OrgParser m (String, String) blockOption = try $ do argKey <- orgArgKey paramValue <- option "yes" orgParamValue return (argKey, paramValue) -orgParamValue :: OrgParser String +orgParamValue :: Monad m => OrgParser m String orgParamValue = try $ skipSpaces - *> notFollowedBy (char ':' ) - *> many1 nonspaceChar + *> notFollowedBy orgArgKey + *> noneOf "\n\r" `many1Till` endOfValue <* skipSpaces - -horizontalRule :: OrgParser (F Blocks) -horizontalRule = return B.horizontalRule <$ try hline + where + endOfValue = lookAhead $ try (skipSpaces <* oneOf "\n\r") + <|> try (skipSpaces1 <* orgArgKey) -- @@ -568,7 +413,7 @@ horizontalRule = return B.horizontalRule <$ try hline -- | A generic drawer which has no special meaning for org-mode. -- Whether or not this drawer is included in the output depends on the drawers -- export setting. -genericDrawer :: OrgParser (F Blocks) +genericDrawer :: PandocMonad m => OrgParser m (F Blocks) genericDrawer = try $ do name <- map toUpper <$> drawerStart content <- manyTill drawerLine (try drawerEnd) @@ -576,44 +421,25 @@ genericDrawer = try $ do -- Include drawer if it is explicitly included in or not explicitly excluded -- from the list of drawers that should be exported. PROPERTIES drawers are -- never exported. - case (exportDrawers . orgStateExportSettings $ state) of + case exportDrawers . orgStateExportSettings $ state of _ | name == "PROPERTIES" -> return mempty Left names | name `elem` names -> return mempty Right names | name `notElem` names -> return mempty - _ -> drawerDiv name <$> parseLines content + _ -> drawerDiv name <$> parseLines content where - parseLines :: [String] -> OrgParser (F Blocks) + parseLines :: PandocMonad m => [String] -> OrgParser m (F Blocks) parseLines = parseFromString blocks . (++ "\n") . unlines drawerDiv :: String -> F Blocks -> F Blocks drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty) -drawerLine :: OrgParser String +drawerLine :: Monad m => OrgParser m String drawerLine = anyLine -drawerEnd :: OrgParser String +drawerEnd :: Monad m => OrgParser m String drawerEnd = try $ skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* newline --- | Read a :PROPERTIES: drawer and return the key/value pairs contained --- within. -propertiesDrawer :: OrgParser Properties -propertiesDrawer = try $ do - drawerType <- drawerStart - guard $ map toUpper drawerType == "PROPERTIES" - manyTill property (try drawerEnd) - where - property :: OrgParser (PropertyKey, PropertyValue) - property = try $ (,) <$> key <*> value - - key :: OrgParser PropertyKey - key = fmap toPropertyKey . try $ - skipSpaces *> char ':' *> many1Till nonspaceChar (char ':') - - value :: OrgParser PropertyValue - value = fmap toPropertyValue . try $ - skipSpaces *> manyTill anyChar (try $ skipSpaces *> newline) - -- -- Figures @@ -621,7 +447,7 @@ propertiesDrawer = try $ do -- | Figures or an image paragraph (i.e. an image on a line by itself). Only -- images with a caption attribute are interpreted as figures. -figure :: OrgParser (F Blocks) +figure :: PandocMonad m => OrgParser m (F Blocks) figure = try $ do figAttrs <- blockAttributes src <- skipSpaces *> selfTarget <* skipSpaces <* endOfParagraph @@ -629,10 +455,10 @@ figure = try $ do Nothing -> mzero Just imgSrc -> do guard (isImageFilename imgSrc) - let isFigure = not . isNothing $ blockAttrCaption figAttrs + let isFigure = isJust $ blockAttrCaption figAttrs return $ imageBlock isFigure figAttrs imgSrc where - selfTarget :: OrgParser String + selfTarget :: PandocMonad m => OrgParser m String selfTarget = try $ char '[' *> linkTarget <* char ']' imageBlock :: Bool -> BlockAttributes -> String -> F Blocks @@ -654,7 +480,7 @@ figure = try $ do else "fig:" ++ cs -- | Succeeds if looking at the end of the current paragraph -endOfParagraph :: OrgParser () +endOfParagraph :: Monad m => OrgParser m () endOfParagraph = try $ skipSpaces *> newline *> endOfBlock @@ -663,11 +489,10 @@ endOfParagraph = try $ skipSpaces *> newline *> endOfBlock -- -- | Example code marked up by a leading colon. -example :: OrgParser (F Blocks) -example = try $ do - return . return . exampleCode =<< unlines <$> many1 exampleLine +example :: Monad m => OrgParser m (F Blocks) +example = try $ returnF . exampleCode =<< unlines <$> many1 exampleLine where - exampleLine :: OrgParser String + exampleLine :: Monad m => OrgParser m String exampleLine = try $ exampleLineStart *> anyLine exampleCode :: String -> Blocks @@ -678,10 +503,59 @@ exampleCode = B.codeBlockWith ("", ["example"], []) -- Comments, Options and Metadata -- -specialLine :: OrgParser (F Blocks) +specialLine :: PandocMonad m => OrgParser m (F Blocks) specialLine = fmap return . try $ rawExportLine <|> metaLine <|> commentLine -rawExportLine :: OrgParser Blocks +-- | Include the content of a file. +include :: PandocMonad m => OrgParser m (F Blocks) +include = try $ do + metaLineStart <* stringAnyCase "include:" <* skipSpaces + filename <- includeTarget + includeArgs <- many (try $ skipSpaces *> many1 alphaNum) + params <- keyValues + blocksParser <- case includeArgs of + ("example" : _) -> return $ pure . B.codeBlock <$> parseRaw + ["export"] -> return . returnF $ B.fromList [] + ["export", format] -> return $ pure . B.rawBlock format <$> parseRaw + ("src" : rest) -> do + let attr = case rest of + [lang] -> (mempty, [lang], mempty) + _ -> nullAttr + return $ pure . B.codeBlockWith attr <$> parseRaw + _ -> return $ return . B.fromList . blockFilter params <$> blockList + insertIncludedFileF blocksParser ["."] filename + where + includeTarget :: PandocMonad m => OrgParser m FilePath + includeTarget = do + char '"' + manyTill (noneOf "\n\r\t") (char '"') + + parseRaw :: PandocMonad m => OrgParser m String + parseRaw = many anyChar + + blockFilter :: [(String, String)] -> [Block] -> [Block] + blockFilter params blks = + let minlvl = lookup "minlevel" params + in case (minlvl >>= safeRead :: Maybe Int) of + Nothing -> blks + Just lvl -> let levels = Walk.query headerLevel blks + -- CAVE: partial function in else + curMin = if null levels then 0 else minimum levels + in Walk.walk (shiftHeader (curMin - lvl)) blks + + headerLevel :: Block -> [Int] + headerLevel (Header lvl _attr _content) = [lvl] + headerLevel _ = [] + + shiftHeader :: Int -> Block -> Block + shiftHeader shift blk = + if shift <= 0 + then blk + else case blk of + (Header lvl attr content) -> Header (lvl - shift) attr content + _ -> blk + +rawExportLine :: PandocMonad m => OrgParser m Blocks rawExportLine = try $ do metaLineStart key <- metaKey @@ -689,7 +563,7 @@ rawExportLine = try $ do then B.rawBlock key <$> anyLine else mzero -commentLine :: OrgParser Blocks +commentLine :: Monad m => OrgParser m Blocks commentLine = commentLineStart *> anyLine *> pure mempty @@ -714,12 +588,21 @@ data OrgTableRow = OrgContentRow (F [Blocks]) -- should be generated using a builder function. data OrgTable = OrgTable { orgTableColumnProperties :: [ColumnProperty] - , orgTableHeader :: [Blocks] - , orgTableRows :: [[Blocks]] + , orgTableHeader :: [Blocks] + , orgTableRows :: [[Blocks]] } -table :: OrgParser (F Blocks) -table = try $ do +table :: PandocMonad m => OrgParser m (F Blocks) +table = gridTableWith blocks True <|> orgTable + +-- | A normal org table +orgTable :: PandocMonad m => OrgParser m (F Blocks) +orgTable = try $ do + -- don't allow a table on the first line of a list item; org requires that + -- tables start at first non-space character on the line + let isFirstInListItem st = orgStateParserContext st == ListItemState && + isNothing (orgStateLastPreCharPos st) + guard =<< not . isFirstInListItem <$> getState blockAttrs <- blockAttributes lookAhead tableStart do @@ -731,7 +614,7 @@ orgToPandocTable :: OrgTable -> Inlines -> Blocks orgToPandocTable (OrgTable colProps heads lns) caption = - let totalWidth = if any (not . isNothing) (map columnRelWidth colProps) + let totalWidth = if any isJust (map columnRelWidth colProps) then Just . sum $ map (fromMaybe 1 . columnRelWidth) colProps else Nothing in B.table caption (map (convertColProp totalWidth) colProps) heads lns @@ -741,22 +624,22 @@ orgToPandocTable (OrgTable colProps heads lns) caption = let align' = fromMaybe AlignDefault $ columnAlignment colProp width' = fromMaybe 0 $ (\w t -> (fromIntegral w / fromIntegral t)) - <$> (columnRelWidth colProp) + <$> columnRelWidth colProp <*> totalWidth in (align', width') -tableRows :: OrgParser [OrgTableRow] +tableRows :: PandocMonad m => OrgParser m [OrgTableRow] tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) -tableContentRow :: OrgParser OrgTableRow +tableContentRow :: PandocMonad m => OrgParser m OrgTableRow tableContentRow = try $ OrgContentRow . sequence <$> (tableStart *> many1Till tableContentCell newline) -tableContentCell :: OrgParser (F Blocks) +tableContentCell :: PandocMonad m => OrgParser m (F Blocks) tableContentCell = try $ fmap B.plain . trimInlinesF . mconcat <$> manyTill inline endOfCell -tableAlignRow :: OrgParser OrgTableRow +tableAlignRow :: Monad m => OrgParser m OrgTableRow tableAlignRow = try $ do tableStart colProps <- many1Till columnPropertyCell newline @@ -764,10 +647,10 @@ tableAlignRow = try $ do guard $ any (/= def) colProps return $ OrgAlignRow colProps -columnPropertyCell :: OrgParser ColumnProperty +columnPropertyCell :: Monad m => OrgParser m ColumnProperty columnPropertyCell = emptyCell <|> propCell <?> "alignment info" where - emptyCell = ColumnProperty Nothing Nothing <$ (try $ skipSpaces *> endOfCell) + emptyCell = ColumnProperty Nothing Nothing <$ try (skipSpaces *> endOfCell) propCell = try $ ColumnProperty <$> (skipSpaces *> char '<' @@ -776,18 +659,18 @@ columnPropertyCell = emptyCell <|> propCell <?> "alignment info" <* char '>' <* emptyCell) -tableAlignFromChar :: OrgParser Alignment +tableAlignFromChar :: Monad m => OrgParser m Alignment tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft , char 'c' *> return AlignCenter , char 'r' *> return AlignRight ] -tableHline :: OrgParser OrgTableRow +tableHline :: Monad m => OrgParser m OrgTableRow tableHline = try $ OrgHlineRow <$ (tableStart *> char '-' *> anyLine) -endOfCell :: OrgParser Char +endOfCell :: Monad m => OrgParser m Char endOfCell = try $ char '|' <|> lookAhead newline rowsToTable :: [OrgTableRow] @@ -813,45 +696,45 @@ normalizeTable (OrgTable colProps heads rows) = rowToContent :: OrgTable -> OrgTableRow -> F OrgTable -rowToContent orgTable row = +rowToContent tbl row = case row of OrgHlineRow -> return singleRowPromotedToHeader OrgAlignRow props -> return . setProperties $ props OrgContentRow cs -> appendToBody cs where singleRowPromotedToHeader :: OrgTable - singleRowPromotedToHeader = case orgTable of - OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> - orgTable{ orgTableHeader = b , orgTableRows = [] } - _ -> orgTable + singleRowPromotedToHeader = case tbl of + OrgTable{ orgTableHeader = [], orgTableRows = [b] } -> + tbl{ orgTableHeader = b , orgTableRows = [] } + _ -> tbl setProperties :: [ColumnProperty] -> OrgTable - setProperties ps = orgTable{ orgTableColumnProperties = ps } + setProperties ps = tbl{ orgTableColumnProperties = ps } appendToBody :: F [Blocks] -> F OrgTable appendToBody frow = do newRow <- frow - let oldRows = orgTableRows orgTable + let oldRows = orgTableRows tbl -- NOTE: This is an inefficient O(n) operation. This should be changed -- if performance ever becomes a problem. - return orgTable{ orgTableRows = oldRows ++ [newRow] } + return tbl{ orgTableRows = oldRows ++ [newRow] } -- -- LaTeX fragments -- -latexFragment :: OrgParser (F Blocks) +latexFragment :: Monad m => OrgParser m (F Blocks) latexFragment = try $ do envName <- latexEnvStart content <- mconcat <$> manyTill anyLineNewline (latexEnd envName) - return . return $ B.rawBlock "latex" (content `inLatexEnv` envName) + returnF $ B.rawBlock "latex" (content `inLatexEnv` envName) where c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n" , c , "\\end{", e, "}\n" ] -latexEnd :: String -> OrgParser () +latexEnd :: Monad m => String -> OrgParser m () latexEnd envName = try $ () <$ skipSpaces <* string ("\\end{" ++ envName ++ "}") @@ -861,74 +744,70 @@ latexEnd envName = try $ -- -- Footnote defintions -- -noteBlock :: OrgParser (F Blocks) +noteBlock :: PandocMonad m => OrgParser m (F Blocks) noteBlock = try $ do - ref <- noteMarker <* skipSpaces - content <- mconcat <$> blocksTillHeaderOrNote + ref <- noteMarker <* skipSpaces <* updateLastPreCharPos + content <- mconcat <$> many1Till block endOfFootnote addToNotesTable (ref, content) return mempty where - blocksTillHeaderOrNote = - many1Till block (eof <|> () <$ lookAhead noteMarker - <|> () <$ lookAhead headerStart) + endOfFootnote = eof + <|> () <$ lookAhead noteMarker + <|> () <$ lookAhead headerStart + <|> () <$ lookAhead (try $ blankline *> blankline) -- Paragraphs or Plain text -paraOrPlain :: OrgParser (F Blocks) +paraOrPlain :: PandocMonad m => OrgParser m (F Blocks) paraOrPlain = try $ do -- Make sure we are not looking at a headline - notFollowedBy' (char '*' *> (oneOf " *")) + notFollowedBy' headerStart ils <- inlines nl <- option False (newline *> return True) -- Read block as paragraph, except if we are in a list context and the block -- is directly followed by a list item, in which case the block is read as -- plain text. try (guard nl - *> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart)) + *> notFollowedBy (inList *> (orderedListStart <|> bulletListStart)) *> return (B.para <$> ils)) - <|> (return (B.plain <$> ils)) + <|> return (B.plain <$> ils) -- -- list blocks -- -list :: OrgParser (F Blocks) +list :: PandocMonad m => OrgParser m (F Blocks) list = choice [ definitionList, bulletList, orderedList ] <?> "list" -definitionList :: OrgParser (F Blocks) -definitionList = try $ do n <- lookAhead (bulletListStart' Nothing) - fmap B.definitionList . fmap compactify'DL . sequence - <$> many1 (definitionListItem $ bulletListStart' (Just n)) - -bulletList :: OrgParser (F Blocks) -bulletList = try $ do n <- lookAhead (bulletListStart' Nothing) - fmap B.bulletList . fmap compactify' . sequence - <$> many1 (listItem (bulletListStart' $ Just n)) - -orderedList :: OrgParser (F Blocks) -orderedList = fmap B.orderedList . fmap compactify' . sequence - <$> many1 (listItem orderedListStart) - -bulletListStart' :: Maybe Int -> OrgParser Int --- returns length of bulletList prefix, inclusive of marker -bulletListStart' Nothing = do ind <- length <$> many spaceChar - oneOf (bullets $ ind == 0) - skipSpaces1 - return (ind + 1) -bulletListStart' (Just n) = do count (n-1) spaceChar - oneOf (bullets $ n == 1) - many1 spaceChar - return n - --- Unindented lists are legal, but they can't use '*' bullets. --- We return n to maintain compatibility with the generic listItem. -bullets :: Bool -> String -bullets unindented = if unindented then "+-" else "*+-" - -definitionListItem :: OrgParser Int - -> OrgParser (F (Inlines, [Blocks])) -definitionListItem parseMarkerGetLength = try $ do - markerLength <- parseMarkerGetLength +definitionList :: PandocMonad m => OrgParser m (F Blocks) +definitionList = try $ do + indent <- lookAhead bulletListStart + fmap (B.definitionList . compactifyDL) . sequence + <$> many1 (definitionListItem (bulletListStart `indented` indent)) + +bulletList :: PandocMonad m => OrgParser m (F Blocks) +bulletList = try $ do + indent <- lookAhead bulletListStart + fmap (B.bulletList . compactify) . sequence + <$> many1 (listItem (bulletListStart `indented` indent)) + +indented :: Monad m => OrgParser m Int -> Int -> OrgParser m Int +indented indentedMarker minIndent = try $ do + n <- indentedMarker + guard (minIndent <= n) + return n + +orderedList :: PandocMonad m => OrgParser m (F Blocks) +orderedList = try $ do + indent <- lookAhead orderedListStart + fmap (B.orderedList . compactify) . sequence + <$> many1 (listItem (orderedListStart `indented` indent)) + +definitionListItem :: PandocMonad m + => OrgParser m Int + -> OrgParser m (F (Inlines, [Blocks])) +definitionListItem parseIndentedMarker = try $ do + markerLength <- parseIndentedMarker term <- manyTill (noneOf "\n\r") (try definitionMarker) line1 <- anyLineNewline blank <- option "" ("\n" <$ blankline) @@ -940,12 +819,12 @@ definitionListItem parseMarkerGetLength = try $ do definitionMarker = spaceChar *> string "::" <* (spaceChar <|> lookAhead newline) - --- parse raw text for one list item, excluding start marker and continuations -listItem :: OrgParser Int - -> OrgParser (F Blocks) -listItem start = try . withContext ListItemState $ do - markerLength <- try start +-- | parse raw text for one list item +listItem :: PandocMonad m + => OrgParser m Int + -> OrgParser m (F Blocks) +listItem parseIndentedMarker = try . withContext ListItemState $ do + markerLength <- try parseIndentedMarker firstLine <- anyLineNewline blank <- option "" ("\n" <$ blankline) rest <- concat <$> many (listContinuation markerLength) @@ -953,24 +832,11 @@ listItem start = try . withContext ListItemState $ do -- continuation of a list item - indented and separated by blankline or endline. -- Note: nested lists are parsed as continuations. -listContinuation :: Int - -> OrgParser String -listContinuation markerLength = try $ +listContinuation :: Monad m => Int + -> OrgParser m String +listContinuation markerLength = try $ do notFollowedBy' blankline - *> (mappend <$> (concat <$> many1 listLine) - <*> many blankline) + mappend <$> (concat <$> many1 listLine) + <*> many blankline where listLine = try $ indentWith markerLength *> anyLineNewline - - -- indent by specified number of spaces (or equiv. tabs) - indentWith :: Int -> OrgParser String - indentWith num = do - tabStop <- getOption readerTabStop - if num < tabStop - then count num (char ' ') - else choice [ try (count num (char ' ')) - , try (char '\t' >> count (num - tabStop) (char ' ')) ] - --- | Parse any line, include the final newline in the output. -anyLineNewline :: OrgParser String -anyLineNewline = (++ "\n") <$> anyLine diff --git a/src/Text/Pandoc/Readers/Org/DocumentTree.hs b/src/Text/Pandoc/Readers/Org/DocumentTree.hs new file mode 100644 index 000000000..f77778ec9 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/DocumentTree.hs @@ -0,0 +1,304 @@ +{- +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.Org.DocumentTree + Copyright : Copyright (C) 2014-2018 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Parsers for org-mode headlines and document subtrees +-} +module Text.Pandoc.Readers.Org.DocumentTree + ( documentTree + , headlineToBlocks + ) where + +import Control.Arrow ((***)) +import Control.Monad (guard, void) +import Data.Char (toLower, toUpper) +import Data.List (intersperse) +import Data.Monoid ((<>)) +import Text.Pandoc.Builder (Blocks, Inlines) +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing + +import qualified Data.Map as Map +import qualified Text.Pandoc.Builder as B + +-- +-- Org headers +-- + +-- | Parse input as org document tree. +documentTree :: PandocMonad m + => OrgParser m (F Blocks) + -> OrgParser m (F Inlines) + -> OrgParser m (F Headline) +documentTree blocks inline = do + initialBlocks <- blocks + headlines <- sequence <$> manyTill (headline blocks inline 1) eof + title <- fmap (getTitle . unMeta) . orgStateMeta <$> getState + return $ do + headlines' <- headlines + initialBlocks' <- initialBlocks + title' <- title + return Headline + { headlineLevel = 0 + , headlineTodoMarker = Nothing + , headlineText = B.fromList title' + , headlineTags = mempty + , headlineProperties = mempty + , headlineContents = initialBlocks' + , headlineChildren = headlines' + } + where + getTitle :: Map.Map String MetaValue -> [Inline] + getTitle metamap = + case Map.lookup "title" metamap of + Just (MetaInlines inlns) -> inlns + _ -> [] + +newtype Tag = Tag { fromTag :: String } + deriving (Show, Eq) + +-- | Create a tag containing the given string. +toTag :: String -> Tag +toTag = Tag + +-- | The key (also called name or type) of a property. +newtype PropertyKey = PropertyKey { fromKey :: String } + deriving (Show, Eq, Ord) + +-- | Create a property key containing the given string. Org mode keys are +-- case insensitive and are hence converted to lower case. +toPropertyKey :: String -> PropertyKey +toPropertyKey = PropertyKey . map toLower + +-- | The value assigned to a property. +newtype PropertyValue = PropertyValue { fromValue :: String } + +-- | Create a property value containing the given string. +toPropertyValue :: String -> PropertyValue +toPropertyValue = PropertyValue + +-- | Check whether the property value is non-nil (i.e. truish). +isNonNil :: PropertyValue -> Bool +isNonNil p = map toLower (fromValue p) `notElem` ["()", "{}", "nil"] + +-- | Key/value pairs from a PROPERTIES drawer +type Properties = [(PropertyKey, PropertyValue)] + +-- | Org mode headline (i.e. a document subtree). +data Headline = Headline + { headlineLevel :: Int + , headlineTodoMarker :: Maybe TodoMarker + , headlineText :: Inlines + , headlineTags :: [Tag] + , headlineProperties :: Properties + , headlineContents :: Blocks + , headlineChildren :: [Headline] + } + +-- | Read an Org mode headline and its contents (i.e. a document subtree). +-- @lvl@ gives the minimum acceptable level of the tree. +headline :: PandocMonad m + => OrgParser m (F Blocks) + -> OrgParser m (F Inlines) + -> Int + -> OrgParser m (F Headline) +headline blocks inline lvl = try $ do + level <- headerStart + guard (lvl <= level) + todoKw <- optionMaybe todoKeyword + title <- trimInlinesF . mconcat <$> manyTill inline endOfTitle + tags <- option [] headerTags + newline + properties <- option mempty propertiesDrawer + contents <- blocks + children <- many (headline blocks inline (level + 1)) + return $ do + title' <- title + contents' <- contents + children' <- sequence children + return Headline + { headlineLevel = level + , headlineTodoMarker = todoKw + , headlineText = title' + , headlineTags = tags + , headlineProperties = properties + , headlineContents = contents' + , headlineChildren = children' + } + where + endOfTitle :: Monad m => OrgParser m () + endOfTitle = void . lookAhead $ optional headerTags *> newline + + headerTags :: Monad m => OrgParser m [Tag] + headerTags = try $ + let tag = many1 (alphaNum <|> oneOf "@%#_") <* char ':' + in map toTag <$> (skipSpaces *> char ':' *> many1 tag <* skipSpaces) + +-- | Convert an Org mode headline (i.e. a document tree) into pandoc's Blocks +headlineToBlocks :: Monad m => Headline -> OrgParser m Blocks +headlineToBlocks hdln@Headline {..} = do + maxHeadlineLevels <- getExportSetting exportHeadlineLevels + case () of + _ | any isNoExportTag headlineTags -> return mempty + _ | any isArchiveTag headlineTags -> archivedHeadlineToBlocks hdln + _ | isCommentTitle headlineText -> return mempty + _ | headlineLevel >= maxHeadlineLevels -> headlineToHeaderWithList hdln + _ | otherwise -> headlineToHeaderWithContents hdln + +isNoExportTag :: Tag -> Bool +isNoExportTag = (== toTag "noexport") + +isArchiveTag :: Tag -> Bool +isArchiveTag = (== toTag "ARCHIVE") + +-- | Check if the title starts with COMMENT. +-- FIXME: This accesses builder internals not intended for use in situations +-- like these. Replace once keyword parsing is supported. +isCommentTitle :: Inlines -> Bool +isCommentTitle (B.toList -> (Str "COMMENT":_)) = True +isCommentTitle _ = False + +archivedHeadlineToBlocks :: Monad m => Headline -> OrgParser m Blocks +archivedHeadlineToBlocks hdln = do + archivedTreesOption <- getExportSetting exportArchivedTrees + case archivedTreesOption of + ArchivedTreesNoExport -> return mempty + ArchivedTreesExport -> headlineToHeaderWithContents hdln + ArchivedTreesHeadlineOnly -> headlineToHeader hdln + +headlineToHeaderWithList :: Monad m => Headline -> OrgParser m Blocks +headlineToHeaderWithList hdln@Headline {..} = do + maxHeadlineLevels <- getExportSetting exportHeadlineLevels + header <- headlineToHeader hdln + listElements <- mapM headlineToBlocks headlineChildren + let listBlock = if null listElements + then mempty + else B.orderedList listElements + let headerText = if maxHeadlineLevels == headlineLevel + then header + else flattenHeader header + return $ headerText <> headlineContents <> listBlock + where + flattenHeader :: Blocks -> Blocks + flattenHeader blks = + case B.toList blks of + (Header _ _ inlns:_) -> B.para (B.fromList inlns) + _ -> mempty + +headlineToHeaderWithContents :: Monad m => Headline -> OrgParser m Blocks +headlineToHeaderWithContents hdln@Headline {..} = do + header <- headlineToHeader hdln + childrenBlocks <- mconcat <$> mapM headlineToBlocks headlineChildren + return $ header <> headlineContents <> childrenBlocks + +headlineToHeader :: Monad m => Headline -> OrgParser m Blocks +headlineToHeader Headline {..} = do + exportTodoKeyword <- getExportSetting exportWithTodoKeywords + exportTags <- getExportSetting exportWithTags + let todoText = if exportTodoKeyword + then case headlineTodoMarker of + Just kw -> todoKeywordToInlines kw <> B.space + Nothing -> mempty + else mempty + let text = todoText <> headlineText <> + if exportTags + then tagsToInlines headlineTags + else mempty + let propAttr = propertiesToAttr headlineProperties + attr <- registerHeader propAttr headlineText + return $ B.headerWith attr headlineLevel text + +todoKeyword :: Monad m => OrgParser m TodoMarker +todoKeyword = try $ do + taskStates <- activeTodoMarkers <$> getState + let kwParser tdm = try (tdm <$ string (todoMarkerName tdm) <* spaceChar) + choice (map kwParser taskStates) + +todoKeywordToInlines :: TodoMarker -> Inlines +todoKeywordToInlines tdm = + let todoText = todoMarkerName tdm + todoState = map toLower . show $ todoMarkerState tdm + classes = [todoState, todoText] + in B.spanWith (mempty, classes, mempty) (B.str todoText) + +propertiesToAttr :: Properties -> Attr +propertiesToAttr properties = + let + toStringPair = fromKey *** fromValue + customIdKey = toPropertyKey "custom_id" + classKey = toPropertyKey "class" + unnumberedKey = toPropertyKey "unnumbered" + specialProperties = [customIdKey, classKey, unnumberedKey] + id' = maybe mempty fromValue . lookup customIdKey $ properties + cls = maybe mempty fromValue . lookup classKey $ properties + kvs' = map toStringPair . filter ((`notElem` specialProperties) . fst) + $ properties + isUnnumbered = + maybe False isNonNil . lookup unnumberedKey $ properties + in + (id', words cls ++ ["unnumbered" | isUnnumbered], kvs') + +tagsToInlines :: [Tag] -> Inlines +tagsToInlines [] = mempty +tagsToInlines tags = + (B.space <>) . mconcat . intersperse (B.str "\160") . map tagToInline $ tags + where + tagToInline :: Tag -> Inlines + tagToInline t = tagSpan t . B.smallcaps . B.str $ fromTag t + +-- | Wrap the given inline in a span, marking it as a tag. +tagSpan :: Tag -> Inlines -> Inlines +tagSpan t = B.spanWith ("", ["tag"], [("tag-name", fromTag t)]) + + + + + +-- | Read a :PROPERTIES: drawer and return the key/value pairs contained +-- within. +propertiesDrawer :: Monad m => OrgParser m Properties +propertiesDrawer = try $ do + drawerType <- drawerStart + guard $ map toUpper drawerType == "PROPERTIES" + manyTill property (try endOfDrawer) + where + property :: Monad m => OrgParser m (PropertyKey, PropertyValue) + property = try $ (,) <$> key <*> value + + key :: Monad m => OrgParser m PropertyKey + key = fmap toPropertyKey . try $ + skipSpaces *> char ':' *> many1Till nonspaceChar (char ':') + + value :: Monad m => OrgParser m PropertyValue + value = fmap toPropertyValue . try $ + skipSpaces *> manyTill anyChar (try $ skipSpaces *> newline) + + endOfDrawer :: Monad m => OrgParser m String + endOfDrawer = try $ + skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* newline diff --git a/src/Text/Pandoc/Readers/Org/ExportSettings.hs b/src/Text/Pandoc/Readers/Org/ExportSettings.hs index 764e5b0d5..6a70c50b9 100644 --- a/src/Text/Pandoc/Readers/Org/ExportSettings.hs +++ b/src/Text/Pandoc/Readers/Org/ExportSettings.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2016-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,8 +17,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Org.Options - Copyright : Copyright (C) 2016 Albert Krewinkel + Module : Text.Pandoc.Readers.Org.ExportSettings + Copyright : © 2016–2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -29,22 +29,22 @@ module Text.Pandoc.Readers.Org.ExportSettings ( exportSettings ) where -import Text.Pandoc.Readers.Org.ParserState -import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing -import Control.Monad ( mzero, void ) -import Data.Char ( toLower ) -import Data.Maybe ( listToMaybe ) +import Control.Monad (mzero, void) +import Data.Char (toLower) +import Data.Maybe (listToMaybe) -- | Read and handle space separated org-mode export settings. -exportSettings :: OrgParser () +exportSettings :: Monad m => OrgParser m () exportSettings = void $ sepBy spaces exportSetting -- | Setter function for export settings. type ExportSettingSetter a = a -> ExportSettings -> ExportSettings -- | Read and process a single org-mode export option. -exportSetting :: OrgParser () +exportSetting :: Monad m => OrgParser m () exportSetting = choice [ booleanSetting "^" (\val es -> es { exportSubSuperscripts = val }) , booleanSetting "'" (\val es -> es { exportSmartQuotes = val }) @@ -52,7 +52,7 @@ exportSetting = choice , booleanSetting "-" (\val es -> es { exportSpecialStrings = val }) , ignoredSetting ":" , ignoredSetting "<" - , ignoredSetting "\\n" + , booleanSetting "\\n" (\val es -> es { exportPreserveBreaks = val }) , archivedTreeSetting "arch" (\val es -> es { exportArchivedTrees = val }) , booleanSetting "author" (\val es -> es { exportWithAuthor = val }) , ignoredSetting "c" @@ -71,7 +71,7 @@ exportSetting = choice , ignoredSetting "pri" , ignoredSetting "prop" , ignoredSetting "stat" - , ignoredSetting "tags" + , booleanSetting "tags" (\val es -> es { exportWithTags = val }) , ignoredSetting "tasks" , ignoredSetting "tex" , ignoredSetting "timestamp" @@ -81,10 +81,11 @@ exportSetting = choice , ignoredSetting "|" ] <?> "export setting" -genericExportSetting :: OrgParser a +genericExportSetting :: Monad m + => OrgParser m a -> String -> ExportSettingSetter a - -> OrgParser () + -> OrgParser m () genericExportSetting optionParser settingIdentifier setter = try $ do _ <- string settingIdentifier *> char ':' value <- optionParser @@ -94,11 +95,11 @@ genericExportSetting optionParser settingIdentifier setter = try $ do st { orgStateExportSettings = setter val . orgStateExportSettings $ st } -- | A boolean option, either nil (False) or non-nil (True). -booleanSetting :: String -> ExportSettingSetter Bool -> OrgParser () +booleanSetting :: Monad m => String -> ExportSettingSetter Bool -> OrgParser m () booleanSetting = genericExportSetting elispBoolean -- | An integer-valued option. -integerSetting :: String -> ExportSettingSetter Int -> OrgParser () +integerSetting :: Monad m => String -> ExportSettingSetter Int -> OrgParser m () integerSetting = genericExportSetting parseInt where parseInt = try $ @@ -106,9 +107,10 @@ integerSetting = genericExportSetting parseInt -- | Either the string "headline" or an elisp boolean and treated as an -- @ArchivedTreesOption@. -archivedTreeSetting :: String +archivedTreeSetting :: Monad m + => String -> ExportSettingSetter ArchivedTreesOption - -> OrgParser () + -> OrgParser m () archivedTreeSetting = genericExportSetting $ archivedTreesHeadlineSetting <|> archivedTreesBoolean where @@ -125,9 +127,10 @@ archivedTreeSetting = else ArchivedTreesNoExport -- | A list or a complement list (i.e. a list starting with `not`). -complementableListSetting :: String +complementableListSetting :: Monad m + => String -> ExportSettingSetter (Either [String] [String]) - -> OrgParser () + -> OrgParser m () complementableListSetting = genericExportSetting $ choice [ Left <$> complementStringList , Right <$> stringList @@ -135,31 +138,31 @@ complementableListSetting = genericExportSetting $ choice ] where -- Read a plain list of strings. - stringList :: OrgParser [String] + stringList :: Monad m => OrgParser m [String] stringList = try $ char '(' *> sepBy elispString spaces <* char ')' -- Read an emacs lisp list specifying a complement set. - complementStringList :: OrgParser [String] + complementStringList :: Monad m => OrgParser m [String] complementStringList = try $ string "(not " *> sepBy elispString spaces <* char ')' - elispString :: OrgParser String + elispString :: Monad m => OrgParser m String elispString = try $ char '"' *> manyTill alphaNum (char '"') -- | Read but ignore the export setting. -ignoredSetting :: String -> OrgParser () +ignoredSetting :: Monad m => String -> OrgParser m () ignoredSetting s = try (() <$ string s <* char ':' <* many1 nonspaceChar) -- | Read an elisp boolean. Only NIL is treated as false, non-NIL values are -- interpreted as true. -elispBoolean :: OrgParser Bool +elispBoolean :: Monad m => OrgParser m Bool elispBoolean = try $ do value <- many1 nonspaceChar return $ case map toLower value of diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index 7e1bb61c2..3a12f38d0 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,8 +18,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Org.Options - Copyright : Copyright (C) 2014-2016 Albert Krewinkel + Module : Text.Pandoc.Readers.Org.Inlines + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -33,73 +33,75 @@ module Text.Pandoc.Readers.Org.Inlines , linkTarget ) where -import Text.Pandoc.Readers.Org.BlockStarts ( endOfBlock, noteMarker ) -import Text.Pandoc.Readers.Org.ParserState -import Text.Pandoc.Readers.Org.Parsing -import Text.Pandoc.Readers.Org.Shared - ( cleanLinkString, isImageFilename, rundocBlockClass - , toRundocAttrib, translateLang ) +import Text.Pandoc.Readers.Org.BlockStarts (endOfBlock, noteMarker) +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Readers.Org.Shared (cleanLinkString, isImageFilename, + originalLang, translateLang) +import Text.Pandoc.Builder (Inlines) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder ( Inlines ) -import Text.Pandoc.Definition -import Text.Pandoc.Options -import Text.Pandoc.Readers.LaTeX ( inlineCommand, rawLaTeXInline ) -import Text.TeXMath ( readTeX, writePandoc, DisplayType(..) ) +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) +import Text.Pandoc.Shared (underlineSpan) +import Text.TeXMath (DisplayType (..), readTeX, writePandoc) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap -import Prelude hiding (sequence) -import Control.Monad ( guard, mplus, mzero, when, void ) -import Data.Char ( isAlphaNum, isSpace ) -import Data.List ( intersperse ) -import Data.Maybe ( fromMaybe ) +import Control.Monad (guard, mplus, mzero, unless, void, when) +import Control.Monad.Trans (lift) +import Data.Char (isAlphaNum, isSpace) +import Data.List (intersperse) import qualified Data.Map as M -import Data.Monoid ( (<>) ) -import Data.Traversable (sequence) +import Data.Maybe (fromMaybe) +import Data.Monoid ((<>)) +import Data.Traversable (sequence) +import Prelude hiding (sequence) -- -- Functions acting on the parser state -- -recordAnchorId :: String -> OrgParser () +recordAnchorId :: PandocMonad m => String -> OrgParser m () recordAnchorId i = updateState $ \s -> - s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } + s{ orgStateAnchorIds = i : orgStateAnchorIds s } -pushToInlineCharStack :: Char -> OrgParser () +pushToInlineCharStack :: PandocMonad m => Char -> OrgParser m () pushToInlineCharStack c = updateState $ \s -> s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } -popInlineCharStack :: OrgParser () +popInlineCharStack :: PandocMonad m => OrgParser m () popInlineCharStack = updateState $ \s -> s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } -surroundingEmphasisChar :: OrgParser [Char] +surroundingEmphasisChar :: PandocMonad m => OrgParser m [Char] surroundingEmphasisChar = take 1 . drop 1 . orgStateEmphasisCharStack <$> getState -startEmphasisNewlinesCounting :: Int -> OrgParser () +startEmphasisNewlinesCounting :: PandocMonad m => Int -> OrgParser m () startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> s{ orgStateEmphasisNewlines = Just maxNewlines } -decEmphasisNewlinesCount :: OrgParser () +decEmphasisNewlinesCount :: PandocMonad m => OrgParser m () decEmphasisNewlinesCount = updateState $ \s -> s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } -newlinesCountWithinLimits :: OrgParser Bool +newlinesCountWithinLimits :: PandocMonad m => OrgParser m Bool newlinesCountWithinLimits = do st <- getState return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True -resetEmphasisNewlines :: OrgParser () +resetEmphasisNewlines :: PandocMonad m => OrgParser m () resetEmphasisNewlines = updateState $ \s -> s{ orgStateEmphasisNewlines = Nothing } -addToNotesTable :: OrgNoteRecord -> OrgParser () +addToNotesTable :: PandocMonad m => OrgNoteRecord -> OrgParser m () addToNotesTable note = do oldnotes <- orgStateNotes' <$> getState updateState $ \s -> s{ orgStateNotes' = note:oldnotes } -- | Parse a single Org-mode inline element -inline :: OrgParser (F Inlines) +inline :: PandocMonad m => OrgParser m (F Inlines) inline = choice [ whitespace , linebreak @@ -119,13 +121,14 @@ inline = , superscript , inlineLaTeX , exportSnippet + , macro , smart , symbol ] <* (guard =<< newlinesCountWithinLimits) <?> "inline" -- | Read the rest of the input as inlines. -inlines :: OrgParser (F Inlines) +inlines :: PandocMonad m => OrgParser m (F Inlines) inlines = trimInlinesF . mconcat <$> many1 inline -- treat these as potentially non-text when parsing inline: @@ -133,30 +136,31 @@ specialChars :: [Char] specialChars = "\"$'()*+-,./:;<=>@[\\]^_{|}~" -whitespace :: OrgParser (F Inlines) +whitespace :: PandocMonad m => OrgParser m (F Inlines) whitespace = pure B.space <$ skipMany1 spaceChar <* updateLastPreCharPos <* updateLastForbiddenCharPos <?> "whitespace" -linebreak :: OrgParser (F Inlines) +linebreak :: PandocMonad m => OrgParser m (F Inlines) linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline -str :: OrgParser (F Inlines) +str :: PandocMonad m => OrgParser m (F Inlines) str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") <* updateLastStrPos -- | An endline character that can be treated as a space, not a structural -- break. This should reflect the values of the Emacs variable -- @org-element-pagaraph-separate@. -endline :: OrgParser (F Inlines) +endline :: PandocMonad m => OrgParser m (F Inlines) endline = try $ do newline notFollowedBy' endOfBlock decEmphasisNewlinesCount guard =<< newlinesCountWithinLimits updateLastPreCharPos - return . return $ B.softbreak + useHardBreaks <- exportPreserveBreaks . orgStateExportSettings <$> getState + returnF (if useHardBreaks then B.linebreak else B.softbreak) -- @@ -174,7 +178,7 @@ endline = try $ do -- contributors. All this should be consolidated once an official Org-mode -- citation syntax has emerged. -cite :: OrgParser (F Inlines) +cite :: PandocMonad m => OrgParser m (F Inlines) cite = try $ berkeleyCite <|> do guardEnabled Ext_citations (cs, raw) <- withRaw $ choice @@ -182,43 +186,44 @@ cite = try $ berkeleyCite <|> do , orgRefCite , berkeleyTextualCite ] - return $ (flip B.cite (B.text raw)) <$> cs + return $ flip B.cite (B.text raw) <$> cs -- | A citation in Pandoc Org-mode style (@[prefix \@citekey suffix]@). -pandocOrgCite :: OrgParser (F [Citation]) +pandocOrgCite :: PandocMonad m => OrgParser m (F [Citation]) pandocOrgCite = try $ char '[' *> skipSpaces *> citeList <* skipSpaces <* char ']' -orgRefCite :: OrgParser (F [Citation]) +orgRefCite :: PandocMonad m => OrgParser m (F [Citation]) orgRefCite = try $ choice [ normalOrgRefCite , fmap (:[]) <$> linkLikeOrgRefCite ] -normalOrgRefCite :: OrgParser (F [Citation]) +normalOrgRefCite :: PandocMonad m => OrgParser m (F [Citation]) normalOrgRefCite = try $ do mode <- orgRefCiteMode - -- org-ref style citation key, parsed into a citation of the given mode - let orgRefCiteItem :: OrgParser (F Citation) - orgRefCiteItem = try $ do - key <- orgRefCiteKey - returnF $ Citation - { citationId = key - , citationPrefix = mempty - , citationSuffix = mempty - , citationMode = mode - , citationNoteNum = 0 - , citationHash = 0 - } - firstCitation <- orgRefCiteItem - moreCitations <- many (try $ char ',' *> orgRefCiteItem) + firstCitation <- orgRefCiteList mode + moreCitations <- many (try $ char ',' *> orgRefCiteList mode) return . sequence $ firstCitation : moreCitations - where + where + -- | A list of org-ref style citation keys, parsed as citation of the given + -- citation mode. + orgRefCiteList :: PandocMonad m => CitationMode -> OrgParser m (F Citation) + orgRefCiteList citeMode = try $ do + key <- orgRefCiteKey + returnF Citation + { citationId = key + , citationPrefix = mempty + , citationSuffix = mempty + , citationMode = citeMode + , citationNoteNum = 0 + , citationHash = 0 + } -- | Read an Berkeley-style Org-mode citation. Berkeley citation style was -- develop and adjusted to Org-mode style by John MacFarlane and Richard -- Lawrence, respectively, both philosophers at UC Berkeley. -berkeleyCite :: OrgParser (F Inlines) +berkeleyCite :: PandocMonad m => OrgParser m (F Inlines) berkeleyCite = try $ do bcl <- berkeleyCitationList return $ do @@ -229,11 +234,11 @@ berkeleyCite = try $ do return $ if parens then toCite - . maybe id (\p -> alterFirst (prependPrefix p)) prefix - . maybe id (\s -> alterLast (appendSuffix s)) suffix + . maybe id (alterFirst . prependPrefix) prefix + . maybe id (alterLast . appendSuffix) suffix $ citationList else maybe mempty (<> " ") prefix - <> (toListOfCites $ map toInTextMode citationList) + <> toListOfCites (map toInTextMode citationList) <> maybe mempty (", " <>) suffix where toCite :: [Citation] -> Inlines @@ -247,7 +252,7 @@ berkeleyCite = try $ do alterFirst, alterLast :: (a -> a) -> [a] -> [a] alterFirst _ [] = [] - alterFirst f (c:cs) = (f c):cs + alterFirst f (c:cs) = f c : cs alterLast f = reverse . alterFirst f . reverse prependPrefix, appendSuffix :: Inlines -> Citation -> Citation @@ -255,12 +260,12 @@ berkeleyCite = try $ do appendSuffix suf c = c { citationSuffix = citationSuffix c <> B.toList suf } data BerkeleyCitationList = BerkeleyCitationList - { berkeleyCiteParens :: Bool + { berkeleyCiteParens :: Bool , berkeleyCiteCommonPrefix :: Maybe Inlines , berkeleyCiteCommonSuffix :: Maybe Inlines - , berkeleyCiteCitations :: [Citation] + , berkeleyCiteCitations :: [Citation] } -berkeleyCitationList :: OrgParser (F BerkeleyCitationList) +berkeleyCitationList :: PandocMonad m => OrgParser m (F BerkeleyCitationList) berkeleyCitationList = try $ do char '[' parens <- choice [ False <$ berkeleyBareTag, True <$ berkeleyParensTag ] @@ -268,29 +273,29 @@ berkeleyCitationList = try $ do skipSpaces commonPrefix <- optionMaybe (try $ citationListPart <* char ';') citations <- citeList - commonSuffix <- optionMaybe (try $ citationListPart) + commonSuffix <- optionMaybe (try citationListPart) char ']' return (BerkeleyCitationList parens <$> sequence commonPrefix <*> sequence commonSuffix <*> citations) where - citationListPart :: OrgParser (F Inlines) + citationListPart :: PandocMonad m => OrgParser m (F Inlines) citationListPart = fmap (trimInlinesF . mconcat) . try . many1 $ do notFollowedBy' citeKey notFollowedBy (oneOf ";]") inline -berkeleyBareTag :: OrgParser () +berkeleyBareTag :: PandocMonad m => OrgParser m () berkeleyBareTag = try $ void berkeleyBareTag' -berkeleyParensTag :: OrgParser () +berkeleyParensTag :: PandocMonad m => OrgParser m () berkeleyParensTag = try . void $ enclosedByPair '(' ')' berkeleyBareTag' -berkeleyBareTag' :: OrgParser () +berkeleyBareTag' :: PandocMonad m => OrgParser m () berkeleyBareTag' = try $ void (string "cite") -berkeleyTextualCite :: OrgParser (F [Citation]) +berkeleyTextualCite :: PandocMonad m => OrgParser m (F [Citation]) berkeleyTextualCite = try $ do (suppressAuthor, key) <- citeKey returnF . return $ Citation @@ -305,14 +310,14 @@ berkeleyTextualCite = try $ do -- The following is what a Berkeley-style bracketed textual citation parser -- would look like. However, as these citations are a subset of Pandoc's Org -- citation style, this isn't used. --- berkeleyBracketedTextualCite :: OrgParser (F [Citation]) +-- berkeleyBracketedTextualCite :: PandocMonad m => OrgParser m (F [Citation]) -- berkeleyBracketedTextualCite = try . (fmap head) $ -- enclosedByPair '[' ']' berkeleyTextualCite -- | Read a link-like org-ref style citation. The citation includes pre and -- post text. However, multiple citations are not possible due to limitations -- in the syntax. -linkLikeOrgRefCite :: OrgParser (F Citation) +linkLikeOrgRefCite :: PandocMonad m => OrgParser m (F Citation) linkLikeOrgRefCite = try $ do _ <- string "[[" mode <- orgRefCiteMode @@ -335,13 +340,20 @@ linkLikeOrgRefCite = try $ do -- | Read a citation key. The characters allowed in citation keys are taken -- from the `org-ref-cite-re` variable in `org-ref.el`. -orgRefCiteKey :: OrgParser String -orgRefCiteKey = try . many1 . satisfy $ \c -> - isAlphaNum c || c `elem` ("-_:\\./"::String) +orgRefCiteKey :: PandocMonad m => OrgParser m String +orgRefCiteKey = + let citeKeySpecialChars = "-_:\\./," :: String + isCiteKeySpecialChar c = c `elem` citeKeySpecialChars + isCiteKeyChar c = isAlphaNum c || isCiteKeySpecialChar c + endOfCitation = try $ do + many $ satisfy isCiteKeySpecialChar + satisfy $ not . isCiteKeyChar + in try $ satisfy isCiteKeyChar `many1Till` lookAhead endOfCitation + -- | Supported citation types. Only a small subset of org-ref types is -- supported for now. TODO: rewrite this, use LaTeX reader as template. -orgRefCiteMode :: OrgParser CitationMode +orgRefCiteMode :: PandocMonad m => OrgParser m CitationMode orgRefCiteMode = choice $ map (\(s, mode) -> mode <$ try (string s <* char ':')) [ ("cite", AuthorInText) @@ -352,10 +364,10 @@ orgRefCiteMode = , ("citeyear", SuppressAuthor) ] -citeList :: OrgParser (F [Citation]) +citeList :: PandocMonad m => OrgParser m (F [Citation]) citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) -citation :: OrgParser (F Citation) +citation :: PandocMonad m => OrgParser m (F Citation) citation = try $ do pref <- prefix (suppress_author, key) <- citeKey @@ -363,15 +375,16 @@ citation = try $ do return $ do x <- pref y <- suff - return $ Citation{ citationId = key - , citationPrefix = B.toList x - , citationSuffix = B.toList y - , citationMode = if suppress_author - then SuppressAuthor - else NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } + return Citation + { citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } where prefix = trimInlinesF . mconcat <$> manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) @@ -384,39 +397,39 @@ citation = try $ do then (B.space <>) <$> rest else rest -footnote :: OrgParser (F Inlines) +footnote :: PandocMonad m => OrgParser m (F Inlines) footnote = try $ inlineNote <|> referencedNote -inlineNote :: OrgParser (F Inlines) +inlineNote :: PandocMonad m => OrgParser m (F Inlines) inlineNote = try $ do string "[fn:" ref <- many alphaNum char ':' note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') - when (not $ null ref) $ + unless (null ref) $ addToNotesTable ("fn:" ++ ref, note) return $ B.note <$> note -referencedNote :: OrgParser (F Inlines) +referencedNote :: PandocMonad m => OrgParser m (F Inlines) referencedNote = try $ do ref <- noteMarker return $ do notes <- asksF orgStateNotes' case lookup ref notes of - Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Nothing -> return . B.str $ "[" ++ ref ++ "]" Just contents -> do st <- askF let contents' = runF contents st{ orgStateNotes' = [] } return $ B.note contents' -linkOrImage :: OrgParser (F Inlines) +linkOrImage :: PandocMonad m => OrgParser m (F Inlines) linkOrImage = explicitOrImageLink <|> selflinkOrImage <|> angleLink <|> plainLink <?> "link or image" -explicitOrImageLink :: OrgParser (F Inlines) +explicitOrImageLink :: PandocMonad m => OrgParser m (F Inlines) explicitOrImageLink = try $ do char '[' srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget @@ -427,34 +440,34 @@ explicitOrImageLink = try $ do src <- srcF case cleanLinkString title of Just imgSrc | isImageFilename imgSrc -> - pure $ B.link src "" $ B.image imgSrc mempty mempty + pure . B.link src "" $ B.image imgSrc mempty mempty _ -> linkToInlinesF src =<< title' -selflinkOrImage :: OrgParser (F Inlines) +selflinkOrImage :: PandocMonad m => OrgParser m (F Inlines) selflinkOrImage = try $ do src <- char '[' *> linkTarget <* char ']' return $ linkToInlinesF src (B.str src) -plainLink :: OrgParser (F Inlines) +plainLink :: PandocMonad m => OrgParser m (F Inlines) plainLink = try $ do (orig, src) <- uri returnF $ B.link src "" (B.str orig) -angleLink :: OrgParser (F Inlines) +angleLink :: PandocMonad m => OrgParser m (F Inlines) angleLink = try $ do char '<' link <- plainLink char '>' return link -linkTarget :: OrgParser String +linkTarget :: PandocMonad m => OrgParser m String linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") -possiblyEmptyLinkTarget :: OrgParser String +possiblyEmptyLinkTarget :: PandocMonad m => OrgParser m String possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") -applyCustomLinkFormat :: String -> OrgParser (F String) +applyCustomLinkFormat :: String -> OrgParser m (F String) applyCustomLinkFormat link = do let (linkType, rest) = break (== ':') link return $ do @@ -487,7 +500,7 @@ internalLink link title = do -- @anchor-id@ contains spaces, we are more restrictive in what is accepted as -- an anchor. -anchor :: OrgParser (F Inlines) +anchor :: PandocMonad m => OrgParser m (F Inlines) anchor = try $ do anchorId <- parseAnchor recordAnchorId anchorId @@ -509,23 +522,23 @@ solidify = map replaceSpecialChar | otherwise = '-' -- | Parses an inline code block and marks it as an babel block. -inlineCodeBlock :: OrgParser (F Inlines) +inlineCodeBlock :: PandocMonad m => OrgParser m (F Inlines) inlineCodeBlock = try $ do string "src_" lang <- many1 orgArgWordChar opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") - let attrClasses = [translateLang lang, rundocBlockClass] - let attrKeyVal = map toRundocAttrib (("language", lang) : opts) + let attrClasses = [translateLang lang] + let attrKeyVal = originalLang lang <> opts returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode where - inlineBlockOption :: OrgParser (String, String) + inlineBlockOption :: PandocMonad m => OrgParser m (String, String) inlineBlockOption = try $ do argKey <- orgArgKey paramValue <- option "yes" orgInlineParamValue return (argKey, paramValue) - orgInlineParamValue :: OrgParser String + orgInlineParamValue :: PandocMonad m => OrgParser m String orgInlineParamValue = try $ skipSpaces *> notFollowedBy (char ':') @@ -533,7 +546,7 @@ inlineCodeBlock = try $ do <* skipSpaces -emphasizedText :: OrgParser (F Inlines) +emphasizedText :: PandocMonad m => OrgParser m (F Inlines) emphasizedText = do state <- getState guard . exportEmphasizedText . orgStateExportSettings $ state @@ -544,60 +557,64 @@ emphasizedText = do , underline ] -enclosedByPair :: Char -- ^ opening char +enclosedByPair :: PandocMonad m + => Char -- ^ opening char -> Char -- ^ closing char - -> OrgParser a -- ^ parser - -> OrgParser [a] + -> OrgParser m a -- ^ parser + -> OrgParser m [a] enclosedByPair s e p = char s *> many1Till p (char e) -emph :: OrgParser (F Inlines) +emph :: PandocMonad m => OrgParser m (F Inlines) emph = fmap B.emph <$> emphasisBetween '/' -strong :: OrgParser (F Inlines) +strong :: PandocMonad m => OrgParser m (F Inlines) strong = fmap B.strong <$> emphasisBetween '*' -strikeout :: OrgParser (F Inlines) +strikeout :: PandocMonad m => OrgParser m (F Inlines) strikeout = fmap B.strikeout <$> emphasisBetween '+' --- There is no underline, so we use strong instead. -underline :: OrgParser (F Inlines) -underline = fmap B.strong <$> emphasisBetween '_' +underline :: PandocMonad m => OrgParser m (F Inlines) +underline = fmap underlineSpan <$> emphasisBetween '_' -verbatim :: OrgParser (F Inlines) +verbatim :: PandocMonad m => OrgParser m (F Inlines) verbatim = return . B.code <$> verbatimBetween '=' -code :: OrgParser (F Inlines) +code :: PandocMonad m => OrgParser m (F Inlines) code = return . B.code <$> verbatimBetween '~' -subscript :: OrgParser (F Inlines) +subscript :: PandocMonad m => OrgParser m (F Inlines) subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) -superscript :: OrgParser (F Inlines) +superscript :: PandocMonad m => OrgParser m (F Inlines) superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) -math :: OrgParser (F Inlines) +math :: PandocMonad m => OrgParser m (F Inlines) math = return . B.math <$> choice [ math1CharBetween '$' , mathStringBetween '$' , rawMathBetween "\\(" "\\)" ] -displayMath :: OrgParser (F Inlines) +displayMath :: PandocMonad m => OrgParser m (F Inlines) displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" , rawMathBetween "$$" "$$" ] -updatePositions :: Char - -> OrgParser Char +updatePositions :: PandocMonad m + => Char + -> OrgParser m Char updatePositions c = do + st <- getState + let emphasisPreChars = orgStateEmphasisPreChars st when (c `elem` emphasisPreChars) updateLastPreCharPos when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos return c -symbol :: OrgParser (F Inlines) +symbol :: PandocMonad m => OrgParser m (F Inlines) symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) -emphasisBetween :: Char - -> OrgParser (F Inlines) +emphasisBetween :: PandocMonad m + => Char + -> OrgParser m (F Inlines) emphasisBetween c = try $ do startEmphasisNewlinesCounting emphasisAllowedNewlines res <- enclosedInlines (emphasisStart c) (emphasisEnd c) @@ -606,8 +623,9 @@ emphasisBetween c = try $ do resetEmphasisNewlines return res -verbatimBetween :: Char - -> OrgParser String +verbatimBetween :: PandocMonad m + => Char + -> OrgParser m String verbatimBetween c = try $ emphasisStart c *> many1TillNOrLessNewlines 1 verbatimChar (emphasisEnd c) @@ -615,8 +633,9 @@ verbatimBetween c = try $ verbatimChar = noneOf "\n\r" >>= updatePositions -- | Parses a raw string delimited by @c@ using Org's math rules -mathStringBetween :: Char - -> OrgParser String +mathStringBetween :: PandocMonad m + => Char + -> OrgParser m String mathStringBetween c = try $ do mathStart c body <- many1TillNOrLessNewlines mathAllowedNewlines @@ -626,8 +645,9 @@ mathStringBetween c = try $ do return $ body ++ [final] -- | Parse a single character between @c@ using math rules -math1CharBetween :: Char - -> OrgParser String +math1CharBetween :: PandocMonad m + => Char + -> OrgParser m String math1CharBetween c = try $ do char c res <- noneOf $ c:mathForbiddenBorderChars @@ -635,13 +655,14 @@ math1CharBetween c = try $ do eof <|> () <$ lookAhead (oneOf mathPostChars) return [res] -rawMathBetween :: String +rawMathBetween :: PandocMonad m + => String -> String - -> OrgParser String + -> OrgParser m String rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) -- | Parses the start (opening character) of emphasis -emphasisStart :: Char -> OrgParser Char +emphasisStart :: PandocMonad m => Char -> OrgParser m Char emphasisStart c = try $ do guard =<< afterEmphasisPreChar guard =<< notAfterString @@ -654,7 +675,7 @@ emphasisStart c = try $ do return c -- | Parses the closing character of emphasis -emphasisEnd :: Char -> OrgParser Char +emphasisEnd :: PandocMonad m => Char -> OrgParser m Char emphasisEnd c = try $ do guard =<< notAfterForbiddenBorderChar char c @@ -662,14 +683,16 @@ emphasisEnd c = try $ do updateLastStrPos popInlineCharStack return c - where acceptablePostChars = - surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) + where + acceptablePostChars = do + emphasisPostChars <- orgStateEmphasisPostChars <$> getState + surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) -mathStart :: Char -> OrgParser Char +mathStart :: PandocMonad m => Char -> OrgParser m Char mathStart c = try $ char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) -mathEnd :: Char -> OrgParser Char +mathEnd :: PandocMonad m => Char -> OrgParser m Char mathEnd c = try $ do res <- noneOf (c:mathForbiddenBorderChars) char c @@ -677,15 +700,15 @@ mathEnd c = try $ do return res -enclosedInlines :: OrgParser a - -> OrgParser b - -> OrgParser (F Inlines) +enclosedInlines :: (PandocMonad m, Show b) => OrgParser m a + -> OrgParser m b + -> OrgParser m (F Inlines) enclosedInlines start end = try $ trimInlinesF . mconcat <$> enclosed start end inline -enclosedRaw :: OrgParser a - -> OrgParser b - -> OrgParser String +enclosedRaw :: (PandocMonad m, Show b) => OrgParser m a + -> OrgParser m b + -> OrgParser m String enclosedRaw start end = try $ start *> (onSingleLine <|> spanningTwoLines) where onSingleLine = try $ many1Till (noneOf "\n\r") end @@ -694,10 +717,10 @@ enclosedRaw start end = try $ -- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume -- newlines. -many1TillNOrLessNewlines :: Int - -> OrgParser Char - -> OrgParser a - -> OrgParser String +many1TillNOrLessNewlines :: PandocMonad m => Int + -> OrgParser m Char + -> OrgParser m a + -> OrgParser m String many1TillNOrLessNewlines n p end = try $ nMoreLines (Just n) mempty >>= oneOrMore where @@ -715,17 +738,9 @@ many1TillNOrLessNewlines n p end = try $ -- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` -- for details). --- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) -emphasisPreChars :: [Char] -emphasisPreChars = "\t \"'({" - --- | Chars allowed at after emphasis -emphasisPostChars :: [Char] -emphasisPostChars = "\t\n !\"'),-.:;?\\}" - -- | Chars not allowed at the (inner) border of emphasis emphasisForbiddenBorderChars :: [Char] -emphasisForbiddenBorderChars = "\t\n\r \"'," +emphasisForbiddenBorderChars = "\t\n\r " -- | The maximum number of newlines within emphasisAllowedNewlines :: Int @@ -746,29 +761,29 @@ mathAllowedNewlines :: Int mathAllowedNewlines = 2 -- | Whether we are right behind a char allowed before emphasis -afterEmphasisPreChar :: OrgParser Bool +afterEmphasisPreChar :: PandocMonad m => OrgParser m Bool afterEmphasisPreChar = do pos <- getPosition lastPrePos <- orgStateLastPreCharPos <$> getState return . fromMaybe True $ (== pos) <$> lastPrePos -- | Whether the parser is right after a forbidden border char -notAfterForbiddenBorderChar :: OrgParser Bool +notAfterForbiddenBorderChar :: PandocMonad m => OrgParser m Bool notAfterForbiddenBorderChar = do pos <- getPosition lastFBCPos <- orgStateLastForbiddenCharPos <$> getState return $ lastFBCPos /= Just pos -- | Read a sub- or superscript expression -subOrSuperExpr :: OrgParser (F Inlines) +subOrSuperExpr :: PandocMonad m => OrgParser m (F Inlines) subOrSuperExpr = try $ - choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") + choice [ charsInBalanced '{' '}' (noneOf "\n\r") , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") , simpleSubOrSuperString ] >>= parseFromString (mconcat <$> many inline) where enclosing (left, right) s = left : s ++ [right] -simpleSubOrSuperString :: OrgParser String +simpleSubOrSuperString :: PandocMonad m => OrgParser m String simpleSubOrSuperString = try $ do state <- getState guard . exportSubSuperscripts . orgStateExportSettings $ state @@ -777,17 +792,18 @@ simpleSubOrSuperString = try $ do <*> many1 alphaNum ] -inlineLaTeX :: OrgParser (F Inlines) +inlineLaTeX :: PandocMonad m => OrgParser m (F Inlines) inlineLaTeX = try $ do cmd <- inlineLaTeXCommand + ils <- (lift . lift) $ parseAsInlineLaTeX cmd maybe mzero returnF $ - parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd + parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` ils where parseAsMath :: String -> Maybe Inlines parseAsMath cs = B.fromList <$> texMathToPandoc cs - parseAsInlineLaTeX :: String -> Maybe Inlines - parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + parseAsInlineLaTeX :: PandocMonad m => String -> m (Maybe Inlines) + parseAsInlineLaTeX cs = maybeRight <$> runParserT inlineCommand state "" cs parseAsMathMLSym :: String -> Maybe Inlines parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) @@ -795,19 +811,22 @@ inlineLaTeX = try $ do where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1 state :: ParserState - state = def{ stateOptions = def{ readerParseRaw = True }} + state = def{ stateOptions = def{ readerExtensions = + enableExtension Ext_raw_tex (readerExtensions def) } } texMathToPandoc :: String -> Maybe [Inline] - texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline + texMathToPandoc cs = maybeRight (readTeX cs) >>= writePandoc DisplayInline maybeRight :: Either a b -> Maybe b maybeRight = either (const Nothing) Just -inlineLaTeXCommand :: OrgParser String +inlineLaTeXCommand :: PandocMonad m => OrgParser m String inlineLaTeXCommand = try $ do rest <- getInput - case runParser rawLaTeXInline def "source" rest of - Right (RawInline _ cs) -> do + st <- getState + parsed <- (lift . lift) $ runParserT rawLaTeXInline st "source" rest + case parsed of + Right cs -> do -- drop any trailing whitespace, those are not be part of the command as -- far as org mode is concerned. let cmdNoSpc = dropWhileEnd isSpace cs @@ -820,33 +839,56 @@ inlineLaTeXCommand = try $ do dropWhileEnd :: (a -> Bool) -> [a] -> [a] dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) [] -exportSnippet :: OrgParser (F Inlines) +exportSnippet :: PandocMonad m => OrgParser m (F Inlines) exportSnippet = try $ do string "@@" format <- many1Till (alphaNum <|> char '-') (char ':') snippet <- manyTill anyChar (try $ string "@@") returnF $ B.rawInline format snippet -smart :: OrgParser (F Inlines) -smart = do - getOption readerSmart >>= guard - doubleQuoted <|> singleQuoted <|> - choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses]) +macro :: PandocMonad m => OrgParser m (F Inlines) +macro = try $ do + recursionDepth <- orgStateMacroDepth <$> getState + guard $ recursionDepth < 15 + string "{{{" + name <- many alphaNum + args <- ([] <$ string "}}}") + <|> char '(' *> argument `sepBy` char ',' <* eoa + expander <- lookupMacro name <$> getState + case expander of + Nothing -> mzero + Just fn -> do + updateState $ \s -> s { orgStateMacroDepth = recursionDepth + 1 } + res <- parseFromString (mconcat <$> many inline) $ fn args + updateState $ \s -> s { orgStateMacroDepth = recursionDepth } + return res + where + argument = many $ notFollowedBy eoa *> noneOf "," + eoa = string ")}}}" + +smart :: PandocMonad m => OrgParser m (F Inlines) +smart = choice [doubleQuoted, singleQuoted, orgApostrophe, orgDash, orgEllipses] where orgDash = do - guard =<< getExportSetting exportSpecialStrings - dash <* updatePositions '-' + guardOrSmartEnabled =<< getExportSetting exportSpecialStrings + pure <$> dash <* updatePositions '-' orgEllipses = do - guard =<< getExportSetting exportSpecialStrings - ellipses <* updatePositions '.' - orgApostrophe = - (char '\'' <|> char '\8217') <* updateLastPreCharPos - <* updateLastForbiddenCharPos - *> return (B.str "\x2019") - -singleQuoted :: OrgParser (F Inlines) + guardOrSmartEnabled =<< getExportSetting exportSpecialStrings + pure <$> ellipses <* updatePositions '.' + orgApostrophe = do + guardEnabled Ext_smart + (char '\'' <|> char '\8217') <* updateLastPreCharPos + <* updateLastForbiddenCharPos + returnF (B.str "\x2019") + +guardOrSmartEnabled :: PandocMonad m => Bool -> OrgParser m () +guardOrSmartEnabled b = do + smartExtension <- extensionEnabled Ext_smart <$> getOption readerExtensions + guard (b || smartExtension) + +singleQuoted :: PandocMonad m => OrgParser m (F Inlines) singleQuoted = try $ do - guard =<< getExportSetting exportSmartQuotes + guardOrSmartEnabled =<< getExportSetting exportSmartQuotes singleQuoteStart updatePositions '\'' withQuoteContext InSingleQuote $ @@ -856,12 +898,15 @@ singleQuoted = try $ do -- doubleQuoted will handle regular double-quoted sections, as well -- as dialogues with an open double-quote without a close double-quote -- in the same paragraph. -doubleQuoted :: OrgParser (F Inlines) +doubleQuoted :: PandocMonad m => OrgParser m (F Inlines) doubleQuoted = try $ do - guard =<< getExportSetting exportSmartQuotes + guardOrSmartEnabled =<< getExportSetting exportSmartQuotes doubleQuoteStart updatePositions '"' contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) - (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return - (fmap B.doubleQuoted . trimInlinesF $ contents)) - <|> (return $ return (B.str "\8220") <> contents) + let doubleQuotedContent = withQuoteContext InDoubleQuote $ do + doubleQuoteEnd + updateLastForbiddenCharPos + return . fmap B.doubleQuoted . trimInlinesF $ contents + let leftQuoteAndContent = return $ pure (B.str "\8220") <> contents + doubleQuotedContent <|> leftQuoteAndContent diff --git a/src/Text/Pandoc/Readers/Org/Meta.hs b/src/Text/Pandoc/Readers/Org/Meta.hs index 1fea3e890..6ad403fd8 100644 --- a/src/Text/Pandoc/Readers/Org/Meta.hs +++ b/src/Text/Pandoc/Readers/Org/Meta.hs @@ -1,7 +1,7 @@ {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE TupleSections #-} +{-# LANGUAGE TupleSections #-} {- -Copyright (C) 2014-2017 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Org.Meta - Copyright : Copyright (C) 2014-2017 Albert Krewinkel + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -33,25 +33,26 @@ module Text.Pandoc.Readers.Org.Meta , metaLine ) where -import Text.Pandoc.Readers.Org.BlockStarts -import Text.Pandoc.Readers.Org.ExportSettings ( exportSettings ) -import Text.Pandoc.Readers.Org.Inlines -import Text.Pandoc.Readers.Org.ParserState -import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.ExportSettings (exportSettings) +import Text.Pandoc.Readers.Org.Inlines +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Builder (Blocks, Inlines) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder ( Blocks, Inlines ) -import Text.Pandoc.Definition +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Shared (safeRead) -import Control.Monad ( mzero, void ) -import Data.Char ( toLower ) -import Data.List ( intersperse ) +import Control.Monad (mzero, void, when) +import Data.Char (toLower) +import Data.List (intersperse) import qualified Data.Map as M -import Data.Monoid ( (<>) ) -import Network.HTTP ( urlEncode ) +import Network.HTTP (urlEncode) -- | Returns the current meta, respecting export options. -metaExport :: OrgParser (F Meta) +metaExport :: Monad m => OrgParser m (F Meta) metaExport = do st <- getState let settings = orgStateExportSettings st @@ -68,29 +69,32 @@ removeMeta key meta' = -- | Parse and handle a single line containing meta information -- The order, in which blocks are tried, makes sure that we're not looking at -- the beginning of a block, so we don't need to check for it -metaLine :: OrgParser Blocks +metaLine :: PandocMonad m => OrgParser m Blocks metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) -declarationLine :: OrgParser () +declarationLine :: PandocMonad m => OrgParser m () declarationLine = try $ do key <- map toLower <$> metaKey (key', value) <- metaValue key - updateState $ \st -> - let meta' = B.setMeta key' <$> value <*> pure nullMeta - in st { orgStateMeta = meta' <> orgStateMeta st } + let addMetaValue st = + st { orgStateMeta = B.setMeta key' <$> value <*> orgStateMeta st } + when (key' /= "results") $ updateState addMetaValue -metaKey :: OrgParser String +metaKey :: Monad m => OrgParser m String metaKey = map toLower <$> many1 (noneOf ": \n\r") <* char ':' <* skipSpaces -metaValue :: String -> OrgParser (String, (F MetaValue)) +metaValue :: PandocMonad m => String -> OrgParser m (String, F MetaValue) metaValue key = let inclKey = "header-includes" in case key of "author" -> (key,) <$> metaInlinesCommaSeparated + "keywords" -> (key,) <$> metaInlinesCommaSeparated "title" -> (key,) <$> metaInlines + "subtitle" -> (key,) <$> metaInlines "date" -> (key,) <$> metaInlines + "nocite" -> (key,) <$> accumulatingList key metaInlines "header-includes" -> (key,) <$> accumulatingList key metaInlines "latex_header" -> (inclKey,) <$> accumulatingList inclKey (metaExportSnippet "latex") @@ -103,32 +107,32 @@ metaValue key = accumulatingList inclKey (metaExportSnippet "html") _ -> (key,) <$> metaString -metaInlines :: OrgParser (F MetaValue) +metaInlines :: PandocMonad m => OrgParser m (F MetaValue) metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline -metaInlinesCommaSeparated :: OrgParser (F MetaValue) +metaInlinesCommaSeparated :: PandocMonad m => OrgParser m (F MetaValue) metaInlinesCommaSeparated = do - authStrs <- (many1 (noneOf ",\n")) `sepBy1` (char ',') + itemStrs <- many1 (noneOf ",\n") `sepBy1` char ',' newline - authors <- mapM (parseFromString inlinesTillNewline . (++ "\n")) authStrs + items <- mapM (parseFromString inlinesTillNewline . (++ "\n")) itemStrs let toMetaInlines = MetaInlines . B.toList - return $ MetaList . map toMetaInlines <$> sequence authors + return $ MetaList . map toMetaInlines <$> sequence items -metaString :: OrgParser (F MetaValue) +metaString :: Monad m => OrgParser m (F MetaValue) metaString = metaModifiedString id -metaModifiedString :: (String -> String) -> OrgParser (F MetaValue) +metaModifiedString :: Monad m => (String -> String) -> OrgParser m (F MetaValue) metaModifiedString f = return . MetaString . f <$> anyLine -- | Read an format specific meta definition -metaExportSnippet :: String -> OrgParser (F MetaValue) +metaExportSnippet :: Monad m => String -> OrgParser m (F MetaValue) metaExportSnippet format = return . MetaInlines . B.toList . B.rawInline format <$> anyLine -- | Accumulate the result of the @parser@ in a list under @key@. -accumulatingList :: String - -> OrgParser (F MetaValue) - -> OrgParser (F MetaValue) +accumulatingList :: Monad m => String + -> OrgParser m (F MetaValue) + -> OrgParser m (F MetaValue) accumulatingList key p = do value <- p meta' <- orgStateMeta <$> getState @@ -141,7 +145,7 @@ accumulatingList key p = do -- -- export options -- -optionLine :: OrgParser () +optionLine :: Monad m => OrgParser m () optionLine = try $ do key <- metaKey case key of @@ -150,16 +154,19 @@ optionLine = try $ do "todo" -> todoSequence >>= updateState . registerTodoSequence "seq_todo" -> todoSequence >>= updateState . registerTodoSequence "typ_todo" -> todoSequence >>= updateState . registerTodoSequence - _ -> mzero + "macro" -> macroDefinition >>= updateState . registerMacro + "pandoc-emphasis-pre" -> emphChars >>= updateState . setEmphasisPreChar + "pandoc-emphasis-post" -> emphChars >>= updateState . setEmphasisPostChar + _ -> mzero -addLinkFormat :: String +addLinkFormat :: Monad m => String -> (String -> String) - -> OrgParser () + -> OrgParser m () addLinkFormat key formatter = updateState $ \s -> let fs = orgStateLinkFormatters s in s{ orgStateLinkFormatters = M.insert key formatter fs } -parseLinkFormat :: OrgParser ((String, String -> String)) +parseLinkFormat :: Monad m => OrgParser m (String, String -> String) parseLinkFormat = try $ do linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces linkSubst <- parseFormat @@ -167,9 +174,8 @@ parseLinkFormat = try $ do -- | An ad-hoc, single-argument-only implementation of a printf-style format -- parser. -parseFormat :: OrgParser (String -> String) -parseFormat = try $ do - replacePlain <|> replaceUrl <|> justAppend +parseFormat :: Monad m => OrgParser m (String -> String) +parseFormat = try $ replacePlain <|> replaceUrl <|> justAppend where -- inefficient, but who cares replacePlain = try $ (\x -> concat . flip intersperse x) @@ -181,13 +187,34 @@ parseFormat = try $ do rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r") tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:"")) -inlinesTillNewline :: OrgParser (F Inlines) -inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline +setEmphasisPreChar :: Maybe [Char] -> OrgParserState -> OrgParserState +setEmphasisPreChar csMb st = + let preChars = case csMb of + Nothing -> orgStateEmphasisPreChars defaultOrgParserState + Just cs -> cs + in st { orgStateEmphasisPreChars = preChars } + +setEmphasisPostChar :: Maybe [Char] -> OrgParserState -> OrgParserState +setEmphasisPostChar csMb st = + let postChars = case csMb of + Nothing -> orgStateEmphasisPostChars defaultOrgParserState + Just cs -> cs + in st { orgStateEmphasisPostChars = postChars } + +emphChars :: Monad m => OrgParser m (Maybe [Char]) +emphChars = do + skipSpaces + safeRead <$> anyLine + +inlinesTillNewline :: PandocMonad m => OrgParser m (F Inlines) +inlinesTillNewline = do + updateLastPreCharPos + trimInlinesF . mconcat <$> manyTill inline newline -- -- ToDo Sequences and Keywords -- -todoSequence :: OrgParser TodoSequence +todoSequence :: Monad m => OrgParser m TodoSequence todoSequence = try $ do todoKws <- todoKeywords doneKws <- optionMaybe $ todoDoneSep *> todoKeywords @@ -201,13 +228,13 @@ todoSequence = try $ do (x:xs) -> return $ keywordsToSequence (reverse xs) [x] where - todoKeywords :: OrgParser [String] + todoKeywords :: Monad m => OrgParser m [String] todoKeywords = try $ let keyword = many1 nonspaceChar <* skipSpaces endOfKeywords = todoDoneSep <|> void newline in manyTill keyword (lookAhead endOfKeywords) - todoDoneSep :: OrgParser () + todoDoneSep :: Monad m => OrgParser m () todoDoneSep = void . try $ skipSpaces *> char '|' <* skipSpaces1 keywordsToSequence :: [String] -> [String] -> TodoSequence @@ -215,3 +242,27 @@ todoSequence = try $ do let todoMarkers = map (TodoMarker Todo) todo doneMarkers = map (TodoMarker Done) done in todoMarkers ++ doneMarkers + +macroDefinition :: Monad m => OrgParser m (String, [String] -> String) +macroDefinition = try $ do + macroName <- many1 nonspaceChar <* skipSpaces + firstPart <- expansionPart + (elemOrder, parts) <- unzip <$> many ((,) <$> placeholder <*> expansionPart) + let expander = mconcat . alternate (firstPart:parts) . reorder elemOrder + return (macroName, expander) + where + placeholder :: Monad m => OrgParser m Int + placeholder = try . fmap read $ char '$' *> many1 digit + + expansionPart :: Monad m => OrgParser m String + expansionPart = try $ many (notFollowedBy placeholder *> noneOf "\n\r") + + alternate :: [a] -> [a] -> [a] + alternate [] ys = ys + alternate xs [] = xs + alternate (x:xs) (y:ys) = x : y : alternate xs ys + + reorder :: [Int] -> [String] -> [String] + reorder perm xs = + let element n = take 1 $ drop (n - 1) xs + in concatMap element perm diff --git a/src/Text/Pandoc/Readers/Org/ParserState.hs b/src/Text/Pandoc/Readers/Org/ParserState.hs index 38f95ca95..6316766fa 100644 --- a/src/Text/Pandoc/Readers/Org/ParserState.hs +++ b/src/Text/Pandoc/Readers/Org/ParserState.hs @@ -1,8 +1,7 @@ -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {- -Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,8 +19,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Org.Options - Copyright : Copyright (C) 2014-2016 Albert Krewinkel + Module : Text.Pandoc.Readers.Org.ParserState + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -30,16 +29,21 @@ Define the Org-mode parser state. -} module Text.Pandoc.Readers.Org.ParserState ( OrgParserState (..) + , defaultOrgParserState , OrgParserLocal (..) , OrgNoteRecord , HasReaderOptions (..) , HasQuoteContext (..) + , HasMacros (..) , TodoMarker (..) , TodoSequence , TodoState (..) , activeTodoMarkers , registerTodoSequence - , F(..) + , MacroExpander + , lookupMacro + , registerMacro + , F , askF , asksF , trimInlinesF @@ -50,24 +54,28 @@ module Text.Pandoc.Readers.Org.ParserState , optionsToParserState ) where -import Control.Monad (liftM, liftM2) -import Control.Monad.Reader (Reader, runReader, ask, asks, local) +import Control.Monad.Reader (ReaderT, asks, local) -import Data.Default (Default(..)) +import Data.Default (Default (..)) import qualified Data.Map as M import qualified Data.Set as Set - -import Text.Pandoc.Builder ( Inlines, Blocks, trimInlines ) -import Text.Pandoc.Definition ( Meta(..), nullMeta ) -import Text.Pandoc.Options ( ReaderOptions(..) ) -import Text.Pandoc.Parsing ( HasHeaderMap(..) - , HasIdentifierList(..) - , HasLastStrPosition(..) - , HasQuoteContext(..) - , HasReaderOptions(..) - , ParserContext(..) - , QuoteContext(..) - , SourcePos ) +import Data.Text (Text) + +import Text.Pandoc.Builder (Blocks, Inlines) +import Text.Pandoc.Definition (Meta (..), nullMeta) +import Text.Pandoc.Logging +import Text.Pandoc.Options (ReaderOptions (..)) +import Text.Pandoc.Parsing (Future, HasHeaderMap (..), HasIdentifierList (..), + HasIncludeFiles (..), HasLastStrPosition (..), + HasLogMessages (..), HasMacros (..), + HasQuoteContext (..), HasReaderOptions (..), + ParserContext (..), QuoteContext (..), SourcePos, + askF, asksF, returnF, runF, trimInlinesF) +import Text.Pandoc.Readers.LaTeX.Types (Macro) + +-- | This is used to delay evaluation until all relevant information has been +-- parsed and made available in the parser state. +type F = Future OrgParserState -- | An inline note / footnote containing the note key and its (inline) value. type OrgNoteRecord = (String, F Blocks) @@ -76,6 +84,8 @@ type OrgNoteTable = [OrgNoteRecord] -- | Map of functions for link transformations. The map key is refers to the -- link-type, the corresponding function transforms the given link string. type OrgLinkFormatters = M.Map String (String -> String) +-- | Macro expander function +type MacroExpander = [String] -> String -- | The states in which a todo item can be data TodoState = Todo | Done @@ -95,22 +105,34 @@ type TodoSequence = [TodoMarker] data OrgParserState = OrgParserState { orgStateAnchorIds :: [String] , orgStateEmphasisCharStack :: [Char] + , orgStateEmphasisPreChars :: [Char] -- ^ Chars allowed to occur before + -- emphasis; spaces and newlines are + -- always ok in addition to what is + -- specified here. + , orgStateEmphasisPostChars :: [Char] -- ^ Chars allowed at after emphasis , orgStateEmphasisNewlines :: Maybe Int , orgStateExportSettings :: ExportSettings , orgStateHeaderMap :: M.Map Inlines String , orgStateIdentifiers :: Set.Set String + , orgStateIncludeFiles :: [String] , orgStateLastForbiddenCharPos :: Maybe SourcePos , orgStateLastPreCharPos :: Maybe SourcePos , orgStateLastStrPos :: Maybe SourcePos , orgStateLinkFormatters :: OrgLinkFormatters + , orgStateMacros :: M.Map String MacroExpander + , orgStateMacroDepth :: Int , orgStateMeta :: F Meta , orgStateNotes' :: OrgNoteTable , orgStateOptions :: ReaderOptions , orgStateParserContext :: ParserContext , orgStateTodoSequences :: [TodoSequence] + , orgLogMessages :: [LogMessage] + , orgMacros :: M.Map Text Macro } -data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext } +data OrgParserLocal = OrgParserLocal + { orgLocalQuoteContext :: QuoteContext + } instance Default OrgParserLocal where def = OrgParserLocal NoQuote @@ -122,7 +144,7 @@ instance HasLastStrPosition OrgParserState where getLastStrPos = orgStateLastStrPos setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } -instance HasQuoteContext st (Reader OrgParserLocal) where +instance Monad m => HasQuoteContext st (ReaderT OrgParserLocal m) where getQuoteContext = asks orgLocalQuoteContext withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q}) @@ -134,26 +156,47 @@ instance HasHeaderMap OrgParserState where extractHeaderMap = orgStateHeaderMap updateHeaderMap f s = s{ orgStateHeaderMap = f (orgStateHeaderMap s) } +instance HasLogMessages OrgParserState where + addLogMessage msg st = st{ orgLogMessages = msg : orgLogMessages st } + getLogMessages st = reverse $ orgLogMessages st + +instance HasMacros OrgParserState where + extractMacros st = orgMacros st + updateMacros f st = st{ orgMacros = f (orgMacros st) } + +instance HasIncludeFiles OrgParserState where + getIncludeFiles = orgStateIncludeFiles + addIncludeFile f st = st { orgStateIncludeFiles = f : orgStateIncludeFiles st } + dropLatestIncludeFile st = + st { orgStateIncludeFiles = drop 1 $ orgStateIncludeFiles st } + instance Default OrgParserState where def = defaultOrgParserState defaultOrgParserState :: OrgParserState defaultOrgParserState = OrgParserState { orgStateAnchorIds = [] + , orgStateEmphasisPreChars = "-\t ('\"{" + , orgStateEmphasisPostChars = "-\t\n .,:!?;'\")}[" , orgStateEmphasisCharStack = [] , orgStateEmphasisNewlines = Nothing , orgStateExportSettings = def , orgStateHeaderMap = M.empty , orgStateIdentifiers = Set.empty + , orgStateIncludeFiles = [] , orgStateLastForbiddenCharPos = Nothing , orgStateLastPreCharPos = Nothing , orgStateLastStrPos = Nothing , orgStateLinkFormatters = M.empty + , orgStateMacros = M.empty + , orgStateMacroDepth = 0 , orgStateMeta = return nullMeta , orgStateNotes' = [] , orgStateOptions = def , orgStateParserContext = NullState , orgStateTodoSequences = [] + , orgLogMessages = [] + , orgMacros = M.empty } optionsToParserState :: ReaderOptions -> OrgParserState @@ -177,6 +220,15 @@ activeTodoSequences st = activeTodoMarkers :: OrgParserState -> TodoSequence activeTodoMarkers = concat . activeTodoSequences +lookupMacro :: String -> OrgParserState -> Maybe MacroExpander +lookupMacro macroName = M.lookup macroName . orgStateMacros + +registerMacro :: (String, MacroExpander) -> OrgParserState -> OrgParserState +registerMacro (name, expander) st = + let curMacros = orgStateMacros st + in st{ orgStateMacros = M.insert name expander curMacros } + + -- -- Export Settings @@ -191,20 +243,22 @@ data ArchivedTreesOption = -- | Export settings <http://orgmode.org/manual/Export-settings.html> -- These settings can be changed via OPTIONS statements. data ExportSettings = ExportSettings - { exportArchivedTrees :: ArchivedTreesOption -- ^ How to treat archived trees - , exportDrawers :: Either [String] [String] + { exportArchivedTrees :: ArchivedTreesOption -- ^ How to treat archived trees + , exportDrawers :: Either [String] [String] -- ^ Specify drawer names which should be exported. @Left@ names are -- explicitly excluded from the resulting output while @Right@ means that -- only the listed drawer names should be included. - , exportEmphasizedText :: Bool -- ^ Parse emphasized text - , exportHeadlineLevels :: Int + , exportEmphasizedText :: Bool -- ^ Parse emphasized text + , exportHeadlineLevels :: Int -- ^ Maximum depth of headlines, deeper headlines are convert to list - , exportSmartQuotes :: Bool -- ^ Parse quotes smartly - , exportSpecialStrings :: Bool -- ^ Parse ellipses and dashes smartly - , exportSubSuperscripts :: Bool -- ^ TeX-like syntax for sub- and superscripts - , exportWithAuthor :: Bool -- ^ Include author in final meta-data - , exportWithCreator :: Bool -- ^ Include creator in final meta-data - , exportWithEmail :: Bool -- ^ Include email in final meta-data + , exportPreserveBreaks :: Bool -- ^ Whether to preserve linebreaks + , exportSmartQuotes :: Bool -- ^ Parse quotes smartly + , exportSpecialStrings :: Bool -- ^ Parse ellipses and dashes smartly + , exportSubSuperscripts :: Bool -- ^ TeX-like syntax for sub- and superscripts + , exportWithAuthor :: Bool -- ^ Include author in final meta-data + , exportWithCreator :: Bool -- ^ Include creator in final meta-data + , exportWithEmail :: Bool -- ^ Include email in final meta-data + , exportWithTags :: Bool -- ^ Keep tags as part of headlines , exportWithTodoKeywords :: Bool -- ^ Keep TODO keywords in headers } @@ -217,43 +271,13 @@ defaultExportSettings = ExportSettings , exportDrawers = Left ["LOGBOOK"] , exportEmphasizedText = True , exportHeadlineLevels = 3 - , exportSmartQuotes = True + , exportPreserveBreaks = False + , exportSmartQuotes = False , exportSpecialStrings = True , exportSubSuperscripts = True , exportWithAuthor = True , exportWithCreator = True , exportWithEmail = True + , exportWithTags = True , exportWithTodoKeywords = True } - - --- --- Parser state reader --- - --- | Reader monad wrapping the parser state. This is used to delay evaluation --- until all relevant information has been parsed and made available in the --- parser state. See also the newtype of the same name in --- Text.Pandoc.Parsing. -newtype F a = F { unF :: Reader OrgParserState a - } deriving (Functor, Applicative, Monad) - -instance Monoid a => Monoid (F a) where - mempty = return mempty - mappend = liftM2 mappend - mconcat = fmap mconcat . sequence - -runF :: F a -> OrgParserState -> a -runF = runReader . unF - -askF :: F OrgParserState -askF = F ask - -asksF :: (OrgParserState -> a) -> F a -asksF f = F $ asks f - -trimInlinesF :: F Inlines -> F Inlines -trimInlinesF = liftM trimInlines - -returnF :: Monad m => a -> m (F a) -returnF = return . return diff --git a/src/Text/Pandoc/Readers/Org/Parsing.hs b/src/Text/Pandoc/Readers/Org/Parsing.hs index 95415f823..36420478b 100644 --- a/src/Text/Pandoc/Readers/Org/Parsing.hs +++ b/src/Text/Pandoc/Readers/Org/Parsing.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,8 +17,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Org.Options - Copyright : Copyright (C) 2014-2016 Albert Krewinkel + Module : Text.Pandoc.Readers.Org.Parsing + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -31,6 +31,8 @@ functions are adapted to Org-mode specific functionality. module Text.Pandoc.Readers.Org.Parsing ( OrgParser , anyLine + , anyLineNewline + , indentWith , blanklines , newline , parseFromString @@ -70,8 +72,11 @@ module Text.Pandoc.Readers.Org.Parsing , dash , ellipses , citeKey + , gridTableWith + , insertIncludedFileF -- * Re-exports from Text.Pandoc.Parsec , runParser + , runParserT , getInput , char , letter @@ -107,24 +112,24 @@ module Text.Pandoc.Readers.Org.Parsing , getPosition ) where -import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Parsing hiding (F, anyLine, blanklines, newline, + parseFromString) import qualified Text.Pandoc.Parsing as P -import Text.Pandoc.Parsing hiding ( anyLine, blanklines, newline - , parseFromString ) -import Control.Monad ( guard ) -import Control.Monad.Reader ( Reader ) +import Control.Monad (guard) +import Control.Monad.Reader (ReaderT) -- | The parser used to read org files. -type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal) +type OrgParser m = ParserT [Char] OrgParserState (ReaderT OrgParserLocal m) -- -- Adaptions and specializations of parsing utilities -- -- | Parse any line of text -anyLine :: OrgParser String +anyLine :: Monad m => OrgParser m String anyLine = P.anyLine <* updateLastPreCharPos @@ -132,7 +137,7 @@ anyLine = -- The version Text.Pandoc.Parsing cannot be used, as we need additional parts -- of the state saved and restored. -parseFromString :: OrgParser a -> String -> OrgParser a +parseFromString :: Monad m => OrgParser m a -> String -> OrgParser m a parseFromString parser str' = do oldLastPreCharPos <- orgStateLastPreCharPos <$> getState updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } @@ -141,33 +146,34 @@ parseFromString parser str' = do return result -- | Skip one or more tab or space characters. -skipSpaces1 :: OrgParser () +skipSpaces1 :: Monad m => OrgParser m () skipSpaces1 = skipMany1 spaceChar -- | Like @Text.Parsec.Char.newline@, but causes additional state changes. -newline :: OrgParser Char +newline :: Monad m => OrgParser m Char newline = P.newline <* updateLastPreCharPos <* updateLastForbiddenCharPos -- | Like @Text.Parsec.Char.blanklines@, but causes additional state changes. -blanklines :: OrgParser [Char] +blanklines :: Monad m => OrgParser m [Char] blanklines = P.blanklines <* updateLastPreCharPos <* updateLastForbiddenCharPos -- | Succeeds when we're in list context. -inList :: OrgParser () +inList :: Monad m => OrgParser m () inList = do ctx <- orgStateParserContext <$> getState guard (ctx == ListItemState) -- | Parse in different context -withContext :: ParserContext -- ^ New parser context - -> OrgParser a -- ^ Parser to run in that context - -> OrgParser a +withContext :: Monad m + => ParserContext -- ^ New parser context + -> OrgParser m a -- ^ Parser to run in that context + -> OrgParser m a withContext context parser = do oldContext <- orgStateParserContext <$> getState updateState $ \s -> s{ orgStateParserContext = context } @@ -180,19 +186,19 @@ withContext context parser = do -- -- | Get an export setting. -getExportSetting :: (ExportSettings -> a) -> OrgParser a +getExportSetting :: Monad m => (ExportSettings -> a) -> OrgParser m a getExportSetting s = s . orgStateExportSettings <$> getState -- | Set the current position as the last position at which a forbidden char -- was found (i.e. a character which is not allowed at the inner border of -- markup). -updateLastForbiddenCharPos :: OrgParser () +updateLastForbiddenCharPos :: Monad m => OrgParser m () updateLastForbiddenCharPos = getPosition >>= \p -> updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} -- | Set the current parser position as the position at which a character was -- seen which allows inline markup to follow. -updateLastPreCharPos :: OrgParser () +updateLastPreCharPos :: Monad m => OrgParser m () updateLastPreCharPos = getPosition >>= \p -> updateState $ \s -> s{ orgStateLastPreCharPos = Just p} @@ -201,15 +207,15 @@ updateLastPreCharPos = getPosition >>= \p -> -- -- | Read the key of a plist style key-value list. -orgArgKey :: OrgParser String +orgArgKey :: Monad m => OrgParser m String orgArgKey = try $ skipSpaces *> char ':' *> many1 orgArgWordChar -- | Read the value of a plist style key-value list. -orgArgWord :: OrgParser String +orgArgWord :: Monad m => OrgParser m String orgArgWord = many1 orgArgWordChar -- | Chars treated as part of a word in plists. -orgArgWordChar :: OrgParser Char +orgArgWordChar :: Monad m => OrgParser m Char orgArgWordChar = alphaNum <|> oneOf "-_" diff --git a/src/Text/Pandoc/Readers/Org/Shared.hs b/src/Text/Pandoc/Readers/Org/Shared.hs index 8c87cfa25..cba72cc07 100644 --- a/src/Text/Pandoc/Readers/Org/Shared.hs +++ b/src/Text/Pandoc/Readers/Org/Shared.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2018 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,8 +18,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Org.Options - Copyright : Copyright (C) 2014-2016 Albert Krewinkel + Module : Text.Pandoc.Readers.Org.Shared + Copyright : Copyright (C) 2014-2018 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -29,14 +29,12 @@ Utility functions used in other Pandoc Org modules. module Text.Pandoc.Readers.Org.Shared ( cleanLinkString , isImageFilename - , rundocBlockClass - , toRundocAttrib + , originalLang , translateLang ) where -import Control.Arrow ( first ) -import Data.Char ( isAlphaNum ) -import Data.List ( isPrefixOf, isSuffixOf ) +import Data.Char (isAlphaNum) +import Data.List (isPrefixOf, isSuffixOf) -- | Check whether the given string looks like the path to of URL of an image. @@ -58,8 +56,8 @@ cleanLinkString s = '.':'/':_ -> Just s -- relative path '.':'.':'/':_ -> Just s -- relative path -- Relative path or URL (file schema) - 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' - _ | isUrl s -> Just s -- URL + 'f':'i':'l':'e':':':s' -> Just $ if "//" `isPrefixOf` s' then s else s' + _ | isUrl s -> Just s -- URL _ -> Nothing where isUrl :: String -> Bool @@ -68,17 +66,17 @@ cleanLinkString s = in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme && not (null path) --- | Prefix used for Rundoc classes and arguments. -rundocPrefix :: String -rundocPrefix = "rundoc-" +-- | Creates an key-value pair marking the original language name specified for +-- a piece of source code. --- | The class-name used to mark rundoc blocks. -rundocBlockClass :: String -rundocBlockClass = rundocPrefix ++ "block" - --- | Prefix the name of a attribute, marking it as a code execution parameter. -toRundocAttrib :: (String, String) -> (String, String) -toRundocAttrib = first (rundocPrefix ++) +-- | Creates an key-value attributes marking the original language name +-- specified for a piece of source code. +originalLang :: String -> [(String, String)] +originalLang lang = + let transLang = translateLang lang + in if transLang == lang + then [] + else [("org-language", lang)] -- | Translate from Org-mode's programming language identifiers to those used -- by Pandoc. This is useful to allow for proper syntax highlighting in diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index e05b6cba2..e88d997f0 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -1,7 +1,8 @@ -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} {- -Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2015 John MacFarlane + Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -29,38 +30,50 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion from reStructuredText to 'Pandoc' document. -} -module Text.Pandoc.Readers.RST ( - readRST, - readRSTWithWarnings - ) where -import Text.Pandoc.Definition -import Text.Pandoc.Builder (setMeta, fromList) -import Text.Pandoc.Shared -import Text.Pandoc.Parsing -import Text.Pandoc.Options -import Control.Monad ( when, liftM, guard, mzero ) -import Data.List ( findIndex, intercalate, - transpose, sort, deleteFirstsBy, isSuffixOf , nub, union) -import Data.Maybe (fromMaybe) +module Text.Pandoc.Readers.RST ( readRST ) where +import Control.Arrow (second) +import Control.Monad (forM_, guard, liftM, mplus, mzero, when) +import Control.Monad.Except (throwError) +import Control.Monad.Identity (Identity (..)) +import Data.Char (isHexDigit, isSpace, toLower, toUpper) +import Data.List (deleteFirstsBy, elemIndex, intercalate, isInfixOf, isSuffixOf, + nub, sort, transpose, union) import qualified Data.Map as M -import Text.Printf ( printf ) -import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) -import qualified Text.Pandoc.Builder as B -import Data.Sequence (viewr, ViewR(..)) -import Data.Char (toLower, isHexDigit, isSpace) +import Data.Maybe (fromMaybe, isJust) import Data.Monoid ((<>)) +import Data.Sequence (ViewR (..), viewr) +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Builder (Blocks, Inlines, fromList, setMeta, trimInlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad, fetchItem, readFileFromDirs) +import Text.Pandoc.CSV (CSVOptions (..), defaultCSVOptions, parseCSV) +import Text.Pandoc.Definition import Text.Pandoc.Error +import Text.Pandoc.ImageSize (lengthToDim, scaleDimension) +import Text.Pandoc.Logging +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Shared +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Printf (printf) --- | Parse reStructuredText string and return Pandoc document. -readRST :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError Pandoc -readRST opts s = (readWith parseRST) def{ stateOptions = opts } (s ++ "\n\n") - -readRSTWithWarnings :: ReaderOptions -> String -> Either PandocError (Pandoc, [String]) -readRSTWithWarnings opts s = (readWithWarnings parseRST) def{ stateOptions = opts } (s ++ "\n\n") +-- TODO: +-- [ ] .. parsed-literal -type RSTParser = Parser [Char] ParserState +-- | Parse reStructuredText string and return Pandoc document. +readRST :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readRST opts s = do + parsed <- readWithM parseRST def{ stateOptions = opts } + (T.unpack (crFilter s) ++ "\n\n") + case parsed of + Right result -> return result + Left e -> throwError e + +type RSTParser m = ParserT [Char] ParserState m -- -- Constants and data structure definitions @@ -87,9 +100,9 @@ isHeader _ _ = False -- | Promote all headers in a list of blocks. (Part of -- title transformation for RST.) promoteHeaders :: Int -> [Block] -> [Block] -promoteHeaders num ((Header level attr text):rest) = - (Header (level - num) attr text):(promoteHeaders num rest) -promoteHeaders num (other:rest) = other:(promoteHeaders num rest) +promoteHeaders num (Header level attr text:rest) = + Header (level - num) attr text:promoteHeaders num rest +promoteHeaders num (other:rest) = other:promoteHeaders num rest promoteHeaders _ [] = [] -- | If list of blocks starts with a header (or a header and subheader) @@ -101,11 +114,11 @@ titleTransform :: ([Block], Meta) -- ^ list of blocks, metadata titleTransform (bs, meta) = let (bs', meta') = case bs of - ((Header 1 _ head1):(Header 2 _ head2):rest) + (Header 1 _ head1:Header 2 _ head2:rest) | not (any (isHeader 1) rest || any (isHeader 2) rest) -> -- tit/sub (promoteHeaders 2 rest, setMeta "title" (fromList head1) $ setMeta "subtitle" (fromList head2) meta) - ((Header 1 _ head1):rest) + (Header 1 _ head1:rest) | not (any (isHeader 1) rest) -> -- title only (promoteHeaders 1 rest, setMeta "title" (fromList head1) meta) @@ -121,8 +134,10 @@ metaFromDefList ds meta = adjustAuthors $ foldr f meta ds adjustAuthors (Meta metamap) = Meta $ M.adjust splitAuthors "author" $ M.adjust toPlain "date" $ M.adjust toPlain "title" - $ M.mapKeys (\k -> if k == "authors" then "author" else k) - $ metamap + $ M.mapKeys (\k -> + if k == "authors" + then "author" + else k) metamap toPlain (MetaBlocks [Para xs]) = MetaInlines xs toPlain x = x splitAuthors (MetaBlocks [Para xs]) @@ -131,6 +146,12 @@ metaFromDefList ds meta = adjustAuthors $ foldr f meta ds splitAuthors x = x splitAuthors' = map normalizeSpaces . splitOnSemi . concatMap factorSemi + normalizeSpaces = reverse . dropWhile isSp . reverse . + dropWhile isSp + isSp Space = True + isSp SoftBreak = True + isSp LineBreak = True + isSp _ = False splitOnSemi = splitBy (==Str ";") factorSemi (Str []) = [] factorSemi (Str s) = case break (==';') s of @@ -141,41 +162,61 @@ metaFromDefList ds meta = adjustAuthors $ foldr f meta ds factorSemi (Str ys) factorSemi x = [x] -parseRST :: RSTParser Pandoc +parseRST :: PandocMonad m => RSTParser m Pandoc parseRST = do optional blanklines -- skip blank lines at beginning of file startPos <- getPosition -- go through once just to get list of reference keys and notes -- docMinusKeys is the raw document with blanks where the keys were... docMinusKeys <- concat <$> - manyTill (referenceKey <|> noteBlock <|> lineClump) eof + manyTill (referenceKey <|> anchorDef <|> + noteBlock <|> citationBlock <|> + headerBlock <|> lineClump) eof setInput docMinusKeys setPosition startPos st' <- getState let reversedNotes = stateNotes st' - updateState $ \s -> s { stateNotes = reverse reversedNotes } + updateState $ \s -> s { stateNotes = reverse reversedNotes + , stateHeaders = mempty + , stateIdentifiers = mempty } -- now parse it for real... blocks <- B.toList <$> parseBlocks + citations <- (sort . M.toList . stateCitations) <$> getState + citationItems <- mapM parseCitation citations + let refBlock = if null citationItems + then [] + else [Div ("citations",[],[]) $ + B.toList $ B.definitionList citationItems] standalone <- getOption readerStandalone state <- getState let meta = stateMeta state let (blocks', meta') = if standalone then titleTransform (blocks, meta) else (blocks, meta) - return $ Pandoc meta' blocks' + reportLogMessages + return $ Pandoc meta' (blocks' ++ refBlock) + +parseCitation :: PandocMonad m + => (String, String) -> RSTParser m (Inlines, [Blocks]) +parseCitation (ref, raw) = do + contents <- parseFromString' parseBlocks raw + return (B.spanWith (ref, ["citation-label"], []) (B.str ref), + [contents]) + -- -- parsing blocks -- -parseBlocks :: RSTParser Blocks +parseBlocks :: PandocMonad m => RSTParser m Blocks parseBlocks = mconcat <$> manyTill block eof -block :: RSTParser Blocks +block :: PandocMonad m => RSTParser m Blocks block = choice [ codeBlock , blockQuote , fieldList , directive + , anchor , comment , header , hrule @@ -191,7 +232,7 @@ block = choice [ codeBlock -- field list -- -rawFieldListItem :: Int -> RSTParser (String, String) +rawFieldListItem :: Monad m => Int -> RSTParser m (String, String) rawFieldListItem minIndent = try $ do indent <- length <$> many (char ' ') guard $ indent >= minIndent @@ -204,15 +245,15 @@ rawFieldListItem minIndent = try $ do let raw = (if null first then "" else (first ++ "\n")) ++ rest ++ "\n" return (name, raw) -fieldListItem :: Int -> RSTParser (Inlines, [Blocks]) +fieldListItem :: PandocMonad m => Int -> RSTParser m (Inlines, [Blocks]) fieldListItem minIndent = try $ do (name, raw) <- rawFieldListItem minIndent term <- parseInlineFromString name - contents <- parseFromString parseBlocks raw + contents <- parseFromString' parseBlocks raw optional blanklines return (term, [contents]) -fieldList :: RSTParser Blocks +fieldList :: PandocMonad m => RSTParser m Blocks fieldList = try $ do indent <- length <$> lookAhead (many spaceChar) items <- many1 $ fieldListItem indent @@ -224,43 +265,62 @@ fieldList = try $ do -- line block -- -lineBlock :: RSTParser Blocks +lineBlock :: PandocMonad m => RSTParser m Blocks lineBlock = try $ do lines' <- lineBlockLines lines'' <- mapM parseInlineFromString lines' return $ B.lineBlock lines'' +lineBlockDirective :: PandocMonad m => String -> RSTParser m Blocks +lineBlockDirective body = do + lines' <- mapM parseInlineFromString $ lines $ stripTrailingNewlines body + return $ B.lineBlock lines' + -- -- paragraph block -- -- note: paragraph can end in a :: starting a code block -para :: RSTParser Blocks +para :: PandocMonad m => RSTParser m Blocks para = try $ do result <- trimInlines . mconcat <$> many1 inline option (B.plain result) $ try $ do newline blanklines case viewr (B.unMany result) of - ys :> (Str xs) | "::" `isSuffixOf` xs -> do + ys :> Str xs | "::" `isSuffixOf` xs -> do raw <- option mempty codeBlockBody return $ B.para (B.Many ys <> B.str (take (length xs - 1) xs)) <> raw _ -> return (B.para result) -plain :: RSTParser Blocks +plain :: PandocMonad m => RSTParser m Blocks plain = B.plain . trimInlines . mconcat <$> many1 inline -- -- header blocks -- -header :: RSTParser Blocks +header :: PandocMonad m => RSTParser m Blocks header = doubleHeader <|> singleHeader <?> "header" -- a header with lines on top and bottom -doubleHeader :: RSTParser Blocks -doubleHeader = try $ do +doubleHeader :: PandocMonad m => RSTParser m Blocks +doubleHeader = do + (txt, c) <- doubleHeader' + -- check to see if we've had this kind of header before. + -- if so, get appropriate level. if not, add to list. + state <- getState + let headerTable = stateHeaderTable state + let (headerTable',level) = case elemIndex (DoubleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [DoubleHeader c], length headerTable + 1) + setState (state { stateHeaderTable = headerTable' }) + attr <- registerHeader nullAttr txt + return $ B.headerWith attr level txt + +doubleHeader' :: PandocMonad m => RSTParser m (Inlines, Char) +doubleHeader' = try $ do c <- oneOf underlineChars rest <- many (char c) -- the top line let lenTop = length (c:rest) @@ -268,48 +328,45 @@ doubleHeader = try $ do newline txt <- trimInlines . mconcat <$> many1 (notFollowedBy blankline >> inline) pos <- getPosition - let len = (sourceColumn pos) - 1 - if (len > lenTop) then fail "title longer than border" else return () + let len = sourceColumn pos - 1 + when (len > lenTop) $ fail "title longer than border" blankline -- spaces and newline count lenTop (char c) -- the bottom line blanklines - -- check to see if we've had this kind of header before. - -- if so, get appropriate level. if not, add to list. + return (txt, c) + +-- a header with line on the bottom only +singleHeader :: PandocMonad m => RSTParser m Blocks +singleHeader = do + (txt, c) <- singleHeader' state <- getState let headerTable = stateHeaderTable state - let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of + let (headerTable',level) = case elemIndex (SingleHeader c) headerTable of Just ind -> (headerTable, ind + 1) - Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) + Nothing -> (headerTable ++ [SingleHeader c], length headerTable + 1) setState (state { stateHeaderTable = headerTable' }) attr <- registerHeader nullAttr txt return $ B.headerWith attr level txt --- a header with line on the bottom only -singleHeader :: RSTParser Blocks -singleHeader = try $ do +singleHeader' :: PandocMonad m => RSTParser m (Inlines, Char) +singleHeader' = try $ do notFollowedBy' whitespace - txt <- trimInlines . mconcat <$> many1 (do {notFollowedBy blankline; inline}) + lookAhead $ anyLine >> oneOf underlineChars + txt <- trimInlines . mconcat <$> many1 (notFollowedBy blankline >> inline) pos <- getPosition - let len = (sourceColumn pos) - 1 + let len = sourceColumn pos - 1 blankline c <- oneOf underlineChars count (len - 1) (char c) many (char c) blanklines - state <- getState - let headerTable = stateHeaderTable state - let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of - Just ind -> (headerTable, ind + 1) - Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) - setState (state { stateHeaderTable = headerTable' }) - attr <- registerHeader nullAttr txt - return $ B.headerWith attr level txt + return (txt, c) -- -- hrule block -- -hrule :: Parser [Char] st Blocks +hrule :: Monad m => ParserT [Char] st m Blocks hrule = try $ do chr <- oneOf underlineChars count 3 (char chr) @@ -323,14 +380,14 @@ hrule = try $ do -- -- read a line indented by a given string -indentedLine :: String -> Parser [Char] st [Char] +indentedLine :: Monad m => String -> ParserT [Char] st m [Char] indentedLine indents = try $ do string indents anyLine -- one or more indented lines, possibly separated by blank lines. -- any amount of indentation will work. -indentedBlock :: Parser [Char] st [Char] +indentedBlock :: Monad m => ParserT [Char] st m [Char] indentedBlock = try $ do indents <- lookAhead $ many1 spaceChar lns <- many1 $ try $ do b <- option "" blanklines @@ -339,24 +396,24 @@ indentedBlock = try $ do optional blanklines return $ unlines lns -quotedBlock :: Parser [Char] st [Char] +quotedBlock :: Monad m => ParserT [Char] st m [Char] quotedBlock = try $ do quote <- lookAhead $ oneOf "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" lns <- many1 $ lookAhead (char quote) >> anyLine optional blanklines return $ unlines lns -codeBlockStart :: Parser [Char] st Char +codeBlockStart :: Monad m => ParserT [Char] st m Char codeBlockStart = string "::" >> blankline >> blankline -codeBlock :: Parser [Char] st Blocks +codeBlock :: Monad m => ParserT [Char] st m Blocks codeBlock = try $ codeBlockStart >> codeBlockBody -codeBlockBody :: Parser [Char] st Blocks +codeBlockBody :: Monad m => ParserT [Char] st m Blocks codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$> (indentedBlock <|> quotedBlock) -lhsCodeBlock :: RSTParser Blocks +lhsCodeBlock :: Monad m => RSTParser m Blocks lhsCodeBlock = try $ do getPosition >>= guard . (==1) . sourceColumn guardEnabled Ext_literate_haskell @@ -366,14 +423,14 @@ lhsCodeBlock = try $ do return $ B.codeBlockWith ("", ["sourceCode", "literate", "haskell"], []) $ intercalate "\n" lns -latexCodeBlock :: Parser [Char] st [[Char]] +latexCodeBlock :: Monad m => ParserT [Char] st m [[Char]] latexCodeBlock = try $ do try (latexBlockLine "\\begin{code}") many1Till anyLine (try $ latexBlockLine "\\end{code}") where latexBlockLine s = skipMany spaceChar >> string s >> blankline -birdCodeBlock :: Parser [Char] st [[Char]] +birdCodeBlock :: Monad m => ParserT [Char] st m [[Char]] birdCodeBlock = filterSpace <$> many1 birdTrackLine where filterSpace lns = -- if (as is normal) there is always a space after >, drop it @@ -381,99 +438,159 @@ birdCodeBlock = filterSpace <$> many1 birdTrackLine then map (drop 1) lns else lns -birdTrackLine :: Parser [Char] st [Char] +birdTrackLine :: Monad m => ParserT [Char] st m [Char] birdTrackLine = char '>' >> anyLine -- -- block quotes -- -blockQuote :: RSTParser Blocks +blockQuote :: PandocMonad m => RSTParser m Blocks blockQuote = do raw <- indentedBlock -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ raw ++ "\n\n" + contents <- parseFromString' parseBlocks $ raw ++ "\n\n" return $ B.blockQuote contents +{- +Unsupported options for include: +tab-width +encoding +-} + +includeDirective :: PandocMonad m + => String -> [(String, String)] -> String + -> RSTParser m Blocks +includeDirective top fields body = do + let f = trim top + guard $ not (null f) + guard $ null (trim body) + -- options + let (startLine :: Maybe Int) = lookup "start-line" fields >>= safeRead + let (endLine :: Maybe Int) = lookup "end-line" fields >>= safeRead + oldPos <- getPosition + oldInput <- getInput + containers <- stateContainers <$> getState + when (f `elem` containers) $ + throwError $ PandocParseError $ "Include file loop at " ++ show oldPos + updateState $ \s -> s{ stateContainers = f : stateContainers s } + mbContents <- readFileFromDirs ["."] f + contentLines <- case mbContents of + Just s -> return $ lines s + Nothing -> do + logMessage $ CouldNotLoadIncludeFile f oldPos + return [] + let numLines = length contentLines + let startLine' = case startLine of + Nothing -> 1 + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end + let endLine' = case endLine of + Nothing -> numLines + 1 + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end + let contentLines' = drop (startLine' - 1) + $ take (endLine' - 1) contentLines + let contentLines'' = (case trim <$> lookup "end-before" fields of + Just patt -> takeWhile (not . (patt `isInfixOf`)) + Nothing -> id) . + (case trim <$> lookup "start-after" fields of + Just patt -> drop 1 . + dropWhile (not . (patt `isInfixOf`)) + Nothing -> id) $ contentLines' + let contents' = unlines contentLines'' ++ "\n" + case lookup "code" fields of + Just lang -> do + let numberLines = lookup "number-lines" fields + let classes = trimr lang : ["numberLines" | isJust numberLines] ++ + maybe [] words (lookup "class" fields) + let kvs = maybe [] (\n -> [("startFrom", trimr n)]) numberLines + let ident = maybe "" trimr $ lookup "name" fields + let attribs = (ident, classes, kvs) + return $ B.codeBlockWith attribs contents' + Nothing -> case lookup "literal" fields of + Just _ -> return $ B.rawBlock "rst" contents' + Nothing -> do + setPosition $ newPos f 1 1 + setInput contents' + bs <- optional blanklines >> + (mconcat <$> many block) + setInput oldInput + setPosition oldPos + updateState $ \s -> s{ stateContainers = + tail $ stateContainers s } + return bs + + -- -- list blocks -- -list :: RSTParser Blocks +list :: PandocMonad m => RSTParser m Blocks list = choice [ bulletList, orderedList, definitionList ] <?> "list" -definitionListItem :: RSTParser (Inlines, [Blocks]) +definitionListItem :: PandocMonad m => RSTParser m (Inlines, [Blocks]) definitionListItem = try $ do -- avoid capturing a directive or comment notFollowedBy (try $ char '.' >> char '.') term <- trimInlines . mconcat <$> many1Till inline endline raw <- indentedBlock -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ raw ++ "\n" + contents <- parseFromString' parseBlocks $ raw ++ "\n" return (term, [contents]) -definitionList :: RSTParser Blocks +definitionList :: PandocMonad m => RSTParser m Blocks definitionList = B.definitionList <$> many1 definitionListItem -- parses bullet list start and returns its length (inc. following whitespace) -bulletListStart :: Parser [Char] st Int +bulletListStart :: Monad m => ParserT [Char] st m Int bulletListStart = try $ do notFollowedBy' hrule -- because hrules start out just like lists marker <- oneOf bulletListMarkers - white <- many1 spaceChar + white <- many1 spaceChar <|> "" <$ lookAhead (char '\n') return $ length (marker:white) -- parses ordered list start and returns its length (inc following whitespace) -orderedListStart :: ListNumberStyle +orderedListStart :: Monad m => ListNumberStyle -> ListNumberDelim - -> RSTParser Int + -> RSTParser m Int orderedListStart style delim = try $ do (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) - white <- many1 spaceChar + white <- many1 spaceChar <|> "" <$ lookAhead (char '\n') return $ markerLen + length white -- parse a line of a list item -listLine :: Int -> RSTParser [Char] +listLine :: Monad m => Int -> RSTParser m [Char] listLine markerLength = try $ do notFollowedBy blankline indentWith markerLength - line <- anyLine - return $ line ++ "\n" - --- indent by specified number of spaces (or equiv. tabs) -indentWith :: Int -> RSTParser [Char] -indentWith num = do - tabStop <- getOption readerTabStop - if (num < tabStop) - then count num (char ' ') - else choice [ try (count num (char ' ')), - (try (char '\t' >> count (num - tabStop) (char ' '))) ] + anyLineNewline -- parse raw text for one list item, excluding start marker and continuations -rawListItem :: RSTParser Int - -> RSTParser (Int, [Char]) +rawListItem :: Monad m => RSTParser m Int + -> RSTParser m (Int, [Char]) rawListItem start = try $ do markerLength <- start - firstLine <- anyLine + firstLine <- anyLineNewline restLines <- many (listLine markerLength) - return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) + return (markerLength, firstLine ++ concat restLines) -- continuation of a list item - indented and separated by blankline or -- (in compact lists) endline. -- Note: nested lists are parsed as continuations. -listContinuation :: Int -> RSTParser [Char] +listContinuation :: Monad m => Int -> RSTParser m [Char] listContinuation markerLength = try $ do blanks <- many1 blankline result <- many1 (listLine markerLength) return $ blanks ++ concat result -listItem :: RSTParser Int - -> RSTParser Blocks +listItem :: PandocMonad m + => RSTParser m Int + -> RSTParser m Blocks listItem start = try $ do (markerLength, first) <- rawListItem start rest <- many (listContinuation markerLength) - blanks <- choice [ try (many blankline <* lookAhead start), - many1 blankline ] -- whole list must end with blank. + skipMany1 blankline <|> () <$ lookAhead start -- parsing with ListItemState forces markers at beginning of lines to -- count as list item markers, even if not separated by blank space. -- see definition of "endline" @@ -481,52 +598,52 @@ listItem start = try $ do let oldContext = stateParserContext state setState $ state {stateParserContext = ListItemState} -- parse the extracted block, which may itself contain block elements - parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks + parsed <- parseFromString' parseBlocks $ concat (first:rest) ++ "\n" updateState (\st -> st {stateParserContext = oldContext}) return $ case B.toList parsed of - [Para xs] -> B.singleton $ Plain xs - [Para xs, BulletList ys] -> B.fromList [Plain xs, BulletList ys] - [Para xs, OrderedList s ys] -> B.fromList [Plain xs, OrderedList s ys] - [Para xs, DefinitionList ys] -> B.fromList [Plain xs, DefinitionList ys] + [Para xs] -> + B.singleton $ Plain xs + [Para xs, BulletList ys] -> + B.fromList [Plain xs, BulletList ys] + [Para xs, OrderedList s ys] -> + B.fromList [Plain xs, OrderedList s ys] + [Para xs, DefinitionList ys] -> + B.fromList [Plain xs, DefinitionList ys] _ -> parsed -orderedList :: RSTParser Blocks +orderedList :: PandocMonad m => RSTParser m Blocks orderedList = try $ do (start, style, delim) <- lookAhead (anyOrderedListMarker <* spaceChar) items <- many1 (listItem (orderedListStart style delim)) - let items' = compactify' items + let items' = compactify items return $ B.orderedListWith (start, style, delim) items' -bulletList :: RSTParser Blocks -bulletList = B.bulletList . compactify' <$> many1 (listItem bulletListStart) +bulletList :: PandocMonad m => RSTParser m Blocks +bulletList = B.bulletList . compactify <$> many1 (listItem bulletListStart) -- -- directive (e.g. comment, container, compound-paragraph) -- -comment :: RSTParser Blocks +comment :: Monad m => RSTParser m Blocks comment = try $ do string ".." skipMany1 spaceChar <|> (() <$ lookAhead newline) - notFollowedBy' directiveLabel + -- notFollowedBy' directiveLabel -- comment comes after directive so unnec. manyTill anyChar blanklines optional indentedBlock return mempty -directiveLabel :: RSTParser String +directiveLabel :: Monad m => RSTParser m String directiveLabel = map toLower <$> many1Till (letter <|> char '-') (try $ string "::") -directive :: RSTParser Blocks +directive :: PandocMonad m => RSTParser m Blocks directive = try $ do string ".." directive' --- TODO: line-block, parsed-literal, table, csv-table, list-table --- date --- include --- title -directive' :: RSTParser Blocks +directive' :: PandocMonad m => RSTParser m Blocks directive' = do skipMany1 spaceChar label <- directiveLabel @@ -541,43 +658,71 @@ directive' = do body <- option "" $ try $ blanklines >> indentedBlock optional blanklines let body' = body ++ "\n\n" - imgAttr cl = ("", classes, getAtt "width" ++ getAtt "height") + name = trim $ fromMaybe "" (lookup "name" fields) + imgAttr cl = ("", classes, widthAttr ++ heightAttr) where - classes = words $ maybe "" trim $ lookup cl fields - getAtt k = case lookup k fields of - Just v -> [(k, filter (not . isSpace) v)] - Nothing -> [] + classes = words $ maybe "" trim (lookup cl fields) ++ + maybe "" (\x -> "align-" ++ trim x) + (lookup "align" fields) + scale = case trim <$> lookup "scale" fields of + Just v -> case reverse v of + '%':vv -> + case safeRead (reverse vv) of + Just (percent :: Double) + -> percent / 100.0 + Nothing -> 1.0 + _ -> + case safeRead v of + Just (s :: Double) -> s + Nothing -> 1.0 + Nothing -> 1.0 + widthAttr = maybe [] (\x -> [("width", + show $ scaleDimension scale x)]) + $ lookup "width" fields >>= + (lengthToDim . filter (not . isSpace)) + heightAttr = maybe [] (\x -> [("height", + show $ scaleDimension scale x)]) + $ lookup "height" fields >>= + (lengthToDim . filter (not . isSpace)) case label of + "include" -> includeDirective top fields body' + "table" -> tableDirective top fields body' + "list-table" -> listTableDirective top fields body' + "csv-table" -> csvTableDirective top fields body' + "line-block" -> lineBlockDirective body' "raw" -> return $ B.rawBlock (trim top) (stripTrailingNewlines body) - "role" -> addNewRole top $ map (\(k,v) -> (k, trim v)) fields - "container" -> parseFromString parseBlocks body' + "role" -> addNewRole top $ map (second trim) fields + "container" -> B.divWith (name, "container" : words top, []) <$> + parseFromString' parseBlocks body' "replace" -> B.para <$> -- consumed by substKey parseInlineFromString (trim top) "unicode" -> B.para <$> -- consumed by substKey parseInlineFromString (trim $ unicodeTransform top) - "compound" -> parseFromString parseBlocks body' - "pull-quote" -> B.blockQuote <$> parseFromString parseBlocks body' - "epigraph" -> B.blockQuote <$> parseFromString parseBlocks body' - "highlights" -> B.blockQuote <$> parseFromString parseBlocks body' + "compound" -> parseFromString' parseBlocks body' + "pull-quote" -> B.blockQuote <$> parseFromString' parseBlocks body' + "epigraph" -> B.blockQuote <$> parseFromString' parseBlocks body' + "highlights" -> B.blockQuote <$> parseFromString' parseBlocks body' "rubric" -> B.para . B.strong <$> parseInlineFromString top _ | label `elem` ["attention","caution","danger","error","hint", - "important","note","tip","warning"] -> - do bod <- parseFromString parseBlocks $ top ++ "\n\n" ++ body' - return $ B.divWith ("",["admonition", label],[]) bod - "admonition" -> - do bod <- parseFromString parseBlocks $ top ++ "\n\n" ++ body' - return $ B.divWith ("",["admonition"],[]) bod + "important","note","tip","warning","admonition"] -> + do bod <- parseFromString' parseBlocks $ top ++ "\n\n" ++ body' + let lab = case label of + "admonition" -> mempty + (l:ls) -> B.divWith ("",["admonition-title"],[]) + (B.para (B.str (toUpper l : ls))) + [] -> mempty + return $ B.divWith ("",[label],[]) (lab <> bod) "sidebar" -> do let subtit = maybe "" trim $ lookup "subtitle" fields tit <- B.para . B.strong <$> parseInlineFromString (trim top ++ if null subtit then "" else (": " ++ subtit)) - bod <- parseFromString parseBlocks body' + bod <- parseFromString' parseBlocks body' return $ B.divWith ("",["sidebar"],[]) $ tit <> bod "topic" -> do tit <- B.para . B.strong <$> parseInlineFromString top - bod <- parseFromString parseBlocks body' + bod <- parseFromString' parseBlocks body' return $ B.divWith ("",["topic"],[]) $ tit <> bod "default-role" -> mempty <$ updateState (\s -> s { stateRstDefaultRole = @@ -588,14 +733,15 @@ directive' = do codeblock (words $ fromMaybe [] $ lookup "class" fields) (lookup "number-lines" fields) (trim top) body "aafig" -> do - let attribs = ("", ["aafig"], map (\(k,v) -> (k, trimr v)) fields) + let attribs = ("", ["aafig"], map (second trimr) fields) return $ B.codeBlockWith attribs $ stripTrailingNewlines body "math" -> return $ B.para $ mconcat $ map B.displayMath $ toChunks $ top ++ "\n\n" ++ body "figure" -> do - (caption, legend) <- parseFromString extractCaption body' + (caption, legend) <- parseFromString' extractCaption body' let src = escapeURI $ trim top - return $ B.para (B.imageWith (imgAttr "figclass") src "fig:" caption) <> legend + return $ B.para (B.imageWith (imgAttr "figclass") src "fig:" + caption) <> legend "image" -> do let src = escapeURI $ trim top let alt = B.str $ maybe "image" trim $ lookup "alt" fields @@ -606,28 +752,150 @@ directive' = do $ B.imageWith attr src "" alt Nothing -> B.imageWith attr src "" alt "class" -> do - let attrs = ("", (splitBy isSpace $ trim top), map (\(k,v) -> (k, trimr v)) fields) + let attrs = ("", splitBy isSpace $ trim top, + map (second trimr) fields) -- directive content or the first immediately following element children <- case body of "" -> block - _ -> parseFromString parseBlocks body' + _ -> parseFromString' parseBlocks body' return $ B.divWith attrs children other -> do pos <- getPosition - addWarning (Just pos) $ "ignoring unknown directive: " ++ other - return mempty + logMessage $ SkippedContent (".. " ++ other) pos + bod <- parseFromString' parseBlocks $ top ++ "\n\n" ++ body' + return $ B.divWith ("",[other],[]) bod + +tableDirective :: PandocMonad m + => String -> [(String, String)] -> String -> RSTParser m Blocks +tableDirective top fields body = do + bs <- parseFromString' parseBlocks body + case B.toList bs of + [Table _ aligns' widths' header' rows'] -> do + title <- parseFromString' (trimInlines . mconcat <$> many inline) top + columns <- getOption readerColumns + let numOfCols = length header' + let normWidths ws = + map (/ max 1.0 (fromIntegral (columns - numOfCols))) ws + let widths = case trim <$> lookup "widths" fields of + Just "auto" -> replicate numOfCols 0.0 + Just "grid" -> widths' + Just specs -> normWidths + $ map (fromMaybe (0 :: Double) . safeRead) + $ splitBy (`elem` (" ," :: String)) specs + Nothing -> widths' + -- align is not applicable since we can't represent whole table align + return $ B.singleton $ Table (B.toList title) + aligns' widths header' rows' + _ -> return mempty + + +-- TODO: :stub-columns:. +-- Only the first row becomes the header even if header-rows: > 1, +-- since Pandoc doesn't support a table with multiple header rows. +-- We don't need to parse :align: as it represents the whole table align. +listTableDirective :: PandocMonad m + => String -> [(String, String)] -> String + -> RSTParser m Blocks +listTableDirective top fields body = do + bs <- parseFromString' parseBlocks body + title <- parseFromString' (trimInlines . mconcat <$> many inline) top + let rows = takeRows $ B.toList bs + headerRowsNum = fromMaybe (0 :: Int) $ + lookup "header-rows" fields >>= safeRead + (headerRow,bodyRows,numOfCols) = case rows of + x:xs -> if headerRowsNum > 0 + then (x, xs, length x) + else ([], rows, length x) + _ -> ([],[],0) + widths = case trim <$> lookup "widths" fields of + Just "auto" -> replicate numOfCols 0 + Just specs -> normWidths $ map (fromMaybe (0 :: Double) . safeRead) $ + splitBy (`elem` (" ," :: String)) specs + _ -> replicate numOfCols 0 + return $ B.table title + (zip (replicate numOfCols AlignDefault) widths) + headerRow + bodyRows + where takeRows [BulletList rows] = map takeCells rows + takeRows _ = [] + takeCells [BulletList cells] = map B.fromList cells + takeCells _ = [] + normWidths ws = map (/ max 1 (sum ws)) ws + +csvTableDirective :: PandocMonad m + => String -> [(String, String)] -> String + -> RSTParser m Blocks +csvTableDirective top fields rawcsv = do + let explicitHeader = trim <$> lookup "header" fields + let opts = defaultCSVOptions{ + csvDelim = case trim <$> lookup "delim" fields of + Just "tab" -> '\t' + Just "space" -> ' ' + Just [c] -> c + _ -> ',' + , csvQuote = case trim <$> lookup "quote" fields of + Just [c] -> c + _ -> '"' + , csvEscape = case trim <$> lookup "escape" fields of + Just [c] -> Just c + _ -> Nothing + , csvKeepSpace = case trim <$> lookup "keepspace" fields of + Just "true" -> True + _ -> False + } + let headerRowsNum = fromMaybe (case explicitHeader of + Just _ -> 1 :: Int + Nothing -> 0 :: Int) $ + lookup "header-rows" fields >>= safeRead + rawcsv' <- case trim <$> + lookup "file" fields `mplus` lookup "url" fields of + Just u -> do + (bs, _) <- fetchItem u + return $ UTF8.toString bs + Nothing -> return rawcsv + let res = parseCSV opts (T.pack $ case explicitHeader of + Just h -> h ++ "\n" ++ rawcsv' + Nothing -> rawcsv') + case res of + Left e -> + throwError $ PandocParsecError "csv table" e + Right rawrows -> do + let parseCell = parseFromString' (plain <|> return mempty) . T.unpack + let parseRow = mapM parseCell + rows <- mapM parseRow rawrows + let (headerRow,bodyRows,numOfCols) = + case rows of + x:xs -> if headerRowsNum > 0 + then (x, xs, length x) + else ([], rows, length x) + _ -> ([],[],0) + title <- parseFromString' (trimInlines . mconcat <$> many inline) top + let normWidths ws = map (/ max 1 (sum ws)) ws + let widths = + case trim <$> lookup "widths" fields of + Just "auto" -> replicate numOfCols 0 + Just specs -> normWidths + $ map (fromMaybe (0 :: Double) . safeRead) + $ splitBy (`elem` (" ," :: String)) specs + _ -> replicate numOfCols 0 + return $ B.table title + (zip (replicate numOfCols AlignDefault) widths) + headerRow + bodyRows -- TODO: -- - Only supports :format: fields with a single format for :raw: roles, -- change Text.Pandoc.Definition.Format to fix -addNewRole :: String -> [(String, String)] -> RSTParser Blocks +addNewRole :: PandocMonad m + => String -> [(String, String)] -> RSTParser m Blocks addNewRole roleString fields = do - (role, parentRole) <- parseFromString inheritedRole roleString + pos <- getPosition + (role, parentRole) <- parseFromString' inheritedRole roleString customRoles <- stateRstCustomRoles <$> getState let getBaseRole (r, f, a) roles = case M.lookup r roles of Just (r', f', a') -> getBaseRole (r', f', a') roles - Nothing -> (r, f, a) + Nothing -> (r, f, a) (baseRole, baseFmt, baseAttr) = getBaseRole (parentRole, Nothing, nullAttr) customRoles fmt = if parentRole == "raw" then lookup "format" fields else baseFmt @@ -641,34 +909,32 @@ addNewRole roleString fields = do in (ident, nub . (role :) . annotate $ classes, keyValues) -- warn about syntax we ignore - flip mapM_ fields $ \(key, _) -> case key of - "language" -> when (baseRole /= "code") $ addWarning Nothing $ - "ignoring :language: field because the parent of role :" ++ - role ++ ": is :" ++ baseRole ++ ": not :code:" - "format" -> when (baseRole /= "raw") $ addWarning Nothing $ - "ignoring :format: field because the parent of role :" ++ - role ++ ": is :" ++ baseRole ++ ": not :raw:" - _ -> addWarning Nothing $ "ignoring unknown field :" ++ key ++ - ": in definition of role :" ++ role ++ ": in" + forM_ fields $ \(key, _) -> case key of + "language" -> when (baseRole /= "code") $ logMessage $ + SkippedContent ":language: [because parent of role is not :code:]" + pos + "format" -> when (baseRole /= "raw") $ logMessage $ + SkippedContent ":format: [because parent of role is not :raw:]" pos + _ -> logMessage $ SkippedContent (":" ++ key ++ ":") pos when (parentRole == "raw" && countKeys "format" > 1) $ - addWarning Nothing $ - "ignoring :format: fields after the first in the definition of role :" - ++ role ++": in" + logMessage $ SkippedContent + ":format: [after first in definition of role]" + pos when (parentRole == "code" && countKeys "language" > 1) $ - addWarning Nothing $ - "ignoring :language: fields after the first in the definition of role :" - ++ role ++": in" + logMessage $ SkippedContent + ":language: [after first in definition of role]" pos updateState $ \s -> s { stateRstCustomRoles = M.insert role (baseRole, fmt, attr) customRoles } - return $ B.singleton Null + return mempty where countKeys k = length . filter (== k) . map fst $ fields inheritedRole = - (,) <$> roleName <*> ((char '(' *> roleName <* char ')') <|> pure "span") + (,) <$> roleName <*> ((char '(' *> roleName <* char ')') + <|> pure "span") -- Can contain character codes as decimal numbers or @@ -700,24 +966,29 @@ extractUnicodeChar s = maybe Nothing (\c -> Just (c,rest)) mbc where (ds,rest) = span isHexDigit s mbc = safeRead ('\'':'\\':'x':ds ++ "'") -extractCaption :: RSTParser (Inlines, Blocks) +extractCaption :: PandocMonad m => RSTParser m (Inlines, Blocks) extractCaption = do capt <- trimInlines . mconcat <$> many inline legend <- optional blanklines >> (mconcat <$> many block) return (capt,legend) --- divide string by blanklines +-- divide string by blanklines, and surround with +-- \begin{aligned}...\end{aligned} if needed. toChunks :: String -> [String] toChunks = dropWhile null - . map (trim . unlines) + . map (addAligned . trim . unlines) . splitBy (all (`elem` (" \t" :: String))) . lines + -- we put this in an aligned environment if it contains \\, see #4254 + where addAligned s = if "\\\\" `isInfixOf` s + then "\\begin{aligned}\n" ++ s ++ "\n\\end{aligned}" + else s -codeblock :: [String] -> Maybe String -> String -> String -> RSTParser Blocks +codeblock :: [String] -> Maybe String -> String -> String -> RSTParser m Blocks codeblock classes numberLines lang body = return $ B.codeBlockWith attribs $ stripTrailingNewlines body where attribs = ("", classes', kvs) classes' = "sourceCode" : lang - : maybe [] (\_ -> ["numberLines"]) numberLines + : maybe [] (const ["numberLines"]) numberLines ++ classes kvs = case numberLines of Just "" -> [] @@ -728,30 +999,52 @@ codeblock classes numberLines lang body = --- note block --- -noteBlock :: RSTParser [Char] +noteBlock :: Monad m => RSTParser m [Char] noteBlock = try $ do + (ref, raw, replacement) <- noteBlock' noteMarker + updateState $ \s -> s { stateNotes = (ref, raw) : stateNotes s } + -- return blanks so line count isn't affected + return replacement + +citationBlock :: Monad m => RSTParser m [Char] +citationBlock = try $ do + (ref, raw, replacement) <- noteBlock' citationMarker + updateState $ \s -> + s { stateCitations = M.insert ref raw (stateCitations s), + stateKeys = M.insert (toKey ref) (('#':ref,""), ("",["citation"],[])) + (stateKeys s) } + -- return blanks so line count isn't affected + return replacement + +noteBlock' :: Monad m + => RSTParser m String -> RSTParser m (String, String, String) +noteBlock' marker = try $ do startPos <- getPosition string ".." spaceChar >> skipMany spaceChar - ref <- noteMarker + ref <- marker first <- (spaceChar >> skipMany spaceChar >> anyLine) <|> (newline >> return "") blanks <- option "" blanklines rest <- option "" indentedBlock endPos <- getPosition let raw = first ++ "\n" ++ blanks ++ rest ++ "\n" - let newnote = (ref, raw) - st <- getState - let oldnotes = stateNotes st - updateState $ \s -> s { stateNotes = newnote : oldnotes } - -- return blanks so line count isn't affected - return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + let replacement =replicate (sourceLine endPos - sourceLine startPos) '\n' + return (ref, raw, replacement) + +citationMarker :: Monad m => RSTParser m [Char] +citationMarker = do + char '[' + res <- simpleReferenceName + char ']' + return res -noteMarker :: RSTParser [Char] +noteMarker :: Monad m => RSTParser m [Char] noteMarker = do char '[' res <- many1 digit - <|> (try $ char '#' >> liftM ('#':) simpleReferenceName') + <|> + try (char '#' >> liftM ('#':) simpleReferenceName) <|> count 1 (oneOf "#*") char ']' return res @@ -760,39 +1053,26 @@ noteMarker = do -- reference key -- -quotedReferenceName :: RSTParser Inlines +quotedReferenceName :: PandocMonad m => RSTParser m String quotedReferenceName = try $ do char '`' >> notFollowedBy (char '`') -- `` means inline code! - label' <- trimInlines . mconcat <$> many1Till inline (char '`') - return label' - -unquotedReferenceName :: RSTParser Inlines -unquotedReferenceName = try $ do - label' <- trimInlines . mconcat <$> many1Till inline (lookAhead $ char ':') - return label' + manyTill anyChar (char '`') -- Simple reference names are single words consisting of alphanumerics -- plus isolated (no two adjacent) internal hyphens, underscores, -- periods, colons and plus signs; no whitespace or other characters -- are allowed. -simpleReferenceName' :: Parser [Char] st String -simpleReferenceName' = do +simpleReferenceName :: Monad m => ParserT [Char] st m String +simpleReferenceName = do x <- alphaNum xs <- many $ alphaNum - <|> (try $ oneOf "-_:+." <* lookAhead alphaNum) + <|> try (oneOf "-_:+." <* lookAhead alphaNum) return (x:xs) -simpleReferenceName :: Parser [Char] st Inlines -simpleReferenceName = do - raw <- simpleReferenceName' - return $ B.str raw +referenceName :: PandocMonad m => RSTParser m String +referenceName = quotedReferenceName <|> simpleReferenceName -referenceName :: RSTParser Inlines -referenceName = quotedReferenceName <|> - (try $ simpleReferenceName <* lookAhead (char ':')) <|> - unquotedReferenceName - -referenceKey :: RSTParser [Char] +referenceKey :: PandocMonad m => RSTParser m [Char] referenceKey = do startPos <- getPosition choice [substKey, anonymousKey, regularKey] @@ -801,16 +1081,16 @@ referenceKey = do -- return enough blanks to replace key return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -targetURI :: Parser [Char] st [Char] +targetURI :: Monad m => ParserT [Char] st m [Char] targetURI = do skipSpaces optional newline contents <- many1 (try (many spaceChar >> newline >> many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n") blanklines - return $ escapeURI $ trim $ contents + return $ escapeURI $ trim contents -substKey :: RSTParser () +substKey :: PandocMonad m => RSTParser m () substKey = try $ do string ".." skipMany1 spaceChar @@ -826,31 +1106,85 @@ substKey = try $ do [Para ils] -> return $ B.fromList ils _ -> mzero let key = toKey $ stripFirstAndLast ref - updateState $ \s -> s{ stateSubstitutions = M.insert key il $ stateSubstitutions s } + updateState $ \s -> s{ stateSubstitutions = + M.insert key il $ stateSubstitutions s } -anonymousKey :: RSTParser () +anonymousKey :: Monad m => RSTParser m () anonymousKey = try $ do oneOfStrings [".. __:", "__"] src <- targetURI pos <- getPosition let key = toKey $ "_" ++ printf "%09d" (sourceLine pos) - --TODO: parse width, height, class and name attributes - updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ stateKeys s } - -stripTicks :: String -> String -stripTicks = reverse . stripTick . reverse . stripTick - where stripTick ('`':xs) = xs - stripTick xs = xs - -regularKey :: RSTParser () + updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ + stateKeys s } + +referenceNames :: PandocMonad m => RSTParser m [String] +referenceNames = do + let rn = try $ do + string ".. _" + ref <- quotedReferenceName + <|> many ( noneOf ":\n" + <|> try (char '\n' <* + string " " <* + notFollowedBy blankline) + <|> try (char ':' <* lookAhead alphaNum) + ) + char ':' + return ref + first <- rn + rest <- many (try (blanklines *> rn)) + return (first:rest) + +regularKey :: PandocMonad m => RSTParser m () regularKey = try $ do - string ".. _" - (_,ref) <- withRaw referenceName - char ':' + -- we allow several references to the same URL, e.g. + -- .. _hello: + -- .. _goodbye: url.com + refs <- referenceNames src <- targetURI - let key = toKey $ stripTicks ref - --TODO: parse width, height, class and name attributes - updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ stateKeys s } + guard $ not (null src) + let keys = map toKey refs + forM_ keys $ \key -> + updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ + stateKeys s } + +anchorDef :: PandocMonad m => RSTParser m [Char] +anchorDef = try $ do + (refs, raw) <- withRaw $ try (referenceNames <* blanklines) + forM_ refs $ \rawkey -> + updateState $ \s -> s { stateKeys = + M.insert (toKey rawkey) (('#':rawkey,""), nullAttr) $ stateKeys s } + -- keep this for 2nd round of parsing, where we'll add the divs (anchor) + return raw + +anchor :: PandocMonad m => RSTParser m Blocks +anchor = try $ do + refs <- referenceNames + blanklines + b <- block + let addDiv ref = B.divWith (ref, [], []) + let emptySpanWithId id' = Span (id',[],[]) [] + -- put identifier on next block: + case B.toList b of + [Header lev (_,classes,kvs) txt] -> + case reverse refs of + [] -> return b + (r:rs) -> return $ B.singleton $ + Header lev (r,classes,kvs) + (txt ++ map emptySpanWithId rs) + -- we avoid generating divs for headers, + -- because it hides them from promoteHeader, see #4240 + _ -> return $ foldr addDiv b refs + +headerBlock :: PandocMonad m => RSTParser m [Char] +headerBlock = do + ((txt, _), raw) <- withRaw (doubleHeader' <|> singleHeader') + (ident,_,_) <- registerHeader nullAttr txt + let key = toKey (stringify txt) + updateState $ \s -> s { stateKeys = M.insert key (('#':ident,""), nullAttr) + $ stateKeys s } + return raw + -- -- tables @@ -869,45 +1203,53 @@ regularKey = try $ do -- Grid tables TODO: -- - column spans -dashedLine :: Char -> Parser [Char] st (Int, Int) +dashedLine :: Monad m => Char -> ParserT [Char] st m (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many (char ' ') return (length dashes, length $ dashes ++ sp) -simpleDashedLines :: Char -> Parser [Char] st [(Int,Int)] +simpleDashedLines :: Monad m => Char -> ParserT [Char] st m [(Int,Int)] simpleDashedLines ch = try $ many1 (dashedLine ch) -- Parse a table row separator -simpleTableSep :: Char -> RSTParser Char +simpleTableSep :: Monad m => Char -> RSTParser m Char simpleTableSep ch = try $ simpleDashedLines ch >> newline -- Parse a table footer -simpleTableFooter :: RSTParser [Char] +simpleTableFooter :: Monad m => RSTParser m [Char] simpleTableFooter = try $ simpleTableSep '=' >> blanklines -- Parse a raw line and split it into chunks by indices. -simpleTableRawLine :: [Int] -> RSTParser [String] -simpleTableRawLine indices = do - line <- many1Till anyChar newline - return (simpleTableSplitLine indices line) +simpleTableRawLine :: Monad m => [Int] -> RSTParser m [String] +simpleTableRawLine indices = simpleTableSplitLine indices <$> anyLine + +simpleTableRawLineWithEmptyCell :: Monad m => [Int] -> RSTParser m [String] +simpleTableRawLineWithEmptyCell indices = try $ do + cs <- simpleTableRawLine indices + let isEmptyCell = all (\c -> c == ' ' || c == '\t') + guard $ any isEmptyCell cs + return cs -- Parse a table row and return a list of blocks (columns). -simpleTableRow :: [Int] -> RSTParser [[Block]] +simpleTableRow :: PandocMonad m => [Int] -> RSTParser m [Blocks] simpleTableRow indices = do notFollowedBy' simpleTableFooter firstLine <- simpleTableRawLine indices - colLines <- return [] -- TODO - let cols = map unlines . transpose $ firstLine : colLines - mapM (parseFromString (B.toList . mconcat <$> many plain)) cols + conLines <- many $ simpleTableRawLineWithEmptyCell indices + let cols = map unlines . transpose $ firstLine : conLines ++ + [replicate (length indices) "" + | not (null conLines)] + mapM (parseFromString' parseBlocks) cols simpleTableSplitLine :: [Int] -> String -> [String] simpleTableSplitLine indices line = - map trim + map trimr $ tail $ splitByIndices (init indices) line -simpleTableHeader :: Bool -- ^ Headerless table - -> RSTParser ([[Block]], [Alignment], [Int]) +simpleTableHeader :: PandocMonad m + => Bool -- ^ Headerless table + -> RSTParser m ([Blocks], [Alignment], [Int]) simpleTableHeader headless = try $ do optional blanklines rawContent <- if headless @@ -921,26 +1263,37 @@ simpleTableHeader headless = try $ do let rawHeads = if headless then replicate (length dashes) "" else simpleTableSplitLine indices rawContent - heads <- mapM (parseFromString (B.toList . mconcat <$> many plain)) $ - map trim rawHeads + heads <- mapM ( parseFromString' (mconcat <$> many plain) . trim) rawHeads return (heads, aligns, indices) -- Parse a simple table. -simpleTable :: Bool -- ^ Headerless table - -> RSTParser Blocks +simpleTable :: PandocMonad m + => Bool -- ^ Headerless table + -> RSTParser m Blocks simpleTable headless = do - Table c a _w h l <- tableWith (simpleTableHeader headless) simpleTableRow sep simpleTableFooter + let wrapIdFst (a, b, c) = (Identity a, b, c) + wrapId = fmap Identity + tbl <- runIdentity <$> tableWith + (wrapIdFst <$> simpleTableHeader headless) + (wrapId <$> simpleTableRow) + sep simpleTableFooter -- Simple tables get 0s for relative column widths (i.e., use default) - return $ B.singleton $ Table c a (replicate (length a) 0) h l + case B.toList tbl of + [Table c a _w h l] -> return $ B.singleton $ + Table c a (replicate (length a) 0) h l + _ -> + throwError $ PandocShouldNeverHappenError + "tableWith returned something unexpected" where sep = return () -- optional (simpleTableSep '-') -gridTable :: Bool -- ^ Headerless table - -> RSTParser Blocks -gridTable headerless = B.singleton - <$> gridTableWith (B.toList <$> parseBlocks) headerless +gridTable :: PandocMonad m + => Bool -- ^ Headerless table + -> RSTParser m Blocks +gridTable headerless = runIdentity <$> + gridTableWith (Identity <$> parseBlocks) headerless -table :: RSTParser Blocks +table :: PandocMonad m => RSTParser m Blocks table = gridTable False <|> simpleTable False <|> gridTable True <|> simpleTable True <?> "table" @@ -948,7 +1301,7 @@ table = gridTable False <|> simpleTable False <|> -- inline -- -inline :: RSTParser Inlines +inline :: PandocMonad m => RSTParser m Inlines inline = choice [ note -- can start with whitespace, so try before ws , whitespace , link @@ -964,29 +1317,30 @@ inline = choice [ note -- can start with whitespace, so try before ws , escapedChar , symbol ] <?> "inline" -parseInlineFromString :: String -> RSTParser Inlines -parseInlineFromString = parseFromString (trimInlines . mconcat <$> many inline) +parseInlineFromString :: PandocMonad m => String -> RSTParser m Inlines +parseInlineFromString = parseFromString' (trimInlines . mconcat <$> many inline) -hyphens :: RSTParser Inlines +hyphens :: Monad m => RSTParser m Inlines hyphens = do result <- many1 (char '-') optional endline -- don't want to treat endline after hyphen or dash as a space return $ B.str result -escapedChar :: Parser [Char] st Inlines +escapedChar :: Monad m => ParserT [Char] st m Inlines escapedChar = do c <- escaped anyChar - return $ if c == ' ' -- '\ ' is null in RST + return $ if c == ' ' || c == '\n' || c == '\r' + -- '\ ' is null in RST then mempty else B.str [c] -symbol :: RSTParser Inlines +symbol :: Monad m => RSTParser m Inlines symbol = do result <- oneOf specialChars return $ B.str [result] -- parses inline code, between codeStart and codeEnd -code :: RSTParser Inlines +code :: Monad m => RSTParser m Inlines code = try $ do string "``" result <- manyTill anyChar (try (string "``")) @@ -994,7 +1348,7 @@ code = try $ do $ trim $ unwords $ lines result -- succeeds only if we're not right after a str (ie. in middle of word) -atStart :: RSTParser a -> RSTParser a +atStart :: Monad m => RSTParser m a -> RSTParser m a atStart p = do pos <- getPosition st <- getState @@ -1002,11 +1356,11 @@ atStart p = do guard $ stateLastStrPos st /= Just pos p -emph :: RSTParser Inlines +emph :: PandocMonad m => RSTParser m Inlines emph = B.emph . trimInlines . mconcat <$> enclosed (atStart $ char '*') (char '*') inline -strong :: RSTParser Inlines +strong :: PandocMonad m => RSTParser m Inlines strong = B.strong . trimInlines . mconcat <$> enclosed (atStart $ string "**") (try $ string "**") inline @@ -1018,12 +1372,13 @@ strong = B.strong . trimInlines . mconcat <$> -- - Classes are silently discarded in addNewRole -- - Lacks sensible implementation for title-reference (which is the default) -- - Allows direct use of the :raw: role, rST only allows inherited use. -interpretedRole :: RSTParser Inlines +interpretedRole :: PandocMonad m => RSTParser m Inlines interpretedRole = try $ do (role, contents) <- roleBefore <|> roleAfter renderRole contents Nothing role nullAttr -renderRole :: String -> Maybe String -> String -> Attr -> RSTParser Inlines +renderRole :: PandocMonad m + => String -> Maybe String -> String -> Attr -> RSTParser m Inlines renderRole contents fmt role attr = case role of "sup" -> return $ B.superscript $ B.str contents "superscript" -> return $ B.superscript $ B.str contents @@ -1048,10 +1403,8 @@ renderRole contents fmt role attr = case role of case M.lookup custom customRoles of Just (newRole, newFmt, newAttr) -> renderRole contents newFmt newRole newAttr - Nothing -> do - pos <- getPosition - addWarning (Just pos) $ "ignoring unknown role :" ++ custom ++ ": in" - return $ B.str contents -- Undefined role + Nothing -> -- undefined role + return $ B.spanWith ("",[],[("role",role)]) (B.str contents) where titleRef ref = return $ B.str ref -- FIXME: Not a sensible behaviour rfcLink rfcNo = B.link rfcUrl ("RFC " ++ rfcNo) $ B.str ("RFC " ++ rfcNo) @@ -1061,33 +1414,33 @@ renderRole contents fmt role attr = case role of pepUrl = "http://www.python.org/dev/peps/pep-" ++ padNo ++ "/" addClass :: String -> Attr -> Attr -addClass c (ident, classes, keyValues) = (ident, union classes [c], keyValues) +addClass c (ident, classes, keyValues) = (ident, classes `union` [c], keyValues) -roleName :: RSTParser String +roleName :: PandocMonad m => RSTParser m String roleName = many1 (letter <|> char '-') -roleMarker :: RSTParser String +roleMarker :: PandocMonad m => RSTParser m String roleMarker = char ':' *> roleName <* char ':' -roleBefore :: RSTParser (String,String) +roleBefore :: PandocMonad m => RSTParser m (String,String) roleBefore = try $ do role <- roleMarker contents <- unmarkedInterpretedText return (role,contents) -roleAfter :: RSTParser (String,String) +roleAfter :: PandocMonad m => RSTParser m (String,String) roleAfter = try $ do contents <- unmarkedInterpretedText role <- roleMarker <|> (stateRstDefaultRole <$> getState) return (role,contents) -unmarkedInterpretedText :: RSTParser [Char] +unmarkedInterpretedText :: PandocMonad m => RSTParser m [Char] unmarkedInterpretedText = enclosed (atStart $ char '`') (char '`') anyChar -whitespace :: RSTParser Inlines +whitespace :: PandocMonad m => RSTParser m Inlines whitespace = B.space <$ skipMany1 spaceChar <?> "whitespace" -str :: RSTParser Inlines +str :: Monad m => RSTParser m Inlines str = do let strChar = noneOf ("\t\n " ++ specialChars) result <- many1 strChar @@ -1095,26 +1448,24 @@ str = do return $ B.str result -- an endline character that can be treated as a space, not a structural break -endline :: RSTParser Inlines +endline :: Monad m => RSTParser m Inlines endline = try $ do newline notFollowedBy blankline -- parse potential list-starts at beginning of line differently in a list: st <- getState - if (stateParserContext st) == ListItemState - then notFollowedBy (anyOrderedListMarker >> spaceChar) >> + when (stateParserContext st == ListItemState) $ notFollowedBy (anyOrderedListMarker >> spaceChar) >> notFollowedBy' bulletListStart - else return () return B.softbreak -- -- links -- -link :: RSTParser Inlines +link :: PandocMonad m => RSTParser m Inlines link = choice [explicitLink, referenceLink, autoLink] <?> "link" -explicitLink :: RSTParser Inlines +explicitLink :: PandocMonad m => RSTParser m Inlines explicitLink = try $ do char '`' notFollowedBy (char '`') -- `` marks start of inline code @@ -1128,61 +1479,73 @@ explicitLink = try $ do then B.str src else label' -- `link <google_>` is a reference link to _google! - (src',tit,attr) <- case reverse src of - '_':xs -> do - keyTable <- stateKeys <$> getState - let key = toKey $ reverse xs - case M.lookup key keyTable of - Nothing -> do - pos <- getPosition - addWarning (Just pos) $ - "Could not find reference for " ++ - show key - return ("","",nullAttr) - Just ((s,t),a) -> return (s,t,a) - _ -> return (src, "", nullAttr) + ((src',tit),attr) <- case reverse src of + '_':xs -> lookupKey [] (toKey (reverse xs)) + _ -> return ((src, ""), nullAttr) return $ B.linkWith attr (escapeURI src') tit label'' -referenceLink :: RSTParser Inlines +citationName :: PandocMonad m => RSTParser m String +citationName = do + raw <- citationMarker + return $ "[" ++ raw ++ "]" + +referenceLink :: PandocMonad m => RSTParser m Inlines referenceLink = try $ do - (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) <* - char '_' - state <- getState - let keyTable = stateKeys state + ref <- (referenceName <|> citationName) <* char '_' + let label' = B.text ref let isAnonKey (Key ('_':_)) = True isAnonKey _ = False - key <- option (toKey $ stripTicks ref) $ + state <- getState + let keyTable = stateKeys state + key <- option (toKey ref) $ do char '_' let anonKeys = sort $ filter isAnonKey $ M.keys keyTable - if null anonKeys - then mzero - else return (head anonKeys) - ((src,tit), attr) <- case M.lookup key keyTable of - Nothing -> do - pos <- getPosition - addWarning (Just pos) $ - "Could not find reference for " ++ - show key - return (("",""),nullAttr) - Just val -> return val + case anonKeys of + [] -> mzero + (k:_) -> return k + ((src,tit), attr) <- lookupKey [] key -- if anonymous link, remove key so it won't be used again - when (isAnonKey key) $ updateState $ \s -> s{ stateKeys = M.delete key keyTable } + when (isAnonKey key) $ updateState $ \s -> + s{ stateKeys = M.delete key keyTable } return $ B.linkWith attr src tit label' -autoURI :: RSTParser Inlines +-- We keep a list of oldkeys so we can detect lookup loops. +lookupKey :: PandocMonad m + => [Key] -> Key -> RSTParser m ((String, String), Attr) +lookupKey oldkeys key = do + pos <- getPosition + state <- getState + let keyTable = stateKeys state + case M.lookup key keyTable of + Nothing -> do + let Key key' = key + logMessage $ ReferenceNotFound key' pos + return (("",""),nullAttr) + -- check for keys of the form link_, which need to be resolved: + Just ((u@(_:_),""),_) | last u == '_' -> do + let rawkey = init u + let newkey = toKey rawkey + if newkey `elem` oldkeys + then do + logMessage $ CircularReference rawkey pos + return (("",""),nullAttr) + else lookupKey (key:oldkeys) newkey + Just val -> return val + +autoURI :: Monad m => RSTParser m Inlines autoURI = do (orig, src) <- uri return $ B.link src "" $ B.str orig -autoEmail :: RSTParser Inlines +autoEmail :: Monad m => RSTParser m Inlines autoEmail = do (orig, src) <- emailAddress return $ B.link src "" $ B.str orig -autoLink :: RSTParser Inlines +autoLink :: PandocMonad m => RSTParser m Inlines autoLink = autoURI <|> autoEmail -subst :: RSTParser Inlines +subst :: PandocMonad m => RSTParser m Inlines subst = try $ do (_,ref) <- withRaw $ enclosed (char '|') (char '|') inline state <- getState @@ -1191,12 +1554,11 @@ subst = try $ do case M.lookup key substTable of Nothing -> do pos <- getPosition - addWarning (Just pos) $ - "Could not find reference for " ++ show key + logMessage $ ReferenceNotFound (show key) pos return mempty Just target -> return target -note :: RSTParser Inlines +note :: PandocMonad m => RSTParser m Inlines note = try $ do optional whitespace ref <- noteMarker @@ -1206,8 +1568,7 @@ note = try $ do case lookup ref notes of Nothing -> do pos <- getPosition - addWarning (Just pos) $ - "Could not find note for " ++ show ref + logMessage $ ReferenceNotFound ref pos return mempty Just raw -> do -- We temporarily empty the note list while parsing the note, @@ -1215,8 +1576,8 @@ note = try $ do -- Note references inside other notes are allowed in reST, but -- not yet in this implementation. updateState $ \st -> st{ stateNotes = [] } - contents <- parseFromString parseBlocks raw - let newnotes = if (ref == "*" || ref == "#") -- auto-numbered + contents <- parseFromString' parseBlocks raw + let newnotes = if ref == "*" || ref == "#" -- auto-numbered -- delete the note so the next auto-numbered note -- doesn't get the same contents: then deleteFirstsBy (==) notes [(ref,raw)] @@ -1224,20 +1585,20 @@ note = try $ do updateState $ \st -> st{ stateNotes = newnotes } return $ B.note contents -smart :: RSTParser Inlines +smart :: PandocMonad m => RSTParser m Inlines smart = do - getOption readerSmart >>= guard + guardEnabled Ext_smart doubleQuoted <|> singleQuoted <|> choice [apostrophe, dash, ellipses] -singleQuoted :: RSTParser Inlines +singleQuoted :: PandocMonad m => RSTParser m Inlines singleQuoted = try $ do singleQuoteStart withQuoteContext InSingleQuote $ B.singleQuoted . trimInlines . mconcat <$> many1Till inline singleQuoteEnd -doubleQuoted :: RSTParser Inlines +doubleQuoted :: PandocMonad m => RSTParser m Inlines doubleQuoted = try $ do doubleQuoteStart withQuoteContext InDoubleQuote $ diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs index 76a25ad82..75e3f89eb 100644 --- a/src/Text/Pandoc/Readers/TWiki.hs +++ b/src/Text/Pandoc/Readers/TWiki.hs @@ -1,4 +1,7 @@ -{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances, FlexibleContexts #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE RelaxedPolyRec #-} +{-# LANGUAGE TypeSynonymInstances #-} -- RelaxedPolyRec needed for inlinesBetween on GHC < 7 {- Copyright (C) 2014 Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> @@ -30,54 +33,50 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of twiki text to 'Pandoc' document. -} module Text.Pandoc.Readers.TWiki ( readTWiki - , readTWikiWithWarnings ) where -import Text.Pandoc.Definition +import Control.Monad +import Control.Monad.Except (throwError) +import Data.Char (isAlphaNum) +import qualified Data.Foldable as F +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.Definition import Text.Pandoc.Options -import Text.Pandoc.Parsing hiding (enclosed, macro, nested) +import Text.Pandoc.Parsing hiding (enclosed, nested) import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag) -import Control.Monad -import Text.Printf (printf) -import Debug.Trace (trace) +import Text.Pandoc.Shared (crFilter) import Text.Pandoc.XML (fromEntities) -import Data.Maybe (fromMaybe) -import Text.HTML.TagSoup -import Data.Char (isAlphaNum) -import qualified Data.Foldable as F -import Text.Pandoc.Error -- | Read twiki from an input string and return a Pandoc document. -readTWiki :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError Pandoc -readTWiki opts s = - (readWith parseTWiki) def{ stateOptions = opts } (s ++ "\n\n") - -readTWikiWithWarnings :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError (Pandoc, [String]) -readTWikiWithWarnings opts s = - (readWith parseTWikiWithWarnings) def{ stateOptions = opts } (s ++ "\n\n") - where parseTWikiWithWarnings = do - doc <- parseTWiki - warnings <- stateWarnings <$> getState - return (doc, warnings) - -type TWParser = Parser [Char] ParserState +readTWiki :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readTWiki opts s = do + res <- readWithM parseTWiki def{ stateOptions = opts } + (T.unpack (crFilter s) ++ "\n\n") + case res of + Left e -> throwError e + Right d -> return d + +type TWParser = ParserT [Char] ParserState -- -- utility functions -- -tryMsg :: String -> TWParser a -> TWParser a +tryMsg :: String -> TWParser m a -> TWParser m a tryMsg msg p = try p <?> msg -skip :: TWParser a -> TWParser () +skip :: TWParser m a -> TWParser m () skip parser = parser >> return () -nested :: TWParser a -> TWParser a +nested :: PandocMonad m => TWParser m a -> TWParser m a nested p = do nestlevel <- stateMaxNestingLevel <$> getState guard $ nestlevel > 0 @@ -86,7 +85,7 @@ nested p = do updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res -htmlElement :: String -> TWParser (Attr, String) +htmlElement :: PandocMonad m => String -> TWParser m (Attr, String) htmlElement tag = tryMsg tag $ do (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag []) content <- manyTill anyChar (endtag <|> endofinput) @@ -103,23 +102,24 @@ htmlAttrToPandoc attrs = (ident, classes, keyvals) classes = maybe [] words $ lookup "class" attrs keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] -parseHtmlContentWithAttrs :: String -> TWParser a -> TWParser (Attr, [a]) +parseHtmlContentWithAttrs :: PandocMonad m + => String -> TWParser m a -> TWParser m (Attr, [a]) parseHtmlContentWithAttrs tag parser = do (attr, content) <- htmlElement tag parsedContent <- try $ parseContent content return (attr, parsedContent) where - parseContent = parseFromString $ nested $ manyTill parser endOfContent + parseContent = parseFromString' $ nested $ manyTill parser endOfContent endOfContent = try $ skipMany blankline >> skipSpaces >> eof -parseHtmlContent :: String -> TWParser a -> TWParser [a] +parseHtmlContent :: PandocMonad m => String -> TWParser m a -> TWParser m [a] parseHtmlContent tag p = parseHtmlContentWithAttrs tag p >>= return . snd -- -- main parser -- -parseTWiki :: TWParser Pandoc +parseTWiki :: PandocMonad m => TWParser m Pandoc parseTWiki = do bs <- mconcat <$> many block spaces @@ -131,20 +131,16 @@ parseTWiki = do -- block parsers -- -block :: TWParser B.Blocks +block :: PandocMonad m => TWParser m B.Blocks block = do - tr <- getOption readerTrace - pos <- getPosition res <- mempty <$ skipMany1 blankline <|> blockElements <|> para skipMany blankline - when tr $ - trace (printf "line %d: %s" (sourceLine pos) - (take 60 $ show $ B.toList res)) (return ()) + trace (take 60 $ show $ B.toList res) return res -blockElements :: TWParser B.Blocks +blockElements :: PandocMonad m => TWParser m B.Blocks blockElements = choice [ separator , header , verbatim @@ -155,10 +151,10 @@ blockElements = choice [ separator , noautolink ] -separator :: TWParser B.Blocks +separator :: PandocMonad m => TWParser m B.Blocks separator = tryMsg "separator" $ string "---" >> newline >> return B.horizontalRule -header :: TWParser B.Blocks +header :: PandocMonad m => TWParser m B.Blocks header = tryMsg "header" $ do string "---" level <- many1 (char '+') >>= return . length @@ -167,45 +163,47 @@ header = tryMsg "header" $ do skipSpaces content <- B.trimInlines . mconcat <$> manyTill inline newline attr <- registerHeader ("", classes, []) content - return $ B.headerWith attr level $ content + return $ B.headerWith attr level content -verbatim :: TWParser B.Blocks +verbatim :: PandocMonad m => TWParser m B.Blocks verbatim = (htmlElement "verbatim" <|> htmlElement "pre") >>= return . (uncurry B.codeBlockWith) -literal :: TWParser B.Blocks +literal :: PandocMonad m => TWParser m B.Blocks literal = htmlElement "literal" >>= return . rawBlock where format (_, _, kvs) = fromMaybe "html" $ lookup "format" kvs rawBlock (attrs, content) = B.rawBlock (format attrs) content -list :: String -> TWParser B.Blocks +list :: PandocMonad m => String -> TWParser m B.Blocks list prefix = choice [ bulletList prefix , orderedList prefix , definitionList prefix] -definitionList :: String -> TWParser B.Blocks +definitionList :: PandocMonad m => String -> TWParser m B.Blocks definitionList prefix = tryMsg "definitionList" $ do indent <- lookAhead $ string prefix *> (many1 $ string " ") <* string "$ " elements <- many $ parseDefinitionListItem (prefix ++ concat indent) return $ B.definitionList elements where - parseDefinitionListItem :: String -> TWParser (B.Inlines, [B.Blocks]) + parseDefinitionListItem :: PandocMonad m + => String -> TWParser m (B.Inlines, [B.Blocks]) parseDefinitionListItem indent = do string (indent ++ "$ ") >> skipSpaces term <- many1Till inline $ string ": " line <- listItemLine indent $ string "$ " return $ (mconcat term, [line]) -bulletList :: String -> TWParser B.Blocks +bulletList :: PandocMonad m => String -> TWParser m B.Blocks bulletList prefix = tryMsg "bulletList" $ parseList prefix (char '*') (char ' ') -orderedList :: String -> TWParser B.Blocks +orderedList :: PandocMonad m => String -> TWParser m B.Blocks orderedList prefix = tryMsg "orderedList" $ parseList prefix (oneOf "1iIaA") (string ". ") -parseList :: String -> TWParser Char -> TWParser a -> TWParser B.Blocks +parseList :: PandocMonad m + => String -> TWParser m Char -> TWParser m a -> TWParser m B.Blocks parseList prefix marker delim = do (indent, style) <- lookAhead $ string prefix *> listStyle <* delim blocks <- many $ parseListItem (prefix ++ indent) (char style <* delim) @@ -222,10 +220,12 @@ parseList prefix marker delim = do style <- marker return (concat indent, style) -parseListItem :: Show a => String -> TWParser a -> TWParser B.Blocks +parseListItem :: (PandocMonad m, Show a) + => String -> TWParser m a -> TWParser m B.Blocks parseListItem prefix marker = string prefix >> marker >> listItemLine prefix marker -listItemLine :: Show a => String -> TWParser a -> TWParser B.Blocks +listItemLine :: (PandocMonad m, Show a) + => String -> TWParser m a -> TWParser m B.Blocks listItemLine prefix marker = lineContent >>= parseContent >>= return . mconcat where lineContent = do @@ -235,14 +235,14 @@ listItemLine prefix marker = lineContent >>= parseContent >>= return . mconcat filterSpaces = reverse . dropWhile (== ' ') . reverse listContinuation = notFollowedBy (string prefix >> marker) >> string " " >> lineContent - parseContent = parseFromString $ many1 $ nestedList <|> parseInline + parseContent = parseFromString' $ many1 $ nestedList <|> parseInline parseInline = many1Till inline (lastNewline <|> newlineBeforeNestedList) >>= return . B.plain . mconcat nestedList = list prefix lastNewline = try $ char '\n' <* eof newlineBeforeNestedList = try $ char '\n' <* lookAhead nestedList -table :: TWParser B.Blocks +table :: PandocMonad m => TWParser m B.Blocks table = try $ do tableHead <- optionMaybe $ many1Till tableParseHeader newline >>= return . unzip rows <- many1 tableParseRow @@ -254,7 +254,7 @@ table = try $ do columns rows = replicate (columCount rows) mempty columCount rows = length $ head rows -tableParseHeader :: TWParser ((Alignment, Double), B.Blocks) +tableParseHeader :: PandocMonad m => TWParser m ((Alignment, Double), B.Blocks) tableParseHeader = try $ do char '|' leftSpaces <- many spaceChar >>= return . length @@ -270,27 +270,27 @@ tableParseHeader = try $ do | left > right = (AlignRight, 0) | otherwise = (AlignLeft, 0) -tableParseRow :: TWParser [B.Blocks] +tableParseRow :: PandocMonad m => TWParser m [B.Blocks] tableParseRow = many1Till tableParseColumn newline -tableParseColumn :: TWParser B.Blocks +tableParseColumn :: PandocMonad m => TWParser m B.Blocks tableParseColumn = char '|' *> skipSpaces *> tableColumnContent (skipSpaces >> char '|') <* skipSpaces <* optional tableEndOfRow -tableEndOfRow :: TWParser Char +tableEndOfRow :: PandocMonad m => TWParser m Char tableEndOfRow = lookAhead (try $ char '|' >> char '\n') >> char '|' -tableColumnContent :: TWParser a -> TWParser B.Blocks +tableColumnContent :: PandocMonad m => TWParser m a -> TWParser m B.Blocks tableColumnContent end = manyTill content (lookAhead $ try end) >>= return . B.plain . mconcat where content = continuation <|> inline continuation = try $ char '\\' >> newline >> return mempty -blockQuote :: TWParser B.Blocks +blockQuote :: PandocMonad m => TWParser m B.Blocks blockQuote = parseHtmlContent "blockquote" block >>= return . B.blockQuote . mconcat -noautolink :: TWParser B.Blocks +noautolink :: PandocMonad m => TWParser m B.Blocks noautolink = do (_, content) <- htmlElement "noautolink" st <- getState @@ -299,9 +299,9 @@ noautolink = do setState $ st{ stateAllowLinks = True } return $ mconcat blocks where - parseContent = parseFromString $ many $ block + parseContent = parseFromString' $ many $ block -para :: TWParser B.Blocks +para :: PandocMonad m => TWParser m B.Blocks para = many1Till inline endOfParaElement >>= return . result . mconcat where endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement @@ -317,7 +317,7 @@ para = many1Till inline endOfParaElement >>= return . result . mconcat -- inline parsers -- -inline :: TWParser B.Inlines +inline :: PandocMonad m => TWParser m B.Inlines inline = choice [ whitespace , br , macro @@ -338,36 +338,39 @@ inline = choice [ whitespace , symbol ] <?> "inline" -whitespace :: TWParser B.Inlines +whitespace :: PandocMonad m => TWParser m B.Inlines whitespace = (lb <|> regsp) >>= return where lb = try $ skipMany spaceChar >> linebreak >> return B.space regsp = try $ skipMany1 spaceChar >> return B.space -br :: TWParser B.Inlines +br :: PandocMonad m => TWParser m B.Inlines br = try $ string "%BR%" >> return B.linebreak -linebreak :: TWParser B.Inlines +linebreak :: PandocMonad m => TWParser m B.Inlines linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) where lastNewline = eof >> return mempty innerNewline = return B.space -between :: (Monoid c) => TWParser a -> TWParser b -> (TWParser b -> TWParser c) -> TWParser c +between :: (Monoid c, PandocMonad m, Show b) + => TWParser m a -> TWParser m b -> (TWParser m b -> TWParser m c) + -> TWParser m c between start end p = mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) -enclosed :: (Monoid b) => TWParser a -> (TWParser a -> TWParser b) -> TWParser b +enclosed :: (Monoid b, PandocMonad m, Show a) + => TWParser m a -> (TWParser m a -> TWParser m b) -> TWParser m b enclosed sep p = between sep (try $ sep <* endMarker) p where endMarker = lookAhead $ skip endSpace <|> skip (oneOf ".,!?:)|") <|> eof endSpace = (spaceChar <|> newline) >> return B.space -macro :: TWParser B.Inlines +macro :: PandocMonad m => TWParser m B.Inlines macro = macroWithParameters <|> withoutParameters where withoutParameters = enclosed (char '%') (\_ -> macroName) >>= return . emptySpan emptySpan name = buildSpan name [] mempty -macroWithParameters :: TWParser B.Inlines +macroWithParameters :: PandocMonad m => TWParser m B.Inlines macroWithParameters = try $ do char '%' name <- macroName @@ -382,22 +385,22 @@ buildSpan className kvs = B.spanWith attrs additionalClasses = maybe [] words $ lookup "class" kvs kvsWithoutClasses = [(k,v) | (k,v) <- kvs, k /= "class"] -macroName :: TWParser String +macroName :: PandocMonad m => TWParser m String macroName = do first <- letter rest <- many $ alphaNum <|> char '_' return (first:rest) -attributes :: TWParser (String, [(String, String)]) +attributes :: PandocMonad m => TWParser m (String, [(String, String)]) attributes = char '{' *> spnl *> many (attribute <* spnl) <* char '}' >>= return . foldr (either mkContent mkKvs) ([], []) where spnl = skipMany (spaceChar <|> newline) - mkContent c ([], kvs) = (c, kvs) - mkContent c (rest, kvs) = (c ++ " " ++ rest, kvs) + mkContent c ([], kvs) = (c, kvs) + mkContent c (rest, kvs) = (c ++ " " ++ rest, kvs) mkKvs kv (cont, rest) = (cont, (kv : rest)) -attribute :: TWParser (Either String (String, String)) +attribute :: PandocMonad m => TWParser m (Either String (String, String)) attribute = withKey <|> withoutKey where withKey = try $ do @@ -411,49 +414,51 @@ attribute = withKey <|> withoutKey | allowSpaces == True = many1 $ noneOf "}" | otherwise = many1 $ noneOf " }" -nestedInlines :: Show a => TWParser a -> TWParser B.Inlines +nestedInlines :: (Show a, PandocMonad m) + => TWParser m a -> TWParser m B.Inlines nestedInlines end = innerSpace <|> nestedInline where innerSpace = try $ whitespace <* (notFollowedBy end) nestedInline = notFollowedBy whitespace >> nested inline -strong :: TWParser B.Inlines +strong :: PandocMonad m => TWParser m B.Inlines strong = try $ enclosed (char '*') nestedInlines >>= return . B.strong -strongHtml :: TWParser B.Inlines +strongHtml :: PandocMonad m => TWParser m B.Inlines strongHtml = (parseHtmlContent "strong" inline <|> parseHtmlContent "b" inline) >>= return . B.strong . mconcat -strongAndEmph :: TWParser B.Inlines +strongAndEmph :: PandocMonad m => TWParser m B.Inlines strongAndEmph = try $ enclosed (string "__") nestedInlines >>= return . B.emph . B.strong -emph :: TWParser B.Inlines +emph :: PandocMonad m => TWParser m B.Inlines emph = try $ enclosed (char '_') nestedInlines >>= return . B.emph -emphHtml :: TWParser B.Inlines +emphHtml :: PandocMonad m => TWParser m B.Inlines emphHtml = (parseHtmlContent "em" inline <|> parseHtmlContent "i" inline) >>= return . B.emph . mconcat -nestedString :: Show a => TWParser a -> TWParser String +nestedString :: (Show a, PandocMonad m) + => TWParser m a -> TWParser m String nestedString end = innerSpace <|> (count 1 nonspaceChar) where innerSpace = try $ many1 spaceChar <* notFollowedBy end -boldCode :: TWParser B.Inlines +boldCode :: PandocMonad m => TWParser m B.Inlines boldCode = try $ enclosed (string "==") nestedString >>= return . B.strong . B.code . fromEntities -htmlComment :: TWParser B.Inlines +htmlComment :: PandocMonad m => TWParser m B.Inlines htmlComment = htmlTag isCommentTag >> return mempty -code :: TWParser B.Inlines +code :: PandocMonad m => TWParser m B.Inlines code = try $ enclosed (char '=') nestedString >>= return . B.code . fromEntities -codeHtml :: TWParser B.Inlines +codeHtml :: PandocMonad m => TWParser m B.Inlines codeHtml = do (attrs, content) <- parseHtmlContentWithAttrs "code" anyChar return $ B.codeWith attrs $ fromEntities content -autoLink :: TWParser B.Inlines +autoLink :: PandocMonad m => TWParser m B.Inlines autoLink = try $ do state <- getState guard $ stateAllowLinks state @@ -467,36 +472,36 @@ autoLink = try $ do | c == '/' = True | otherwise = isAlphaNum c -str :: TWParser B.Inlines +str :: PandocMonad m => TWParser m B.Inlines str = (many1 alphaNum <|> count 1 characterReference) >>= return . B.str -nop :: TWParser B.Inlines +nop :: PandocMonad m => TWParser m B.Inlines nop = try $ (skip exclamation <|> skip nopTag) >> followContent where exclamation = char '!' nopTag = stringAnyCase "<nop>" followContent = many1 nonspaceChar >>= return . B.str . fromEntities -symbol :: TWParser B.Inlines +symbol :: PandocMonad m => TWParser m B.Inlines symbol = count 1 nonspaceChar >>= return . B.str -smart :: TWParser B.Inlines +smart :: PandocMonad m => TWParser m B.Inlines smart = do - getOption readerSmart >>= guard + guardEnabled Ext_smart doubleQuoted <|> singleQuoted <|> choice [ apostrophe , dash , ellipses ] -singleQuoted :: TWParser B.Inlines +singleQuoted :: PandocMonad m => TWParser m B.Inlines singleQuoted = try $ do singleQuoteStart withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= (return . B.singleQuoted . B.trimInlines . mconcat) -doubleQuoted :: TWParser B.Inlines +doubleQuoted :: PandocMonad m => TWParser m B.Inlines doubleQuoted = try $ do doubleQuoteStart contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) @@ -504,7 +509,7 @@ doubleQuoted = try $ do return (B.doubleQuoted $ B.trimInlines contents)) <|> (return $ (B.str "\8220") B.<> contents) -link :: TWParser B.Inlines +link :: PandocMonad m => TWParser m B.Inlines link = try $ do st <- getState guard $ stateAllowLinks st @@ -513,13 +518,13 @@ link = try $ do setState $ st{ stateAllowLinks = True } return $ B.link url title content -linkText :: TWParser (String, String, B.Inlines) +linkText :: PandocMonad m => TWParser m (String, String, B.Inlines) linkText = do string "[[" url <- many1Till anyChar (char ']') - content <- option [B.str url] linkContent + content <- option (B.str url) (mconcat <$> linkContent) char ']' - return (url, "", mconcat content) + return (url, "", content) where linkContent = (char '[') >> many1Till anyChar (char ']') >>= parseLinkContent - parseLinkContent = parseFromString $ many1 inline + parseLinkContent = parseFromString' $ many1 inline diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs deleted file mode 100644 index e5778b123..000000000 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ /dev/null @@ -1,48 +0,0 @@ -{- -Copyright (C) 2007-2015 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.TeXMath - Copyright : Copyright (C) 2007-2015 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of TeX math to a list of 'Pandoc' inline elements. --} -module Text.Pandoc.Readers.TeXMath ( texMathToInlines ) where - -import Text.Pandoc.Definition -import Text.TeXMath - --- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. --- Defaults to raw formula between @$@ or @$$@ characters if entire formula --- can't be converted. -texMathToInlines :: MathType - -> String -- ^ String to parse (assumes @'\n'@ line endings) - -> [Inline] -texMathToInlines mt inp = - case writePandoc dt `fmap` readTeX inp of - Right (Just ils) -> ils - _ -> [Str (delim ++ inp ++ delim)] - where (dt, delim) = case mt of - DisplayMath -> (DisplayBlock, "$$") - InlineMath -> (DisplayInline, "$") - diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 8dbbf7be2..30bb6a715 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -1,6 +1,6 @@ {- -Copyright (C) 2010-2015 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' - and John MacFarlane +Copyright (C) 2010-2012 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' + 2010-2018 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Textile - Copyright : Copyright (C) 2010-2015 Paul Rivier and John MacFarlane + Copyright : Copyright (C) 2010-2012 Paul Rivier + 2010-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -51,44 +52,42 @@ TODO : refactor common patterns across readers : module Text.Pandoc.Readers.Textile ( readTextile) where +import Control.Monad (guard, liftM) +import Control.Monad.Except (throwError) +import Data.Char (digitToInt, isUpper) +import Data.List (intercalate, intersperse, transpose) +import Data.Monoid ((<>)) +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup (Tag (..), fromAttrib) +import Text.HTML.TagSoup.Match +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad (..)) import Text.Pandoc.CSS import Text.Pandoc.Definition -import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) -import qualified Text.Pandoc.Builder as B import Text.Pandoc.Options import Text.Pandoc.Parsing -import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isInlineTag ) -import Text.Pandoc.Shared (trim) -import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) -import Text.HTML.TagSoup (fromAttrib, Tag(..)) -import Text.HTML.TagSoup.Match -import Data.List ( intercalate, transpose, intersperse ) -import Data.Char ( digitToInt, isUpper ) -import Control.Monad ( guard, liftM, when ) -import Data.Monoid ((<>)) -import Text.Printf -import Debug.Trace (trace) -import Text.Pandoc.Error +import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isInlineTag) +import Text.Pandoc.Readers.LaTeX (rawLaTeXBlock, rawLaTeXInline) +import Text.Pandoc.Shared (crFilter, trim, underlineSpan) -- | Parse a Textile text and return a Pandoc document. -readTextile :: ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Either PandocError Pandoc -readTextile opts s = - (readWith parseTextile) def{ stateOptions = opts } (s ++ "\n\n") +readTextile :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readTextile opts s = do + parsed <- readWithM parseTextile def{ stateOptions = opts } + (T.unpack (crFilter s) ++ "\n\n") + case parsed of + Right result -> return result + Left e -> throwError e -- | Generate a Pandoc ADT from a textile document -parseTextile :: Parser [Char] ParserState Pandoc +parseTextile :: PandocMonad m => ParserT [Char] ParserState m Pandoc parseTextile = do - -- textile allows raw HTML and does smart punctuation by default, - -- but we do not enable smart punctuation unless it is explicitly - -- asked for, for better conversion to other light markup formats - oldOpts <- stateOptions `fmap` getState - updateState $ \state -> state{ stateOptions = - oldOpts{ readerParseRaw = True - , readerOldDashes = True - } } many blankline startPos <- getPosition -- go through once just to get list of reference keys and notes @@ -103,15 +102,15 @@ parseTextile = do blocks <- parseBlocks return $ Pandoc nullMeta (B.toList blocks) -- FIXME -noteMarker :: Parser [Char] ParserState [Char] +noteMarker :: PandocMonad m => ParserT [Char] ParserState m [Char] noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') -noteBlock :: Parser [Char] ParserState [Char] +noteBlock :: PandocMonad m => ParserT [Char] ParserState m [Char] noteBlock = try $ do startPos <- getPosition ref <- noteMarker optional blankline - contents <- liftM unlines $ many1Till anyLine (blanklines <|> noteBlock) + contents <- unlines <$> many1Till anyLine (blanklines <|> noteBlock) endPos <- getPosition let newnote = (ref, contents ++ "\n") st <- getState @@ -121,11 +120,11 @@ noteBlock = try $ do return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -- | Parse document blocks -parseBlocks :: Parser [Char] ParserState Blocks +parseBlocks :: PandocMonad m => ParserT [Char] ParserState m Blocks parseBlocks = mconcat <$> manyTill block eof -- | Block parsers list tried in definition order -blockParsers :: [Parser [Char] ParserState Blocks] +blockParsers :: PandocMonad m => [ParserT [Char] ParserState m Blocks] blockParsers = [ codeBlock , header , blockQuote @@ -140,26 +139,22 @@ blockParsers = [ codeBlock ] -- | Any block in the order of definition of blockParsers -block :: Parser [Char] ParserState Blocks +block :: PandocMonad m => ParserT [Char] ParserState m Blocks block = do res <- choice blockParsers <?> "block" - pos <- getPosition - tr <- getOption readerTrace - when tr $ - trace (printf "line %d: %s" (sourceLine pos) - (take 60 $ show $ B.toList res)) (return ()) + trace (take 60 $ show $ B.toList res) return res -commentBlock :: Parser [Char] ParserState Blocks +commentBlock :: PandocMonad m => ParserT [Char] ParserState m Blocks commentBlock = try $ do string "###." manyTill anyLine blanklines return mempty -codeBlock :: Parser [Char] ParserState Blocks +codeBlock :: PandocMonad m => ParserT [Char] ParserState m Blocks codeBlock = codeBlockBc <|> codeBlockPre -codeBlockBc :: Parser [Char] ParserState Blocks +codeBlockBc :: PandocMonad m => ParserT [Char] ParserState m Blocks codeBlockBc = try $ do string "bc." extended <- option False (True <$ char '.') @@ -179,11 +174,10 @@ trimTrailingNewlines :: String -> String trimTrailingNewlines = reverse . dropWhile (=='\n') . reverse -- | Code Blocks in Textile are between <pre> and </pre> -codeBlockPre :: Parser [Char] ParserState Blocks +codeBlockPre :: PandocMonad m => ParserT [Char] ParserState m Blocks codeBlockPre = try $ do (t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True)) result' <- manyTill anyChar (htmlTag (tagClose (=="pre"))) - optional blanklines -- drop leading newline if any let result'' = case result' of '\n':xs -> xs @@ -198,7 +192,7 @@ codeBlockPre = try $ do return $ B.codeBlockWith (ident,classes,kvs) result''' -- | Header of the form "hN. content" with N in 1..6 -header :: Parser [Char] ParserState Blocks +header :: PandocMonad m => ParserT [Char] ParserState m Blocks header = try $ do char 'h' level <- digitToInt <$> oneOf "123456" @@ -210,14 +204,14 @@ header = try $ do return $ B.headerWith attr' level name -- | Blockquote of the form "bq. content" -blockQuote :: Parser [Char] ParserState Blocks +blockQuote :: PandocMonad m => ParserT [Char] ParserState m Blocks blockQuote = try $ do string "bq" >> attributes >> char '.' >> whitespace B.blockQuote <$> para -- Horizontal rule -hrule :: Parser [Char] st Blocks +hrule :: PandocMonad m => ParserT [Char] st m Blocks hrule = try $ do skipSpaces start <- oneOf "-*" @@ -232,66 +226,67 @@ hrule = try $ do -- | Can be a bullet list or an ordered list. This implementation is -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" -anyList :: Parser [Char] ParserState Blocks +anyList :: PandocMonad m => ParserT [Char] ParserState m Blocks anyList = try $ anyListAtDepth 1 <* blanklines -- | This allow one type of list to be nested into an other type, -- provided correct nesting -anyListAtDepth :: Int -> Parser [Char] ParserState Blocks +anyListAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks anyListAtDepth depth = choice [ bulletListAtDepth depth, orderedListAtDepth depth, definitionList ] -- | Bullet List of given depth, depth being the number of leading '*' -bulletListAtDepth :: Int -> Parser [Char] ParserState Blocks +bulletListAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' -bulletListItemAtDepth :: Int -> Parser [Char] ParserState Blocks +bulletListItemAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of -- leading '#' -orderedListAtDepth :: Int -> Parser [Char] ParserState Blocks +orderedListAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks orderedListAtDepth depth = try $ do items <- many1 (orderedListItemAtDepth depth) return $ B.orderedList items -- | Ordered List Item of given depth, depth being the number of -- leading '#' -orderedListItemAtDepth :: Int -> Parser [Char] ParserState Blocks +orderedListItemAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks orderedListItemAtDepth = genericListItemAtDepth '#' -- | Common implementation of list items -genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState Blocks +genericListItemAtDepth :: PandocMonad m => Char -> Int -> ParserT [Char] ParserState m Blocks genericListItemAtDepth c depth = try $ do count depth (char c) >> attributes >> whitespace - p <- mconcat <$> many listInline + contents <- mconcat <$> many ((B.plain . mconcat <$> many1 inline) <|> + try (newline >> codeBlockPre)) newline sublist <- option mempty (anyListAtDepth (depth + 1)) - return $ (B.plain p) <> sublist + return $ contents <> sublist -- | A definition list is a set of consecutive definition items -definitionList :: Parser [Char] ParserState Blocks +definitionList :: PandocMonad m => ParserT [Char] ParserState m Blocks definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. -listStart :: Parser [Char] ParserState () +listStart :: PandocMonad m => ParserT [Char] ParserState m () listStart = genericListStart '*' <|> () <$ genericListStart '#' <|> () <$ definitionListStart -genericListStart :: Char -> Parser [Char] st () +genericListStart :: PandocMonad m => Char -> ParserT [Char] st m () genericListStart c = () <$ try (many1 (char c) >> whitespace) -basicDLStart :: Parser [Char] ParserState () +basicDLStart :: PandocMonad m => ParserT [Char] ParserState m () basicDLStart = do char '-' whitespace notFollowedBy newline -definitionListStart :: Parser [Char] ParserState Inlines +definitionListStart :: PandocMonad m => ParserT [Char] ParserState m Inlines definitionListStart = try $ do basicDLStart trimInlines . mconcat <$> @@ -300,34 +295,30 @@ definitionListStart = try $ do <|> try (lookAhead (() <$ string ":=")) ) -listInline :: Parser [Char] ParserState Inlines -listInline = try (notFollowedBy newline >> inline) - <|> try (endline <* notFollowedBy listStart) - -- | A definition list item in textile begins with '- ', followed by -- the term defined, then spaces and ":=". The definition follows, on -- the same single line, or spaned on multiple line, after a line -- break. -definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks]) +definitionListItem :: PandocMonad m => ParserT [Char] ParserState m (Inlines, [Blocks]) definitionListItem = try $ do term <- (mconcat . intersperse B.linebreak) <$> many1 definitionListStart def' <- string ":=" *> optional whitespace *> (multilineDef <|> inlineDef) return (term, def') - where inlineDef :: Parser [Char] ParserState [Blocks] + where inlineDef :: PandocMonad m => ParserT [Char] ParserState m [Blocks] inlineDef = liftM (\d -> [B.plain d]) - $ optional whitespace >> (trimInlines . mconcat <$> many listInline) <* newline - multilineDef :: Parser [Char] ParserState [Blocks] + $ optional whitespace >> (trimInlines . mconcat <$> many inline) <* newline + multilineDef :: PandocMonad m => ParserT [Char] ParserState m [Blocks] multilineDef = try $ do optional whitespace >> newline s <- many1Till anyChar (try (string "=:" >> newline)) -- this ++ "\n\n" does not look very good - ds <- parseFromString parseBlocks (s ++ "\n\n") + ds <- parseFromString' parseBlocks (s ++ "\n\n") return [ds] -- raw content -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock :: Parser [Char] ParserState Blocks +rawHtmlBlock :: PandocMonad m => ParserT [Char] ParserState m Blocks rawHtmlBlock = try $ do skipMany spaceChar (_,b) <- htmlTag isBlockTag @@ -335,14 +326,14 @@ rawHtmlBlock = try $ do return $ B.rawBlock "html" b -- | Raw block of LaTeX content -rawLaTeXBlock' :: Parser [Char] ParserState Blocks +rawLaTeXBlock' :: PandocMonad m => ParserT [Char] ParserState m Blocks rawLaTeXBlock' = do guardEnabled Ext_raw_tex B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces) -- | In textile, paragraphs are separated by blank lines. -para :: Parser [Char] ParserState Blocks +para :: PandocMonad m => ParserT [Char] ParserState m Blocks para = B.para . trimInlines . mconcat <$> many1 inline -- Tables @@ -353,7 +344,7 @@ toAlignment '>' = AlignRight toAlignment '=' = AlignCenter toAlignment _ = AlignDefault -cellAttributes :: Parser [Char] ParserState (Bool, Alignment) +cellAttributes :: PandocMonad m => ParserT [Char] ParserState m (Bool, Alignment) cellAttributes = try $ do isHeader <- option False (True <$ char '_') -- we just ignore colspan and rowspan markers: @@ -366,18 +357,18 @@ cellAttributes = try $ do return (isHeader, alignment) -- | A table cell spans until a pipe | -tableCell :: Parser [Char] ParserState ((Bool, Alignment), Blocks) +tableCell :: PandocMonad m => ParserT [Char] ParserState m ((Bool, Alignment), Blocks) tableCell = try $ do char '|' - (isHeader, alignment) <- option (False, AlignDefault) $ cellAttributes + (isHeader, alignment) <- option (False, AlignDefault) cellAttributes notFollowedBy blankline raw <- trim <$> many (noneOf "|\n" <|> try (char '\n' <* notFollowedBy blankline)) - content <- mconcat <$> parseFromString (many inline) raw + content <- mconcat <$> parseFromString' (many inline) raw return ((isHeader, alignment), B.plain content) -- | A table row is made of many table cells -tableRow :: Parser [Char] ParserState [((Bool, Alignment), Blocks)] +tableRow :: PandocMonad m => ParserT [Char] ParserState m [((Bool, Alignment), Blocks)] tableRow = try $ do -- skip optional row attributes optional $ try $ do @@ -387,7 +378,7 @@ tableRow = try $ do many1 tableCell <* char '|' <* blankline -- | A table with an optional header. -table :: Parser [Char] ParserState Blocks +table :: PandocMonad m => ParserT [Char] ParserState m Blocks table = try $ do -- ignore table attributes caption <- option mempty $ try $ do @@ -395,8 +386,8 @@ table = try $ do _ <- attributes char '.' rawcapt <- trim <$> anyLine - parseFromString (mconcat <$> many inline) rawcapt - rawrows <- many1 $ (skipMany ignorableRow) >> tableRow + parseFromString' (mconcat <$> many inline) rawcapt + rawrows <- many1 $ skipMany ignorableRow >> tableRow skipMany ignorableRow blanklines let (headers, rows) = case rawrows of @@ -411,7 +402,7 @@ table = try $ do (map (map snd) rows) -- | Ignore markers for cols, thead, tfoot. -ignorableRow :: Parser [Char] ParserState () +ignorableRow :: PandocMonad m => ParserT [Char] ParserState m () ignorableRow = try $ do char '|' oneOf ":^-~" @@ -420,7 +411,7 @@ ignorableRow = try $ do _ <- anyLine return () -explicitBlockStart :: String -> Parser [Char] ParserState () +explicitBlockStart :: PandocMonad m => String -> ParserT [Char] ParserState m () explicitBlockStart name = try $ do string name attributes @@ -430,9 +421,10 @@ explicitBlockStart name = try $ do -- | Blocks like 'p' and 'table' do not need explicit block tag. -- However, they can be used to set HTML/CSS attributes when needed. -maybeExplicitBlock :: String -- ^ block tag name - -> Parser [Char] ParserState Blocks -- ^ implicit block - -> Parser [Char] ParserState Blocks +maybeExplicitBlock :: PandocMonad m + => String -- ^ block tag name + -> ParserT [Char] ParserState m Blocks -- ^ implicit block + -> ParserT [Char] ParserState m Blocks maybeExplicitBlock name blk = try $ do optional $ explicitBlockStart name blk @@ -445,12 +437,11 @@ maybeExplicitBlock name blk = try $ do -- | Any inline element -inline :: Parser [Char] ParserState Inlines -inline = do - choice inlineParsers <?> "inline" +inline :: PandocMonad m => ParserT [Char] ParserState m Inlines +inline = choice inlineParsers <?> "inline" -- | Inline parsers tried in order -inlineParsers :: [Parser [Char] ParserState Inlines] +inlineParsers :: PandocMonad m => [ParserT [Char] ParserState m Inlines] inlineParsers = [ str , whitespace , endline @@ -470,13 +461,13 @@ inlineParsers = [ str ] -- | Inline markups -inlineMarkup :: Parser [Char] ParserState Inlines +inlineMarkup :: PandocMonad m => ParserT [Char] ParserState m Inlines inlineMarkup = choice [ simpleInline (string "??") (B.cite []) , simpleInline (string "**") B.strong , simpleInline (string "__") B.emph , simpleInline (char '*') B.strong , simpleInline (char '_') B.emph - , simpleInline (char '+') B.emph -- approximates underline + , simpleInline (char '+') underlineSpan , simpleInline (char '-' <* notFollowedBy (char '-')) B.strikeout , simpleInline (char '^') B.superscript , simpleInline (char '~') B.subscript @@ -484,35 +475,35 @@ inlineMarkup = choice [ simpleInline (string "??") (B.cite []) ] -- | Trademark, registered, copyright -mark :: Parser [Char] st Inlines +mark :: PandocMonad m => ParserT [Char] st m Inlines mark = try $ char '(' >> (try tm <|> try reg <|> copy) -reg :: Parser [Char] st Inlines +reg :: PandocMonad m => ParserT [Char] st m Inlines reg = do oneOf "Rr" char ')' return $ B.str "\174" -tm :: Parser [Char] st Inlines +tm :: PandocMonad m => ParserT [Char] st m Inlines tm = do oneOf "Tt" oneOf "Mm" char ')' return $ B.str "\8482" -copy :: Parser [Char] st Inlines +copy :: PandocMonad m => ParserT [Char] st m Inlines copy = do oneOf "Cc" char ')' return $ B.str "\169" -note :: Parser [Char] ParserState Inlines +note :: PandocMonad m => ParserT [Char] ParserState m Inlines note = try $ do - ref <- (char '[' *> many1 digit <* char ']') + ref <- char '[' *> many1 digit <* char ']' notes <- stateNotes <$> getState case lookup ref notes of - Nothing -> fail "note not found" - Just raw -> B.note <$> parseFromString parseBlocks raw + Nothing -> fail "note not found" + Just raw -> B.note <$> parseFromString' parseBlocks raw -- | Special chars markupChars :: [Char] @@ -530,22 +521,22 @@ wordBoundaries :: [Char] wordBoundaries = markupChars ++ stringBreakers -- | Parse a hyphened sequence of words -hyphenedWords :: Parser [Char] ParserState String +hyphenedWords :: PandocMonad m => ParserT [Char] ParserState m String hyphenedWords = do x <- wordChunk xs <- many (try $ char '-' >> wordChunk) return $ intercalate "-" (x:xs) -wordChunk :: Parser [Char] ParserState String +wordChunk :: PandocMonad m => ParserT [Char] ParserState m String wordChunk = try $ do hd <- noneOf wordBoundaries - tl <- many ( (noneOf wordBoundaries) <|> + tl <- many ( noneOf wordBoundaries <|> try (notFollowedBy' note *> oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) ) return $ hd:tl -- | Any string -str :: Parser [Char] ParserState Inlines +str :: PandocMonad m => ParserT [Char] ParserState m Inlines str = do baseStr <- hyphenedWords -- RedCloth compliance : if parsed word is uppercase and immediatly @@ -558,11 +549,11 @@ str = do return $ B.str fullStr -- | Some number of space chars -whitespace :: Parser [Char] st Inlines +whitespace :: PandocMonad m => ParserT [Char] st m Inlines whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break -endline :: Parser [Char] ParserState Inlines +endline :: PandocMonad m => ParserT [Char] ParserState m Inlines endline = try $ do newline notFollowedBy blankline @@ -570,18 +561,18 @@ endline = try $ do notFollowedBy rawHtmlBlock return B.linebreak -rawHtmlInline :: Parser [Char] ParserState Inlines +rawHtmlInline :: PandocMonad m => ParserT [Char] ParserState m Inlines rawHtmlInline = B.rawInline "html" . snd <$> htmlTag isInlineTag -- | Raw LaTeX Inline -rawLaTeXInline' :: Parser [Char] ParserState Inlines +rawLaTeXInline' :: PandocMonad m => ParserT [Char] ParserState m Inlines rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex - B.singleton <$> rawLaTeXInline + B.rawInline "latex" <$> rawLaTeXInline -- | Textile standard link syntax is "label":target. But we -- can also have ["label":target]. -link :: Parser [Char] ParserState Inlines +link :: PandocMonad m => ParserT [Char] ParserState m Inlines link = try $ do bracketed <- (True <$ char '[') <|> return False char '"' *> notFollowedBy (oneOf " \t\n\r") @@ -591,8 +582,9 @@ link = try $ do char ':' let stop = if bracketed then char ']' - else lookAhead $ space <|> - try (oneOf "!.,;:" *> (space <|> newline)) + else lookAhead $ space <|> eof' <|> + try (oneOf "!.,;:" *> + (space <|> newline <|> eof')) url <- many1Till nonspaceChar stop let name' = if B.toList name == [Str "$"] then B.str url else name return $ if attr == nullAttr @@ -600,7 +592,7 @@ link = try $ do else B.spanWith attr $ B.link url "" name' -- | image embedding -image :: Parser [Char] ParserState Inlines +image :: PandocMonad m => ParserT [Char] ParserState m Inlines image = try $ do char '!' >> notFollowedBy space (ident, cls, kvs) <- attributes @@ -612,50 +604,51 @@ image = try $ do char '!' return $ B.imageWith attr src alt (B.str alt) -escapedInline :: Parser [Char] ParserState Inlines +escapedInline :: PandocMonad m => ParserT [Char] ParserState m Inlines escapedInline = escapedEqs <|> escapedTag -escapedEqs :: Parser [Char] ParserState Inlines +escapedEqs :: PandocMonad m => ParserT [Char] ParserState m Inlines escapedEqs = B.str <$> - (try $ string "==" *> manyTill anyChar' (try $ string "==")) + try (string "==" *> manyTill anyChar' (try $ string "==")) -- | literal text escaped btw <notextile> tags -escapedTag :: Parser [Char] ParserState Inlines +escapedTag :: PandocMonad m => ParserT [Char] ParserState m Inlines escapedTag = B.str <$> - (try $ string "<notextile>" *> + try (string "<notextile>" *> manyTill anyChar' (try $ string "</notextile>")) -- | Any special symbol defined in wordBoundaries -symbol :: Parser [Char] ParserState Inlines +symbol :: PandocMonad m => ParserT [Char] ParserState m Inlines symbol = B.str . singleton <$> (notFollowedBy newline *> notFollowedBy rawHtmlBlock *> oneOf wordBoundaries) -- | Inline code -code :: Parser [Char] ParserState Inlines +code :: PandocMonad m => ParserT [Char] ParserState m Inlines code = code1 <|> code2 -- any character except a newline before a blank line -anyChar' :: Parser [Char] ParserState Char +anyChar' :: PandocMonad m => ParserT [Char] ParserState m Char anyChar' = - satisfy (/='\n') <|> (try $ char '\n' <* notFollowedBy blankline) + satisfy (/='\n') <|> + try (char '\n' <* notFollowedBy blankline) -code1 :: Parser [Char] ParserState Inlines +code1 :: PandocMonad m => ParserT [Char] ParserState m Inlines code1 = B.code <$> surrounded (char '@') anyChar' -code2 :: Parser [Char] ParserState Inlines +code2 :: PandocMonad m => ParserT [Char] ParserState m Inlines code2 = do htmlTag (tagOpen (=="tt") null) B.code <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes -attributes :: Parser [Char] ParserState Attr -attributes = (foldl (flip ($)) ("",[],[])) <$> +attributes :: PandocMonad m => ParserT [Char] ParserState m Attr +attributes = foldl (flip ($)) ("",[],[]) <$> try (do special <- option id specialAttribute attrs <- many attribute return (special : attrs)) -specialAttribute :: Parser [Char] ParserState (Attr -> Attr) +specialAttribute :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) specialAttribute = do alignStr <- ("center" <$ char '=') <|> ("justify" <$ try (string "<>")) <|> @@ -664,11 +657,11 @@ specialAttribute = do notFollowedBy spaceChar return $ addStyle ("text-align:" ++ alignStr) -attribute :: Parser [Char] ParserState (Attr -> Attr) +attribute :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) attribute = try $ (classIdAttr <|> styleAttr <|> langAttr) <* notFollowedBy spaceChar -classIdAttr :: Parser [Char] ParserState (Attr -> Attr) +classIdAttr :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) classIdAttr = try $ do -- (class class #id) char '(' ws <- words `fmap` manyTill anyChar' (char ')') @@ -679,7 +672,7 @@ classIdAttr = try $ do -- (class class #id) classes' -> return $ \(_,_,keyvals) -> ("",classes',keyvals) -styleAttr :: Parser [Char] ParserState (Attr -> Attr) +styleAttr :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) styleAttr = do style <- try $ enclosed (char '{') (char '}') anyChar' return $ addStyle style @@ -690,21 +683,23 @@ addStyle style (id',classes,keyvals) = where keyvals' = ("style", style') : [(k,v) | (k,v) <- keyvals, k /= "style"] style' = style ++ ";" ++ concat [v | ("style",v) <- keyvals] -langAttr :: Parser [Char] ParserState (Attr -> Attr) +langAttr :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) langAttr = do lang <- try $ enclosed (char '[') (char ']') alphaNum return $ \(id',classes,keyvals) -> (id',classes,("lang",lang):keyvals) -- | Parses material surrounded by a parser. -surrounded :: Parser [Char] st t -- ^ surrounding parser - -> Parser [Char] st a -- ^ content parser (to be used repeatedly) - -> Parser [Char] st [a] +surrounded :: (PandocMonad m, Show t) + => ParserT [Char] st m t -- ^ surrounding parser + -> ParserT [Char] st m a -- ^ content parser (to be used repeatedly) + -> ParserT [Char] st m [a] surrounded border = enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) -simpleInline :: Parser [Char] ParserState t -- ^ surrounding parser - -> (Inlines -> Inlines) -- ^ Inline constructor - -> Parser [Char] ParserState Inlines -- ^ content parser (to be used repeatedly) +simpleInline :: PandocMonad m + => ParserT [Char] ParserState m t -- ^ surrounding parser + -> (Inlines -> Inlines) -- ^ Inline constructor + -> ParserT [Char] ParserState m Inlines -- ^ content parser (to be used repeatedly) simpleInline border construct = try $ do notAfterString border *> notFollowedBy (oneOf " \t\n\r") @@ -718,7 +713,7 @@ simpleInline border construct = try $ do then body else B.spanWith attr body -groupedInlineMarkup :: Parser [Char] ParserState Inlines +groupedInlineMarkup :: PandocMonad m => ParserT [Char] ParserState m Inlines groupedInlineMarkup = try $ do char '[' sp1 <- option mempty $ B.space <$ whitespace @@ -731,3 +726,5 @@ groupedInlineMarkup = try $ do singleton :: a -> [a] singleton x = [x] +eof' :: Monad m => ParserT [Char] s m Char +eof' = '\n' <$ eof diff --git a/src/Text/Pandoc/Readers/TikiWiki.hs b/src/Text/Pandoc/Readers/TikiWiki.hs new file mode 100644 index 000000000..a92f7bed2 --- /dev/null +++ b/src/Text/Pandoc/Readers/TikiWiki.hs @@ -0,0 +1,654 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RelaxedPolyRec #-} +{-# LANGUAGE TypeSynonymInstances #-} + +{- | + Module : Text.Pandoc.Readers.TikiWiki + Copyright : Copyright (C) 2017 Robin Lee Powell + License : GPLv2 + + Maintainer : Robin Lee Powell <robinleepowell@gmail.com> + Stability : alpha + Portability : portable + +Conversion of TikiWiki text to 'Pandoc' document. +-} + +module Text.Pandoc.Readers.TikiWiki ( readTikiWiki + ) where + +import Control.Monad +import Control.Monad.Except (throwError) +import qualified Data.Foldable as F +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (CommonState (..), PandocMonad (..)) +import Text.Pandoc.Definition +import Text.Pandoc.Logging (Verbosity (..)) +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (enclosed, nested) +import Text.Pandoc.Shared (crFilter) +import Text.Pandoc.XML (fromEntities) +import Text.Printf (printf) + +-- | Read TikiWiki from an input string and return a Pandoc document. +readTikiWiki :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readTikiWiki opts s = do + res <- readWithM parseTikiWiki def{ stateOptions = opts } + (T.unpack (crFilter s) ++ "\n\n") + case res of + Left e -> throwError e + Right d -> return d + +type TikiWikiParser = ParserT [Char] ParserState + +-- +-- utility functions +-- + +tryMsg :: PandocMonad m => String -> TikiWikiParser m a -> TikiWikiParser m a +tryMsg msg p = try p <?> msg + +skip :: PandocMonad m => TikiWikiParser m a -> TikiWikiParser m () +skip parser = Control.Monad.void parser + +nested :: PandocMonad m => TikiWikiParser m a -> TikiWikiParser m a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +-- +-- main parser +-- + +parseTikiWiki :: PandocMonad m => TikiWikiParser m Pandoc +parseTikiWiki = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + +block :: PandocMonad m => TikiWikiParser m B.Blocks +block = do + verbosity <- getsCommonState stVerbosity + pos <- getPosition + res <- mempty <$ skipMany1 blankline + <|> blockElements + <|> para + skipMany blankline + when (verbosity >= INFO) $ + trace (printf "line %d: %s" (sourceLine pos) (take 60 $ show $ B.toList res)) + return res + +blockElements :: PandocMonad m => TikiWikiParser m B.Blocks +blockElements = choice [ table + , hr + , header + , mixedList + , definitionList + , codeMacro + ] + +-- top +-- ---- +-- bottom +-- +-- ---- +-- +hr :: PandocMonad m => TikiWikiParser m B.Blocks +hr = try $ do + string "----" + many (char '-') + newline + return B.horizontalRule + +-- ! header +-- +-- !! header level two +-- +-- !!! header level 3 +-- +header :: PandocMonad m => TikiWikiParser m B.Blocks +header = tryMsg "header" $ do + level <- fmap length (many1 (char '!')) + guard $ level <= 6 + skipSpaces + content <- B.trimInlines . mconcat <$> manyTill inline newline + attr <- registerHeader nullAttr content + return $B.headerWith attr level content + +tableRow :: PandocMonad m => TikiWikiParser m [B.Blocks] +tableRow = try $ do +-- row <- sepBy1 (many1Till inline $ oneOf "\n|") (try $ string "|" <* notFollowedBy (oneOf "|\n")) +-- return $ map (B.plain . mconcat) row + row <- sepBy1 (many1 (noneOf "\n|") >>= parseColumn) (try $ string "|" <* notFollowedBy (oneOf "|\n")) + return $ map B.plain row + where + parseColumn x = do + parsed <- parseFromString (many1 inline) x + return $ mconcat parsed + + + +-- Tables: +-- +-- ||foo|| +-- +-- ||row1-column1|row1-column2||row2-column1|row2-column2|| +-- +-- ||row1-column1|row1-column2 +-- row2-column1|row2-column2|| +-- +-- ||row1-column1|row1-column2 +-- row2-column1|row2-column2||row3-column1|row3-column2|| +-- +-- || Orange | Apple | more +-- Bread | Pie | more +-- Butter | Ice cream | and more || +-- +table :: PandocMonad m => TikiWikiParser m B.Blocks +table = try $ do + string "||" + rows <- sepBy1 tableRow (try $ string "\n" <|> (string "||" <* notFollowedBy (string "\n"))) + string "||" + newline + -- return $ B.simpleTable (headers rows) $ trace ("rows: " ++ (show rows)) rows + return $B.simpleTable (headers rows) rows + where + -- The headers are as many empty srings as the number of columns + -- in the first row + headers rows = map (B.plain . B.str) $replicate (length $ head rows) "" + +para :: PandocMonad m => TikiWikiParser m B.Blocks +para = fmap (result . mconcat) ( many1Till inline endOfParaElement) + where + endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement + endOfInput = try $ skipMany blankline >> skipSpaces >> eof + endOfPara = try $ blankline >> skipMany1 blankline + newBlockElement = try $ blankline >> skip blockElements + result content = if F.all (==Space) content + then mempty + else B.para $ B.trimInlines content + +-- ;item 1: definition 1 +-- ;item 2: definition 2-1 +-- + definition 2-2 +-- ;item ''3'': definition ''3'' +-- +definitionList :: PandocMonad m => TikiWikiParser m B.Blocks +definitionList = tryMsg "definitionList" $ do + elements <-many1 parseDefinitionListItem + return $ B.definitionList elements + where + parseDefinitionListItem :: PandocMonad m => TikiWikiParser m (B.Inlines, [B.Blocks]) + parseDefinitionListItem = do + skipSpaces >> char ';' <* skipSpaces + term <- many1Till inline $ char ':' <* skipSpaces + line <- listItemLine 1 + return (mconcat term, [B.plain line]) + +data ListType = None | Numbered | Bullet deriving (Ord, Eq, Show) + +data ListNesting = LN { lntype :: ListType, lnnest :: Int } deriving (Ord, Eq, Show) + +-- The first argument is a stack (most recent == head) of our list +-- nesting status; the list type and the nesting level; if we're in +-- a number list in a bullet list it'd be +-- [LN Numbered 2, LN Bullet 1] +-- +-- Mixed list example: +-- +-- # one +-- # two +-- ** two point one +-- ** two point two +-- # three +-- # four +-- +mixedList :: PandocMonad m => TikiWikiParser m B.Blocks +mixedList = try $ do + items <- try $ many1 listItem + return $ mconcat $ fixListNesting $ spanFoldUpList (LN None 0) items + +-- See the "Handling Lists" section of DESIGN-CODE for why this +-- function exists. It's to post-process the lists and do some +-- mappends. +-- +-- We need to walk the tree two items at a time, so we can see what +-- we're going to join *to* before we get there. +-- +-- Because of that, it seemed easier to do it by hand than to try to +-- figre out a fold or something. +fixListNesting :: [B.Blocks] -> [B.Blocks] +fixListNesting [] = [] +fixListNesting [first] = [recurseOnList first] +-- fixListNesting nestall | trace ("\n\nfixListNesting: " ++ (show nestall)) False = undefined +-- fixListNesting nestall@(first:second:rest) = +fixListNesting (first:second:rest) = + let secondBlock = head $ B.toList second in + case secondBlock of + BulletList _ -> fixListNesting $ mappend (recurseOnList first) (recurseOnList second) : rest + OrderedList _ _ -> fixListNesting $ mappend (recurseOnList first) (recurseOnList second) : rest + _ -> recurseOnList first : fixListNesting (second:rest) + +-- This function walks the Block structure for fixListNesting, +-- because it's a bit complicated, what with converting to and from +-- lists and so on. +recurseOnList :: B.Blocks -> B.Blocks +-- recurseOnList item | trace ("rOL: " ++ (show $ length $ B.toList item) ++ ", " ++ (show $ B.toList item)) False = undefined +recurseOnList items + | length (B.toList items) == 1 = + let itemBlock = head $ B.toList items in + case itemBlock of + BulletList listItems -> B.bulletList $ fixListNesting $ map B.fromList listItems + OrderedList _ listItems -> B.orderedList $ fixListNesting $ map B.fromList listItems + _ -> items + + -- The otherwise works because we constructed the blocks, and we + -- know for a fact that no mappends have been run on them; each + -- Blocks consists of exactly one Block. + -- + -- Anything that's not like that has already been processed by + -- fixListNesting; don't bother to process it again. + | otherwise = items + + +-- Turn the list if list items into a tree by breaking off the first +-- item, splitting the remainder of the list into items that are in +-- the tree of the first item and those that aren't, wrapping the +-- tree of the first item in its list time, and recursing on both +-- sections. +spanFoldUpList :: ListNesting -> [(ListNesting, B.Blocks)] -> [B.Blocks] +spanFoldUpList _ [] = [] +spanFoldUpList ln [first] = + listWrap ln (fst first) [snd first] +spanFoldUpList ln (first:rest) = + let (span1, span2) = span (splitListNesting (fst first)) rest + newTree1 = listWrap ln (fst first) $ snd first : spanFoldUpList (fst first) span1 + newTree2 = spanFoldUpList ln span2 + in + newTree1 ++ newTree2 + +-- Decide if the second item should be in the tree of the first +-- item, which is true if the second item is at a deeper nesting +-- level and of the same type. +splitListNesting :: ListNesting -> (ListNesting, B.Blocks) -> Bool +splitListNesting ln1 (ln2, _) + | lnnest ln1 < lnnest ln2 = + True + | ln1 == ln2 = + True + | otherwise = + False + +-- If we've moved to a deeper nesting level, wrap the new level in +-- the appropriate type of list. +listWrap :: ListNesting -> ListNesting -> [B.Blocks] -> [B.Blocks] +listWrap upperLN curLN retTree = + if upperLN == curLN then + retTree + else + case lntype curLN of + None -> [] + Bullet -> [B.bulletList retTree] + Numbered -> [B.orderedList retTree] + +listItem :: PandocMonad m => TikiWikiParser m (ListNesting, B.Blocks) +listItem = choice [ + bulletItem + , numberedItem + ] + + +-- * Start each line +-- * with an asterisk (*). +-- ** More asterisks gives deeper +-- *** and deeper levels. +-- +bulletItem :: PandocMonad m => TikiWikiParser m (ListNesting, B.Blocks) +bulletItem = try $ do + prefix <- many1 $ char '*' + many1 $ char ' ' + content <- listItemLine (length prefix) + return (LN Bullet (length prefix), B.plain content) + +-- # Start each line +-- # with a number (1.). +-- ## More number signs gives deeper +-- ### and deeper +-- +numberedItem :: PandocMonad m => TikiWikiParser m (ListNesting, B.Blocks) +numberedItem = try $ do + prefix <- many1 $ char '#' + many1 $ char ' ' + content <- listItemLine (length prefix) + return (LN Numbered (length prefix), B.plain content) + +listItemLine :: PandocMonad m => Int -> TikiWikiParser m B.Inlines +listItemLine nest = lineContent >>= parseContent + where + lineContent = do + content <- anyLine + continuation <- optionMaybe listContinuation + return $ filterSpaces content ++ "\n" ++ Data.Maybe.fromMaybe "" continuation + filterSpaces = reverse . dropWhile (== ' ') . reverse + listContinuation = string (replicate nest '+') >> lineContent + parseContent x = do + parsed <- parseFromString (many1 inline) x + return $ mconcat parsed + +-- Turn the CODE macro attributes into Pandoc code block attributes. +mungeAttrs :: [(String, String)] -> (String, [String], [(String, String)]) +mungeAttrs rawAttrs = ("", classes, rawAttrs) + where + -- "colors" is TikiWiki CODE macro for "name of language to do + -- highlighting for"; turn the value into a class + color = fromMaybe "" $ lookup "colors" rawAttrs + -- ln = 1 means line numbering. It's also the default. So we + -- emit numberLines as a class unless ln = 0 + lnRaw = fromMaybe "1" $ lookup "ln" rawAttrs + ln = if lnRaw == "0" then + "" + else + "numberLines" + classes = filter (/= "") [color, ln] + +codeMacro :: PandocMonad m => TikiWikiParser m B.Blocks +codeMacro = try $ do + string "{CODE(" + rawAttrs <- macroAttrs + string ")}" + body <- manyTill anyChar (try (string "{CODE}")) + newline + if not (null rawAttrs) + then + return $ B.codeBlockWith (mungeAttrs rawAttrs) body + else + return $ B.codeBlock body + + +-- +-- inline parsers +-- + +inline :: PandocMonad m => TikiWikiParser m B.Inlines +inline = choice [ whitespace + , noparse + , strong + , emph + , nbsp + , image + , htmlComment + , strikeout + , code + , wikiLink + , notExternalLink + , externalLink + , superTag + , superMacro + , subTag + , subMacro + , escapedChar + , colored + , centered + , underlined + , boxed + , breakChars + , str + , symbol + ] <?> "inline" + +whitespace :: PandocMonad m => TikiWikiParser m B.Inlines +whitespace = lb <|> regsp + where lb = try $ skipMany spaceChar >> linebreak >> return B.space + regsp = try $ skipMany1 spaceChar >> return B.space + +-- UNSUPPORTED, as there doesn't seem to be any facility in calibre +-- for this +nbsp :: PandocMonad m => TikiWikiParser m B.Inlines +nbsp = try $ do + string "~hs~" + return $ B.str " NOT SUPPORTED BEGIN: ~hs~ (non-breaking space) :END " + +-- UNSUPPORTED, as the desired behaviour (that the data be +-- *retained* and stored as a comment) doesn't exist in calibre, and +-- silently throwing data out seemed bad. +htmlComment :: PandocMonad m => TikiWikiParser m B.Inlines +htmlComment = try $ do + string "~hc~" + inner <- many1 $ noneOf "~" + string "~/hc~" + return $ B.str $ " NOT SUPPORTED: ~hc~ (html comment opener) BEGIN: " ++ inner ++ " ~/hc~ :END " + +linebreak :: PandocMonad m => TikiWikiParser m B.Inlines +linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) + where lastNewline = eof >> return mempty + innerNewline = return B.space + +between :: (Monoid c, PandocMonad m, Show b) => TikiWikiParser m a -> TikiWikiParser m b -> (TikiWikiParser m b -> TikiWikiParser m c) -> TikiWikiParser m c +between start end p = + mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) + +enclosed :: (Monoid b, PandocMonad m, Show a) => TikiWikiParser m a -> (TikiWikiParser m a -> TikiWikiParser m b) -> TikiWikiParser m b +enclosed sep p = between sep (try $ sep <* endMarker) p + where + endMarker = lookAhead $ skip endSpace <|> skip (oneOf ".,!?:)|'_") <|> eof + endSpace = (spaceChar <|> newline) >> return B.space + + +nestedInlines :: (Show a, PandocMonad m) => TikiWikiParser m a -> TikiWikiParser m B.Inlines +nestedInlines end = innerSpace <|> nestedInline + where + innerSpace = try $ whitespace <* notFollowedBy end + nestedInline = notFollowedBy whitespace >> nested inline + +-- {img attId="39" imalign="right" link="http://info.tikiwiki.org" alt="Panama Hat"} +-- +-- {img attId="37", thumb="mouseover", styleimage="border", desc="150"} +-- +-- {img src="img/wiki_up/393px-Pears.jpg" thumb="y" imalign="center" stylebox="border" button="y" desc="Pretty pears" max="200" rel="box"} +-- +image :: PandocMonad m => TikiWikiParser m B.Inlines +image = try $ do + string "{img " + rawAttrs <- sepEndBy1 imageAttr spaces + string "}" + let src = fromMaybe "" $ lookup "src" rawAttrs + let title = fromMaybe src $ lookup "desc" rawAttrs + let alt = fromMaybe title $ lookup "alt" rawAttrs + let classes = map fst $ filter (\(_,b) -> b == "" || b == "y") rawAttrs + if not (null src) + then + return $ B.imageWith ("", classes, rawAttrs) src title (B.str alt) + else + return $ B.str $ " NOT SUPPORTED: image without src attribute BEGIN: {img " ++ printAttrs rawAttrs ++ "} :END " + where + printAttrs attrs = unwords $ map (\(a, b) -> a ++ "=\"" ++ b ++ "\"") attrs + +imageAttr :: PandocMonad m => TikiWikiParser m (String, String) +imageAttr = try $ do + key <- many1 (noneOf "=} \t\n") + char '=' + optional $ char '"' + value <- many1 (noneOf "}\"\n") + optional $ char '"' + optional $ char ',' + return (key, value) + + +-- __strong__ +strong :: PandocMonad m => TikiWikiParser m B.Inlines +strong = try $ fmap B.strong (enclosed (string "__") nestedInlines) + +-- ''emph'' +emph :: PandocMonad m => TikiWikiParser m B.Inlines +emph = try $ fmap B.emph (enclosed (string "''") nestedInlines) + +-- ~246~ +escapedChar :: PandocMonad m => TikiWikiParser m B.Inlines +escapedChar = try $ do + string "~" + inner <- many1 $ oneOf "0123456789" + string "~" + return $B.str [toEnum (read inner :: Int) :: Char] + +-- UNSUPPORTED, as there doesn't seem to be any facility in calibre +-- for this +centered :: PandocMonad m => TikiWikiParser m B.Inlines +centered = try $ do + string "::" + inner <- many1 $ noneOf ":\n" + string "::" + return $ B.str $ " NOT SUPPORTED: :: (centered) BEGIN: ::" ++ inner ++ ":: :END " + +-- UNSUPPORTED, as there doesn't seem to be any facility in calibre +-- for this +colored :: PandocMonad m => TikiWikiParser m B.Inlines +colored = try $ do + string "~~" + inner <- many1 $ noneOf "~\n" + string "~~" + return $ B.str $ " NOT SUPPORTED: ~~ (colored) BEGIN: ~~" ++ inner ++ "~~ :END " + +-- UNSUPPORTED, as there doesn't seem to be any facility in calibre +-- for this +underlined :: PandocMonad m => TikiWikiParser m B.Inlines +underlined = try $ do + string "===" + inner <- many1 $ noneOf "=\n" + string "===" + return $ B.str $ " NOT SUPPORTED: ==== (underlined) BEGIN: ===" ++ inner ++ "=== :END " + +-- UNSUPPORTED, as there doesn't seem to be any facility in calibre +-- for this +boxed :: PandocMonad m => TikiWikiParser m B.Inlines +boxed = try $ do + string "^" + inner <- many1 $ noneOf "^\n" + string "^" + return $ B.str $ " NOT SUPPORTED: ^ (boxed) BEGIN: ^" ++ inner ++ "^ :END " + +-- --text-- +strikeout :: PandocMonad m => TikiWikiParser m B.Inlines +strikeout = try $ fmap B.strikeout (enclosed (string "--") nestedInlines) + +nestedString :: (Show a, PandocMonad m) => TikiWikiParser m a -> TikiWikiParser m String +nestedString end = innerSpace <|> count 1 nonspaceChar + where + innerSpace = try $ many1 spaceChar <* notFollowedBy end + +breakChars :: PandocMonad m => TikiWikiParser m B.Inlines +breakChars = try $ string "%%%" >> return B.linebreak + +-- superscript: foo{TAG(tag=>sup)}super{TAG}foo / bar{SUP()}super2{SUP}bar +superTag :: PandocMonad m => TikiWikiParser m B.Inlines +superTag = try $ fmap (B.superscript . B.text . fromEntities) ( between (string "{TAG(tag=>sup)}") (string "{TAG}") nestedString) + +superMacro :: PandocMonad m => TikiWikiParser m B.Inlines +superMacro = try $ do + string "{SUP(" + manyTill anyChar (string ")}") + body <- manyTill anyChar (string "{SUP}") + return $ B.superscript $ B.text body + +-- subscript: baz{TAG(tag=>sub)}sub{TAG}qux / qux{SUB()}sub2{SUB}qux +subTag :: PandocMonad m => TikiWikiParser m B.Inlines +subTag = try $ fmap (B.subscript . B.text . fromEntities) ( between (string "{TAG(tag=>sub)}") (string "{TAG}") nestedString) + +subMacro :: PandocMonad m => TikiWikiParser m B.Inlines +subMacro = try $ do + string "{SUB(" + manyTill anyChar (string ")}") + body <- manyTill anyChar (string "{SUB}") + return $ B.subscript $ B.text body + +-- -+text+- +code :: PandocMonad m => TikiWikiParser m B.Inlines +code = try $ fmap (B.code . fromEntities) ( between (string "-+") (string "+-") nestedString) + +macroAttr :: PandocMonad m => TikiWikiParser m (String, String) +macroAttr = try $ do + key <- many1 (noneOf "=)") + char '=' + optional $ char '"' + value <- many1 (noneOf " )\"") + optional $ char '"' + return (key, value) + +macroAttrs :: PandocMonad m => TikiWikiParser m [(String, String)] +macroAttrs = try $ sepEndBy macroAttr spaces + +-- ~np~ __not bold__ ~/np~ +noparse :: PandocMonad m => TikiWikiParser m B.Inlines +noparse = try $ do + string "~np~" + body <- manyTill anyChar (string "~/np~") + return $ B.str body + +str :: PandocMonad m => TikiWikiParser m B.Inlines +str = fmap B.str (many1 alphaNum <|> count 1 characterReference) + +symbol :: PandocMonad m => TikiWikiParser m B.Inlines +symbol = fmap B.str (count 1 nonspaceChar) + +-- [[not a link] +notExternalLink :: PandocMonad m => TikiWikiParser m B.Inlines +notExternalLink = try $ do + start <- string "[[" + body <- many (noneOf "\n[]") + end <- string "]" + return $ B.text (start ++ body ++ end) + +-- [http://www.somesite.org url|Some Site title] +-- ((internal link)) +-- +-- The ((...)) wiki links and [...] external links are handled +-- exactly the same; this abstracts that out +makeLink :: PandocMonad m => String -> String -> String -> TikiWikiParser m B.Inlines +makeLink start middle end = try $ do + st <- getState + guard $ stateAllowLinks st + setState $ st{ stateAllowLinks = False } + (url, title, anchor) <- wikiLinkText start middle end + parsedTitle <- parseFromString (many1 inline) title + setState $ st{ stateAllowLinks = True } + return $ B.link (url++anchor) "" $mconcat parsedTitle + +wikiLinkText :: PandocMonad m => String -> String -> String -> TikiWikiParser m (String, String, String) +wikiLinkText start middle end = do + string start + url <- many1 (noneOf $ middle ++ "\n") + seg1 <- option url linkContent + seg2 <- option "" linkContent + string end + if seg2 /= "" + then + return (url, seg2, seg1) + else + return (url, seg1, "") + where + linkContent = do + char '|' + many (noneOf middle) + +externalLink :: PandocMonad m => TikiWikiParser m B.Inlines +externalLink = makeLink "[" "]|" "]" + +-- NB: this wiki linking is unlikely to work for anyone besides me +-- (rlpowell); it happens to work for me because my Hakyll code has +-- post-processing that treats pandoc .md titles as valid link +-- targets, so something like +-- [see also this other post](My Other Page) is perfectly valid. +wikiLink :: PandocMonad m => TikiWikiParser m B.Inlines +wikiLink = makeLink "((" ")|" "))" diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs index 0aafc83c7..f4dda7a11 100644 --- a/src/Text/Pandoc/Readers/Txt2Tags.hs +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE ViewPatterns #-} {- Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> @@ -29,39 +28,39 @@ Conversion of txt2tags formatted plain text to 'Pandoc' document. module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags , getT2TMeta , T2TMeta (..) - , readTxt2TagsNoMacros) + ) where -import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder ( Inlines, Blocks, trimInlines ) -import Data.Monoid ((<>)) -import Text.Pandoc.Definition -import Text.Pandoc.Options -import Text.Pandoc.Shared (escapeURI,compactify', compactify'DL) -import Text.Pandoc.Parsing hiding (space, spaces, uri, macro) +import Control.Monad (guard, void, when) +import Control.Monad.Except (catchError, throwError) +import Control.Monad.Reader (Reader, asks, runReader) import Data.Char (toLower) -import Data.List (transpose, intersperse, intercalate) -import Data.Maybe (fromMaybe) ---import Network.URI (isURI) -- Not sure whether to use this function -import Control.Monad (void, guard, when) import Data.Default -import Control.Monad.Reader (Reader, runReader, asks) -import Text.Pandoc.Error - -import Data.Time.LocalTime (getZonedTime) -import System.Directory(getModificationTime) +import Data.List (intercalate, transpose) +import Data.Maybe (fromMaybe) +import Data.Monoid ((<>)) +import Data.Text (Text) +import qualified Data.Text as T import Data.Time.Format (formatTime) +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P import Text.Pandoc.Compat.Time (defaultTimeLocale) -import System.IO.Error (catchIOError) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (space, spaces, uri) +import Text.Pandoc.Shared (compactify, compactifyDL, crFilter, escapeURI, + underlineSpan) type T2T = ParserT String ParserState (Reader T2TMeta) -- | An object for the T2T macros meta information -- the contents of each field is simply substituted verbatim into the file data T2TMeta = T2TMeta { - date :: String -- ^ Current date - , mtime :: String -- ^ Last modification time of infile - , infile :: FilePath -- ^ Input file + date :: String -- ^ Current date + , mtime :: String -- ^ Last modification time of infile + , infile :: FilePath -- ^ Input file , outfile :: FilePath -- ^ Output file } deriving Show @@ -69,26 +68,38 @@ instance Default T2TMeta where def = T2TMeta "" "" "" "" -- | Get the meta information required by Txt2Tags macros -getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta -getT2TMeta inps out = do - curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime +getT2TMeta :: PandocMonad m => m T2TMeta +getT2TMeta = do + inps <- P.getInputFiles + outp <- fromMaybe "" <$> P.getOutputFile + curDate <- formatTime defaultTimeLocale "%F" <$> P.getZonedTime let getModTime = fmap (formatTime defaultTimeLocale "%T") . - getModificationTime + P.getModificationTime curMtime <- case inps of - [] -> formatTime defaultTimeLocale "%T" <$> getZonedTime - _ -> catchIOError + [] -> formatTime defaultTimeLocale "%T" <$> P.getZonedTime + _ -> catchError (maximum <$> mapM getModTime inps) (const (return "")) - return $ T2TMeta curDate curMtime (intercalate ", " inps) out + return $ T2TMeta curDate curMtime (intercalate ", " inps) outp -- | Read Txt2Tags from an input string returning a Pandoc document -readTxt2Tags :: T2TMeta -> ReaderOptions -> String -> Either PandocError Pandoc -readTxt2Tags t opts s = flip runReader t $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n") +readTxt2Tags :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readTxt2Tags opts s = do + meta <- getT2TMeta + let parsed = flip runReader meta $ + readWithM parseT2T (def {stateOptions = opts}) $ + T.unpack (crFilter s) ++ "\n\n" + case parsed of + Right result -> return result + Left e -> throwError e -- | Read Txt2Tags (ignoring all macros) from an input string returning -- a Pandoc document -readTxt2TagsNoMacros :: ReaderOptions -> String -> Either PandocError Pandoc -readTxt2TagsNoMacros = readTxt2Tags def +-- readTxt2TagsNoMacros :: PandocMonad m => ReaderOptions -> String -> m Pandoc +-- readTxt2TagsNoMacros = readTxt2Tags parseT2T :: T2T Pandoc parseT2T = do @@ -137,7 +148,7 @@ setting = do string "%!" keyword <- ignoreSpacesCap (many1 alphaNum) char ':' - value <- ignoreSpacesCap (manyTill anyChar (newline)) + value <- ignoreSpacesCap (manyTill anyChar newline) return (keyword, value) -- Blocks @@ -146,7 +157,7 @@ parseBlocks :: T2T Blocks parseBlocks = mconcat <$> manyTill block eof block :: T2T Blocks -block = do +block = choice [ mempty <$ blanklines , quote @@ -184,7 +195,7 @@ para = try $ do listStart = try bulletListStart <|> orderedListStart commentBlock :: T2T Blocks -commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment +commentBlock = try (blockMarkupArea anyLine (const mempty) "%%%") <|> comment -- Seperator and Strong line treated the same hrule :: T2T Blocks @@ -198,7 +209,7 @@ quote :: T2T Blocks quote = try $ do lookAhead tab rawQuote <- many1 (tab *> optional spaces *> anyLine) - contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n") + contents <- parseFromString' parseBlocks (intercalate "\n" rawQuote ++ "\n\n") return $ B.blockQuote contents commentLine :: T2T Inlines @@ -210,16 +221,16 @@ list :: T2T Blocks list = choice [bulletList, orderedList, definitionList] bulletList :: T2T Blocks -bulletList = B.bulletList . compactify' +bulletList = B.bulletList . compactify <$> many1 (listItem bulletListStart parseBlocks) orderedList :: T2T Blocks -orderedList = B.orderedList . compactify' +orderedList = B.orderedList . compactify <$> many1 (listItem orderedListStart parseBlocks) definitionList :: T2T Blocks -definitionList = try $ do - B.definitionList . compactify'DL <$> +definitionList = try $ + B.definitionList . compactifyDL <$> many1 (listItem definitionListStart definitionListEnd) definitionListEnd :: T2T (Inlines, [Blocks]) @@ -250,7 +261,7 @@ listItem start end = try $ do firstLine <- anyLineNewline blank <- option "" ("\n" <$ blankline) rest <- concat <$> many (listContinuation markerLength) - parseFromString end $ firstLine ++ blank ++ rest + parseFromString' end $ firstLine ++ blank ++ rest -- continuation of a list item - indented and separated by blankline or endline. -- Note: nested lists are parsed as continuations. @@ -262,12 +273,6 @@ listContinuation markerLength = try $ <*> many blankline) where listLine = try $ indentWith markerLength *> anyLineNewline -anyLineNewline :: T2T String -anyLineNewline = (++ "\n") <$> anyLine - -indentWith :: Int -> T2T String -indentWith n = count n space - -- Table table :: T2T Blocks @@ -276,17 +281,17 @@ table = try $ do rows <- many1 (many commentLine *> tableRow) let columns = transpose rows let ncolumns = length columns - let aligns = map (foldr1 findAlign) (map (map fst) columns) + let aligns = map (foldr1 findAlign . map fst) columns let rows' = map (map snd) rows let size = maximum (map length rows') let rowsPadded = map (pad size) rows' - let headerPadded = if (not (null tableHeader)) then pad size tableHeader else mempty + let headerPadded = if null tableHeader then mempty else pad size tableHeader return $ B.table mempty (zip aligns (replicate ncolumns 0.0)) headerPadded rowsPadded pad :: (Monoid a) => Int -> [a] -> [a] -pad n xs = xs ++ (replicate (n - length xs) mempty) +pad n xs = xs ++ replicate (n - length xs) mempty findAlign :: Alignment -> Alignment -> Alignment @@ -309,7 +314,7 @@ genericRow start = try $ do tableCell :: T2T (Alignment, Blocks) tableCell = try $ do leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead - content <- (manyTill inline (try $ lookAhead (cellEnd))) + content <- manyTill inline (try $ lookAhead cellEnd) rightSpaces <- length <$> many space let align = case compare leftSpaces rightSpaces of @@ -317,9 +322,9 @@ tableCell = try $ do EQ -> AlignCenter GT -> AlignRight endOfCell - return $ (align, B.plain (B.trimInlines $ mconcat content)) + return (align, B.plain (B.trimInlines $ mconcat content)) where - cellEnd = (void newline <|> (many1 space *> endOfCell)) + cellEnd = void newline <|> (many1 space *> endOfCell) endOfCell :: T2T () endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline) @@ -342,10 +347,10 @@ taggedBlock = do genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s -blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks -blockMarkupArea p f s = try $ (do +blockMarkupArea :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupArea p f s = try (do string s *> blankline - f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline)))) + f . mconcat <$> manyTill p (eof <|> void (string s *> blankline))) blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks blockMarkupLine p f s = try (f <$> (string s *> space *> p)) @@ -363,7 +368,7 @@ parseInlines :: T2T Inlines parseInlines = trimInlines . mconcat <$> many1 inline inline :: T2T Inlines -inline = do +inline = choice [ endline , macro @@ -385,16 +390,16 @@ inline = do ] bold :: T2T Inlines -bold = inlineMarkup inline B.strong '*' (B.str) +bold = inlineMarkup inline B.strong '*' B.str underline :: T2T Inlines -underline = inlineMarkup inline B.emph '_' (B.str) +underline = inlineMarkup inline underlineSpan '_' B.str strike :: T2T Inlines -strike = inlineMarkup inline B.strikeout '-' (B.str) +strike = inlineMarkup inline B.strikeout '-' B.str italic :: T2T Inlines -italic = inlineMarkup inline B.emph '/' (B.str) +italic = inlineMarkup inline B.emph '/' B.str code :: T2T Inlines code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id @@ -413,7 +418,7 @@ tagged = do -- Glued meaning that markup must be tight to content -- Markup can't pass newlines inlineMarkup :: Monoid a - => (T2T a) -- Content parser + => T2T a -- Content parser -> (a -> Inlines) -- Constructor -> Char -- Fence -> (String -> a) -- Special Case to handle ****** @@ -425,20 +430,24 @@ inlineMarkup p f c special = try $ do when (l == 2) (void $ notFollowedBy space) -- We must make sure that there is no space before the start of the -- closing tags - body <- optionMaybe (try $ manyTill (noneOf "\n\r") $ + body <- optionMaybe (try $ manyTill (noneOf "\n\r") (try $ lookAhead (noneOf " " >> string [c,c] ))) case body of Just middle -> do lastChar <- anyChar end <- many1 (char c) - let parser inp = parseFromString (mconcat <$> many p) inp - let start' = special (drop 2 start) + let parser inp = parseFromString' (mconcat <$> many p) inp + let start' = case drop 2 start of + "" -> mempty + xs -> special xs body' <- parser (middle ++ [lastChar]) - let end' = special (drop 2 end) + let end' = case drop 2 end of + "" -> mempty + xs -> special xs return $ f (start' <> body' <> end') Nothing -> do -- Either bad or case such as ***** guard (l >= 5) - let body' = (replicate (l - 4) c) + let body' = replicate (l - 4) c return $ f (special body') link :: T2T Inlines @@ -453,8 +462,8 @@ titleLink = try $ do guard (length tokens >= 2) char ']' let link' = last tokens - guard (length link' > 0) - let tit = concat (intersperse " " (init tokens)) + guard $ not $ null link' + let tit = unwords (init tokens) return $ B.link link' "" (B.text tit) -- Link with image @@ -479,7 +488,7 @@ macro = try $ do -- raw URLs in text are automatically linked url :: T2T Inlines url = try $ do - (rawUrl, escapedUrl) <- (try uri <|> emailAddress) + (rawUrl, escapedUrl) <- try uri <|> emailAddress return $ B.link rawUrl "" (B.str escapedUrl) uri :: T2T (String, String) @@ -520,8 +529,7 @@ image = try $ do -- List taken from txt2tags source let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"] char '[' - path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions)) - ext <- oneOfStrings extensions + (path, ext) <- manyUntil (noneOf "\n\t\r ") (oneOfStrings extensions) char ']' return $ B.image (path ++ ext) "" mempty @@ -550,11 +558,10 @@ endline = try $ do notFollowedBy quote notFollowedBy list notFollowedBy table - return $ B.softbreak + return B.softbreak str :: T2T Inlines -str = try $ do - B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") +str = try $ B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") whitespace :: T2T Inlines whitespace = try $ B.space <$ spaceChar diff --git a/src/Text/Pandoc/Readers/Vimwiki.hs b/src/Text/Pandoc/Readers/Vimwiki.hs new file mode 100644 index 000000000..d717a1ba8 --- /dev/null +++ b/src/Text/Pandoc/Readers/Vimwiki.hs @@ -0,0 +1,673 @@ +{- + Copyright (C) 2017-2018 Yuchen Pei <me@ypei.me> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Vimwiki + Copyright : Copyright (C) 2017-2018 Yuchen Pei + License : GNU GPL, version 2 or above + + Maintainer : Yuchen Pei <me@ypei.me> + Stability : alpha + Portability : portable + +Conversion of vimwiki text to 'Pandoc' document. +-} +{-- +[X]: implemented +[O]: not implemented +* block parsers: + * [X] header + * [X] hrule + * [X] comment + * [X] blockquote + * [X] preformatted -- using codeblock + * [X] displaymath + * [X] bulletlist / orderedlist + * [X] todo lists -- using span. + * [X] table + * [X] centered table -- using div + * [O] colspan and rowspan -- see issue #1024 + * [X] paragraph + * [X] definition list +* inline parsers: + * [X] bareURL + * [X] strong + * [X] emph + * [X] strikeout + * [X] code + * [X] link + * [X] image + * [X] inline math + * [X] tag + * [X] sub- and super-scripts +* misc: + * [X] `TODO:` mark + * [X] metadata placeholders: %title and %date + * [O] control placeholders: %template and %nohtml -- ignored +--} + +module Text.Pandoc.Readers.Vimwiki ( readVimwiki + ) where +import Control.Monad (guard) +import Control.Monad.Except (throwError) +import Data.Default +import Data.List (isInfixOf, isPrefixOf) +import Data.Maybe +import Data.Monoid ((<>)) +import Data.Text (Text, unpack) +import Text.Pandoc.Builder (Blocks, Inlines, fromList, toList, trimInlines) +import qualified Text.Pandoc.Builder as B (blockQuote, bulletList, code, + codeBlockWith, definitionList, + displayMath, divWith, emph, + headerWith, horizontalRule, image, + imageWith, link, math, orderedList, + para, plain, setMeta, simpleTable, + softbreak, space, spanWith, str, + strikeout, strong, subscript, + superscript) +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.Definition (Attr, Block (BulletList, OrderedList), + Inline (Space), ListNumberDelim (..), + ListNumberStyle (..), Meta, Pandoc (..), + nullMeta) +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Parsing (F, ParserState, ParserT, blanklines, emailAddress, + many1Till, orderedListMarker, readWithM, + registerHeader, runF, spaceChar, stateMeta', + stateOptions, uri) +import Text.Pandoc.Shared (crFilter, splitBy, stringify, stripFirstAndLast) +import Text.Parsec.Char (alphaNum, anyChar, char, newline, noneOf, oneOf, space, + spaces, string) +import Text.Parsec.Combinator (between, choice, count, eof, lookAhead, many1, + manyTill, notFollowedBy, option, skipMany1) +import Text.Parsec.Prim (getState, many, try, updateState, (<|>)) + +readVimwiki :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readVimwiki opts s = do + res <- readWithM parseVimwiki def{ stateOptions = opts } + (unpack (crFilter s)) + case res of + Left e -> throwError e + Right result -> return result + +type VwParser = ParserT [Char] ParserState + + +-- constants + +specialChars :: [Char] +specialChars = "=*-#[]_~{}`$|:%^," + +spaceChars :: [Char] +spaceChars = " \t\n" + +-- main parser + +parseVimwiki :: PandocMonad m => VwParser m Pandoc +parseVimwiki = do + bs <- mconcat <$> many block + spaces + eof + st <- getState + let meta = runF (stateMeta' st) st + return $ Pandoc meta (toList bs) + +-- block parser + +block :: PandocMonad m => VwParser m Blocks +block = do + res <- choice [ mempty <$ blanklines + , header + , hrule + , mempty <$ comment + , mixedList + , preformatted + , displayMath + , table + , mempty <$ placeholder + , blockQuote + , definitionList + , para + ] + trace (take 60 $ show $ toList res) + return res + +blockML :: PandocMonad m => VwParser m Blocks +blockML = choice [preformatted, displayMath, table] + +header :: PandocMonad m => VwParser m Blocks +header = try $ do + sp <- many spaceChar + eqs <- many1 (char '=') + spaceChar + let lev = length eqs + guard $ lev <= 6 + contents <- trimInlines . mconcat <$> manyTill inline (try $ spaceChar + >> string eqs >> many spaceChar >> newline) + attr <- registerHeader (makeId contents, + if sp == "" then [] else ["justcenter"], []) contents + return $ B.headerWith attr lev contents + +para :: PandocMonad m => VwParser m Blocks +para = try $ do + contents <- trimInlines . mconcat <$> many1 inline + if all (==Space) (toList contents) + then return mempty + else return $ B.para contents + +hrule :: PandocMonad m => VwParser m Blocks +hrule = try $ B.horizontalRule <$ (string "----" >> many (char '-') >> newline) + +comment :: PandocMonad m => VwParser m () +comment = try $ do + many spaceChar >> string "%%" >> many (noneOf "\n") + return () + +blockQuote :: PandocMonad m => VwParser m Blocks +blockQuote = try $ do + string " " + contents <- trimInlines . mconcat <$> many1 inlineBQ + if all (==Space) (toList contents) + then return mempty + else return $ B.blockQuote $ B.plain contents + +definitionList :: PandocMonad m => VwParser m Blocks +definitionList = try $ + B.definitionList <$> many1 (dlItemWithDT <|> dlItemWithoutDT) + +dlItemWithDT :: PandocMonad m => VwParser m (Inlines, [Blocks]) +dlItemWithDT = do + dt <- definitionTerm + dds <- many definitionDef + return (dt, dds) + +dlItemWithoutDT :: PandocMonad m => VwParser m (Inlines, [Blocks]) +dlItemWithoutDT = do + dds <- many1 definitionDef + return (mempty, dds) + +definitionDef :: PandocMonad m => VwParser m Blocks +definitionDef = try $ + notFollowedBy definitionTerm >> many spaceChar + >> (definitionDef1 <|> definitionDef2) + +definitionDef1 :: PandocMonad m => VwParser m Blocks +definitionDef1 = try $ mempty <$ defMarkerE + +definitionDef2 :: PandocMonad m => VwParser m Blocks +definitionDef2 = try $ B.plain <$> + (defMarkerM >> (trimInlines . mconcat <$> many inline') <* newline) + + +definitionTerm :: PandocMonad m => VwParser m Inlines +definitionTerm = try $ do + x <- definitionTerm1 <|> definitionTerm2 + guard (stringify x /= "") + return x + +definitionTerm1 :: PandocMonad m => VwParser m Inlines +definitionTerm1 = try $ + trimInlines . mconcat <$> manyTill inline' (try defMarkerE) + +definitionTerm2 :: PandocMonad m => VwParser m Inlines +definitionTerm2 = try $ trimInlines . mconcat <$> manyTill inline' + (try $lookAhead (defMarkerM >> notFollowedBy hasDefMarkerM)) + +defMarkerM :: PandocMonad m => VwParser m Char +defMarkerM = string "::" >> spaceChar + +defMarkerE :: PandocMonad m => VwParser m Char +defMarkerE = string "::" >> newline + +hasDefMarkerM :: PandocMonad m => VwParser m String +hasDefMarkerM = manyTill (noneOf "\n") (try defMarkerM) + +preformatted :: PandocMonad m => VwParser m Blocks +preformatted = try $ do + many spaceChar >> string "{{{" + attrText <- many (noneOf "\n") + lookAhead newline + contents <- manyTill anyChar (try (char '\n' >> many spaceChar >> string "}}}" + >> many spaceChar >> newline)) + if (contents /= "") && (head contents == '\n') + then return $ B.codeBlockWith (makeAttr attrText) (tail contents) + else return $ B.codeBlockWith (makeAttr attrText) contents + +makeAttr :: String -> Attr +makeAttr s = + let xs = splitBy (`elem` " \t") s in + ("", [], mapMaybe nameValue xs) + +nameValue :: String -> Maybe (String, String) +nameValue s = + let t = splitBy (== '=') s in + if length t /= 2 + then Nothing + else let (a, b) = (head t, last t) in + if (length b < 2) || ((head b, last b) /= ('"', '"')) + then Nothing + else Just (a, stripFirstAndLast b) + + +displayMath :: PandocMonad m => VwParser m Blocks +displayMath = try $ do + many spaceChar >> string "{{$" + mathTag <- option "" mathTagParser + many space + contents <- manyTill anyChar (try (char '\n' >> many spaceChar >> string "}}$" + >> many spaceChar >> newline)) + let contentsWithTags + | mathTag == "" = contents + | otherwise = "\\begin{" ++ mathTag ++ "}\n" ++ contents + ++ "\n\\end{" ++ mathTag ++ "}" + return $ B.para $ B.displayMath contentsWithTags + + +mathTagLaTeX :: String -> String +mathTagLaTeX s = case s of + "equation" -> "" + "equation*" -> "" + "gather" -> "gathered" + "gather*" -> "gathered" + "multline" -> "gathered" + "multline*" -> "gathered" + "eqnarray" -> "aligned" + "eqnarray*" -> "aligned" + "align" -> "aligned" + "align*" -> "aligned" + "alignat" -> "aligned" + "alignat*" -> "aligned" + _ -> s + + +mixedList :: PandocMonad m => VwParser m Blocks +mixedList = try $ do + (bl, _) <- mixedList' (-1) + return $ head bl + +mixedList' :: PandocMonad m => Int -> VwParser m ([Blocks], Int) +mixedList' prevInd = do + (curInd, builder) <- option (-1, "na") (lookAhead listStart) + if curInd < prevInd + then return ([], curInd) + else do + listStart + curLine <- listItemContent + let listBuilder = + if builder == "ul" then B.bulletList else B.orderedList + (subList, lowInd) <- mixedList' curInd + if lowInd >= curInd + then do + (sameIndList, endInd) <- mixedList' lowInd + let curList = combineList curLine subList ++ sameIndList + if curInd > prevInd + then return ([listBuilder curList], endInd) + else return (curList, endInd) + else do + let (curList, endInd) = (combineList curLine subList, + lowInd) + if curInd > prevInd + then return ([listBuilder curList], endInd) + else return (curList, endInd) + +plainInlineML' :: PandocMonad m => Inlines -> VwParser m Blocks +plainInlineML' w = do + xs <- many inlineML + newline + return $ B.plain $ trimInlines $ mconcat $ w:xs + +plainInlineML :: PandocMonad m => VwParser m Blocks +plainInlineML = notFollowedBy listStart >> spaceChar >> plainInlineML' mempty + + +listItemContent :: PandocMonad m => VwParser m Blocks +listItemContent = try $ do + w <- option mempty listTodoMarker + x <- plainInlineML' w + y <- many blocksThenInline + z <- many blockML + return $ mconcat $ x:y ++ z + +blocksThenInline :: PandocMonad m => VwParser m Blocks +blocksThenInline = try $ do + y <- many1 blockML + x <- plainInlineML + return $ mconcat $ y ++ [x] + +listTodoMarker :: PandocMonad m => VwParser m Inlines +listTodoMarker = try $ do + x <- between (many spaceChar >> char '[') (char ']' >> spaceChar) + (oneOf " .oOX") + return $ makeListMarkerSpan x + +makeListMarkerSpan :: Char -> Inlines +makeListMarkerSpan x = + let cl = case x of + ' ' -> "done0" + '.' -> "done1" + 'o' -> "done2" + 'O' -> "done3" + 'X' -> "done4" + _ -> "" + in + B.spanWith ("", [cl], []) mempty + +combineList :: Blocks -> [Blocks] -> [Blocks] +combineList x [y] = case toList y of + [BulletList z] -> [fromList $ toList x + ++ [BulletList z]] + [OrderedList attr z] -> [fromList $ toList x + ++ [OrderedList attr z]] + _ -> x:[y] +combineList x xs = x:xs + +listStart :: PandocMonad m => VwParser m (Int, String) +listStart = try $ do + s <- many spaceChar + listType <- bulletListMarkers <|> orderedListMarkers + spaceChar + return (length s, listType) + +bulletListMarkers :: PandocMonad m => VwParser m String +bulletListMarkers = "ul" <$ (char '*' <|> char '-') + +orderedListMarkers :: PandocMonad m => VwParser m String +orderedListMarkers = + ("ol" <$choice (orderedListMarker Decimal Period:(($OneParen) . orderedListMarker <$> [Decimal, LowerRoman, UpperRoman, LowerAlpha, UpperAlpha]))) + <|> ("ol" <$ char '#') + +--many need trimInlines +table :: PandocMonad m => VwParser m Blocks +table = try $ do + indent <- lookAhead (many spaceChar) + (th, trs) <- table1 <|> table2 + let tab = B.simpleTable th trs + if indent == "" + then return tab + else return $ B.divWith ("", ["center"], []) tab + +-- table with header +table1 :: PandocMonad m => VwParser m ([Blocks], [[Blocks]]) +table1 = try $ do + th <- tableRow + many1 tableHeaderSeparator + trs <- many tableRow + return (th, trs) + +-- headerless table +table2 :: PandocMonad m => VwParser m ([Blocks], [[Blocks]]) +table2 = try $ do + trs <- many1 tableRow + return (replicate (length $ head trs) mempty, trs) + +tableHeaderSeparator :: PandocMonad m => VwParser m () +tableHeaderSeparator = try $ do + many spaceChar >> char '|' >> many1 (many1 (char '-') >> char '|') + >> many spaceChar >> newline + return () + +tableRow :: PandocMonad m => VwParser m [Blocks] +tableRow = try $ do + many spaceChar >> char '|' + s <- lookAhead $ manyTill anyChar (try (char '|' >> many spaceChar + >> newline)) + guard $ not $ "||" `isInfixOf` ("|" ++ s ++ "|") + tr <- many tableCell + many spaceChar >> char '\n' + return tr + +tableCell :: PandocMonad m => VwParser m Blocks +tableCell = try $ + B.plain . trimInlines . mconcat <$> manyTill inline' (char '|') + +placeholder :: PandocMonad m => VwParser m () +placeholder = try $ + choice (ph <$> ["title", "date"]) <|> noHtmlPh <|> templatePh + +ph :: PandocMonad m => String -> VwParser m () +ph s = try $ do + many spaceChar >>string ('%':s) >> spaceChar + contents <- trimInlines . mconcat <$> manyTill inline (lookAhead newline) + --use lookAhead because of placeholder in the whitespace parser + let meta' = return $ B.setMeta s contents nullMeta :: F Meta + updateState $ \st -> st { stateMeta' = stateMeta' st <> meta' } + +noHtmlPh :: PandocMonad m => VwParser m () +noHtmlPh = try $ + () <$ (many spaceChar >> string "%nohtml" >> many spaceChar + >> lookAhead newline) + +templatePh :: PandocMonad m => VwParser m () +templatePh = try $ + () <$ (many spaceChar >> string "%template" >>many (noneOf "\n") + >> lookAhead newline) + +-- inline parser + +inline :: PandocMonad m => VwParser m Inlines +inline = choice $ whitespace endlineP:inlineList + +inlineList :: PandocMonad m => [VwParser m Inlines] +inlineList = [ bareURL + , todoMark + , str + , strong + , emph + , strikeout + , code + , link + , image + , inlineMath + , tag + , superscript + , subscript + , special + ] + +-- inline parser without softbreaks or comment breaks +inline' :: PandocMonad m => VwParser m Inlines +inline' = choice $ whitespace':inlineList + +-- inline parser for blockquotes +inlineBQ :: PandocMonad m => VwParser m Inlines +inlineBQ = choice $ whitespace endlineBQ:inlineList + +-- inline parser for mixedlists +inlineML :: PandocMonad m => VwParser m Inlines +inlineML = choice $ whitespace endlineML:inlineList + +str :: PandocMonad m => VwParser m Inlines +str = B.str <$>many1 (noneOf $ spaceChars ++ specialChars) + +whitespace :: PandocMonad m => VwParser m () -> VwParser m Inlines +whitespace endline = B.space <$ (skipMany1 spaceChar <|> + try (newline >> (comment <|> placeholder))) + <|> B.softbreak <$ endline + +whitespace' :: PandocMonad m => VwParser m Inlines +whitespace' = B.space <$ skipMany1 spaceChar + +special :: PandocMonad m => VwParser m Inlines +special = B.str <$> count 1 (oneOf specialChars) + +bareURL :: PandocMonad m => VwParser m Inlines +bareURL = try $ do + (orig, src) <- uri <|> emailAddress + return $ B.link src "" (B.str orig) + +strong :: PandocMonad m => VwParser m Inlines +strong = try $ do + s <- lookAhead $ between (char '*') (char '*') (many1 $ noneOf "*") + guard $ (head s `notElem` spaceChars) + && (last s `notElem` spaceChars) + char '*' + contents <- mconcat <$>manyTill inline' (char '*' + >> notFollowedBy alphaNum) + return $ B.spanWith (makeId contents, [], []) mempty + <> B.strong contents + +makeId :: Inlines -> String +makeId i = concat (stringify <$> toList i) + +emph :: PandocMonad m => VwParser m Inlines +emph = try $ do + s <- lookAhead $ between (char '_') (char '_') (many1 $ noneOf "_") + guard $ (head s `notElem` spaceChars) + && (last s `notElem` spaceChars) + char '_' + contents <- mconcat <$>manyTill inline' (char '_' + >> notFollowedBy alphaNum) + return $ B.emph contents + +strikeout :: PandocMonad m => VwParser m Inlines +strikeout = try $ do + string "~~" + contents <- mconcat <$>many1Till inline' (string "~~") + return $ B.strikeout contents + +code :: PandocMonad m => VwParser m Inlines +code = try $ do + char '`' + contents <- many1Till (noneOf "\n") (char '`') + return $ B.code contents + +superscript :: PandocMonad m => VwParser m Inlines +superscript = try $ + B.superscript . mconcat <$> (char '^' >> many1Till inline' (char '^')) + +subscript :: PandocMonad m => VwParser m Inlines +subscript = try $ + B.subscript . mconcat <$> (string ",," + >> many1Till inline' (try $ string ",,")) + +link :: PandocMonad m => VwParser m Inlines +link = try $ do + string "[[" + contents <- lookAhead $ manyTill anyChar (string "]]") + case '|' `elem` contents of + False -> do + manyTill anyChar (string "]]") +-- not using try here because [[hell]o]] is not rendered as a link in vimwiki + return $ B.link (procLink contents) "" (B.str contents) + True -> do + url <- manyTill anyChar $ char '|' + lab <- mconcat <$> manyTill inline (string "]]") + return $ B.link (procLink url) "" lab + +image :: PandocMonad m => VwParser m Inlines +image = try $ do + string "{{" + contentText <- lookAhead $ manyTill (noneOf "\n") (try $ string "}}") + images $ length $ filter (== '|') contentText + +images :: PandocMonad m => Int -> VwParser m Inlines +images k + | k == 0 = do + imgurl <- manyTill anyChar (try $ string "}}") + return $ B.image (procImgurl imgurl) "" (B.str "") + | k == 1 = do + imgurl <- manyTill anyChar (char '|') + alt <- mconcat <$> manyTill inline (try $ string "}}") + return $ B.image (procImgurl imgurl) "" alt + | k == 2 = do + imgurl <- manyTill anyChar (char '|') + alt <- mconcat <$>manyTill inline (char '|') + attrText <- manyTill anyChar (try $ string "}}") + return $ B.imageWith (makeAttr attrText) (procImgurl imgurl) "" alt + | otherwise = do + imgurl <- manyTill anyChar (char '|') + alt <- mconcat <$>manyTill inline (char '|') + attrText <- manyTill anyChar (char '|') + manyTill anyChar (try $ string "}}") + return $ B.imageWith (makeAttr attrText) (procImgurl imgurl) "" alt + +procLink' :: String -> String +procLink' s + | take 6 s == "local:" = "file" ++ drop 5 s + | take 6 s == "diary:" = "diary/" ++ drop 6 s ++ ".html" + | or ((`isPrefixOf` s) <$> [ "http:", "https:", "ftp:", "file:", "mailto:", + "news:", "telnet:" ]) + = s + | s == "" = "" + | last s == '/' = s + | otherwise = s ++ ".html" + +procLink :: String -> String +procLink s = procLink' x ++ y + where (x, y) = break (=='#') s + +procImgurl :: String -> String +procImgurl s = if take 6 s == "local:" then "file" ++ drop 5 s else s + +inlineMath :: PandocMonad m => VwParser m Inlines +inlineMath = try $ do + char '$' + contents <- many1Till (noneOf "\n") (char '$') + return $ B.math contents + +tag :: PandocMonad m => VwParser m Inlines +tag = try $ do + char ':' + s <- manyTill (noneOf spaceChars) (try (char ':' >> lookAhead space)) + guard $ not $ "::" `isInfixOf` (":" ++ s ++ ":") + let ss = splitBy (==':') s + return $ mconcat $ makeTagSpan' (head ss):(makeTagSpan <$> tail ss) + +todoMark :: PandocMonad m => VwParser m Inlines +todoMark = try $ do + string "TODO:" + return $ B.spanWith ("", ["todo"], []) (B.str "TODO:") + +-- helper functions and parsers +endlineP :: PandocMonad m => VwParser m () +endlineP = () <$ try (newline <* nFBTTBSB <* notFollowedBy blockQuote) + +endlineBQ :: PandocMonad m => VwParser m () +endlineBQ = () <$ try (newline <* nFBTTBSB <* string " ") + +endlineML :: PandocMonad m => VwParser m () +endlineML = () <$ try (newline <* nFBTTBSB <* many1 spaceChar) + +--- nFBTTBSB is short for notFollowedByThingsThatBreakSoftBreaks +nFBTTBSB :: PandocMonad m => VwParser m () +nFBTTBSB = + notFollowedBy newline <* + notFollowedBy hrule <* + notFollowedBy tableRow <* + notFollowedBy header <* + notFollowedBy listStart <* + notFollowedBy preformatted <* + notFollowedBy displayMath <* + notFollowedBy hasDefMarker + +hasDefMarker :: PandocMonad m => VwParser m () +hasDefMarker = () <$ manyTill (noneOf "\n") (string "::" >> oneOf spaceChars) + +makeTagSpan' :: String -> Inlines +makeTagSpan' s = B.spanWith ('-' : s, [], []) (B.str "") <> + B.spanWith (s, ["tag"], []) (B.str s) + +makeTagSpan :: String -> Inlines +makeTagSpan s = B.space <> makeTagSpan' s + +mathTagParser :: PandocMonad m => VwParser m String +mathTagParser = do + s <- try $ lookAhead (char '%' >> manyTill (noneOf spaceChars) + (try $ char '%' >> many (noneOf $ '%':spaceChars) >> space)) + char '%' >> string s >> char '%' + return $ mathTagLaTeX s |