diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 631 |
1 files changed, 380 insertions, 251 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 595c805bf..e58b0a905 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -1,7 +1,8 @@ -{-# LANGUAGE PatternGuards, OverloadedStrings, CPP #-} - +{-# LANGUAGE CPP #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternGuards #-} {- -Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> +Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Docx - Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal License : GNU GPL, version 2 or above Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> @@ -50,12 +51,13 @@ implemented, [-] means partially implemented): * Inlines - [X] Str - - [X] Emph (italics and underline both read as Emph) + - [X] Emph - [X] Strong - [X] Strikeout - [X] Superscript - [X] Subscript - [X] SmallCaps + - [-] Underline (was previously converted to Emph) - [ ] Quoted - [ ] Cite - [X] Code (styled with `VerbatimChar`) @@ -64,88 +66,91 @@ implemented, [-] means partially implemented): - [X] Math - [X] Link (links to an arbitrary bookmark create a span with the target as id and "anchor" class) - - [X] Image + - [X] Image - [X] Note (Footnotes and Endnotes are silently combined.) -} module Text.Pandoc.Readers.Docx - ( readDocxWithWarnings - , readDocx + ( readDocx ) where import Codec.Archive.Zip -import Text.Pandoc.Definition -import Text.Pandoc.Options +import Control.Monad.Reader +import Control.Monad.State.Strict +import qualified Data.ByteString.Lazy as B +import Data.Default (Default) +import Data.List (delete, intersect) +import qualified Data.Map as M +import Data.Maybe (isJust, fromMaybe) +import Data.Sequence (ViewL (..), viewl) +import qualified Data.Sequence as Seq +import qualified Data.Set as Set import Text.Pandoc.Builder -import Text.Pandoc.Walk -import Text.Pandoc.Readers.Docx.Parse -import Text.Pandoc.Readers.Docx.Lists +-- import Text.Pandoc.Definition +import Text.Pandoc.MediaBag (MediaBag) +import Text.Pandoc.Options import Text.Pandoc.Readers.Docx.Combine +import Text.Pandoc.Readers.Docx.Lists +import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Shared -import Text.Pandoc.MediaBag (insertMedia, MediaBag) -import Data.List (delete, intersect) +import Text.Pandoc.Walk import Text.TeXMath (writeTeX) -import Data.Default (Default) -import qualified Data.ByteString.Lazy as B -import qualified Data.Map as M -import qualified Data.Set as Set -import Control.Monad.Reader -import Control.Monad.State -import Data.Sequence (ViewL(..), viewl) -import qualified Data.Sequence as Seq (null) #if !(MIN_VERSION_base(4,8,0)) import Data.Traversable (traverse) #endif - +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P import Text.Pandoc.Error -import Control.Monad.Except +import Text.Pandoc.Logging -readDocxWithWarnings :: ReaderOptions - -> B.ByteString - -> Either PandocError (Pandoc, MediaBag, [String]) -readDocxWithWarnings opts bytes +readDocx :: PandocMonad m + => ReaderOptions + -> B.ByteString + -> m Pandoc +readDocx opts bytes | Right archive <- toArchiveOrFail bytes , Right (docx, parserWarnings) <- archiveToDocxWithWarnings archive = do - (meta, blks, mediaBag, warnings) <- docxToOutput opts docx - return (Pandoc meta blks, mediaBag, parserWarnings ++ warnings) -readDocxWithWarnings _ _ = - Left (ParseFailure "couldn't parse docx file") - -readDocx :: ReaderOptions - -> B.ByteString - -> Either PandocError (Pandoc, MediaBag) -readDocx opts bytes = do - (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes - return (pandoc, mediaBag) + mapM_ (P.report . DocxParserWarning) parserWarnings + (meta, blks) <- docxToOutput opts docx + return $ Pandoc meta blks +readDocx _ _ = + throwError $ PandocSomeError "couldn't parse docx file" data DState = DState { docxAnchorMap :: M.Map String String - , docxMediaBag :: MediaBag - , docxDropCap :: Inlines - , docxWarnings :: [String] + , docxAnchorSet :: Set.Set String + , docxImmedPrevAnchor :: Maybe String + , docxMediaBag :: MediaBag + , docxDropCap :: Inlines + , docxWarnings :: [String] + -- keep track of (numId, lvl) values for + -- restarting + , docxListState :: M.Map (String, String) Integer + , docxPrevPara :: Inlines } instance Default DState where def = DState { docxAnchorMap = M.empty + , docxAnchorSet = mempty + , docxImmedPrevAnchor = Nothing , docxMediaBag = mempty , docxDropCap = mempty , docxWarnings = [] + , docxListState = M.empty + , docxPrevPara = mempty } -data DEnv = DEnv { docxOptions :: ReaderOptions - , docxInHeaderBlock :: Bool } +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxInHeaderBlock :: Bool + } instance Default DEnv where def = DEnv def False -type DocxContext = ExceptT PandocError (ReaderT DEnv (State DState)) - -evalDocxContext :: DocxContext a -> DEnv -> DState -> Either PandocError a -evalDocxContext ctx env st = flip evalState st . flip runReaderT env . runExceptT $ ctx +type DocxContext m = ReaderT DEnv (StateT DState m) -addDocxWarning :: String -> DocxContext () -addDocxWarning msg = do - warnings <- gets docxWarnings - modify $ \s -> s {docxWarnings = msg : warnings} +evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a +evalDocxContext ctx env st = flip evalStateT st $flip runReaderT env ctx -- This is empty, but we put it in for future-proofing. spansToKeep :: [String] @@ -162,7 +167,7 @@ metaStyles = M.fromList [ ("Title", "title") , ("Abstract", "abstract")] sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) -sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) +sepBodyParts = span (\bp -> isMetaPar bp || isEmptyPar bp) isMetaPar :: BodyPart -> Bool isMetaPar (Paragraph pPr _) = @@ -174,28 +179,28 @@ isEmptyPar (Paragraph _ parParts) = all isEmptyParPart parParts where isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems - isEmptyParPart _ = False + isEmptyParPart _ = False isEmptyElem (TextRun s) = trim s == "" isEmptyElem _ = True isEmptyPar _ = False -bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue) +bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String MetaValue) bodyPartsToMeta' [] = return M.empty bodyPartsToMeta' (bp : bps) | (Paragraph pPr parParts) <- bp - , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) + , (c : _)<- (pStyle pPr) `intersect` (M.keys metaStyles) , (Just metaField) <- M.lookup c metaStyles = do inlines <- smushInlines <$> mapM parPartToInlines parParts remaining <- bodyPartsToMeta' bps let f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] - f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) + f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks (Para ils : blks) f m (MetaList mv) = MetaList (m : mv) f m n = MetaList [m, n] return $ M.insertWith f metaField (MetaInlines (toList inlines)) remaining bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps -bodyPartsToMeta :: [BodyPart] -> DocxContext Meta +bodyPartsToMeta :: PandocMonad m => [BodyPart] -> DocxContext m Meta bodyPartsToMeta bps = do mp <- bodyPartsToMeta' bps let mp' = @@ -208,7 +213,7 @@ fixAuthors :: MetaValue -> MetaValue fixAuthors (MetaBlocks blks) = MetaList $ map g $ filter f blks where f (Para _) = True - f _ = False + f _ = False g (Para ils) = MetaInlines ils g _ = MetaInlines [] fixAuthors mv = mv @@ -220,106 +225,122 @@ codeDivs :: [String] codeDivs = ["SourceCode"] runElemToInlines :: RunElem -> Inlines -runElemToInlines (TextRun s) = text s -runElemToInlines (LnBrk) = linebreak -runElemToInlines (Tab) = space -runElemToInlines (SoftHyphen) = text "\xad" -runElemToInlines (NoBreakHyphen) = text "\x2011" +runElemToInlines (TextRun s) = text s +runElemToInlines LnBrk = linebreak +runElemToInlines Tab = space +runElemToInlines SoftHyphen = text "\xad" +runElemToInlines NoBreakHyphen = text "\x2011" runElemToString :: RunElem -> String -runElemToString (TextRun s) = s -runElemToString (LnBrk) = ['\n'] -runElemToString (Tab) = ['\t'] -runElemToString (SoftHyphen) = ['\xad'] -runElemToString (NoBreakHyphen) = ['\x2011'] +runElemToString (TextRun s) = s +runElemToString LnBrk = ['\n'] +runElemToString Tab = ['\t'] +runElemToString SoftHyphen = ['\xad'] +runElemToString NoBreakHyphen = ['\x2011'] runToString :: Run -> String runToString (Run _ runElems) = concatMap runElemToString runElems -runToString _ = "" +runToString _ = "" parPartToString :: ParPart -> String -parPartToString (PlainRun run) = runToString run +parPartToString (PlainRun run) = runToString run parPartToString (InternalHyperLink _ runs) = concatMap runToString runs parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs -parPartToString _ = "" +parPartToString _ = "" blacklistedCharStyles :: [String] blacklistedCharStyles = ["Hyperlink"] -resolveDependentRunStyle :: RunStyle -> RunStyle +resolveDependentRunStyle :: PandocMonad m => RunStyle -> DocxContext m RunStyle resolveDependentRunStyle rPr | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles = - rPr - | Just (_, cs) <- rStyle rPr = - let rPr' = resolveDependentRunStyle cs - in - RunStyle { isBold = case isBold rPr of - Just bool -> Just bool - Nothing -> isBold rPr' - , isItalic = case isItalic rPr of - Just bool -> Just bool - Nothing -> isItalic rPr' - , isSmallCaps = case isSmallCaps rPr of - Just bool -> Just bool - Nothing -> isSmallCaps rPr' - , isStrike = case isStrike rPr of - Just bool -> Just bool - Nothing -> isStrike rPr' - , rVertAlign = case rVertAlign rPr of - Just valign -> Just valign - Nothing -> rVertAlign rPr' - , rUnderline = case rUnderline rPr of - Just ulstyle -> Just ulstyle - Nothing -> rUnderline rPr' - , rStyle = rStyle rPr } - | otherwise = rPr - -runStyleToTransform :: RunStyle -> (Inlines -> Inlines) + return rPr + | Just (_, cs) <- rStyle rPr = do + opts <- asks docxOptions + if isEnabled Ext_styles opts + then return rPr + else do rPr' <- resolveDependentRunStyle cs + return $ + RunStyle { isBold = case isBold rPr of + Just bool -> Just bool + Nothing -> isBold rPr' + , isItalic = case isItalic rPr of + Just bool -> Just bool + Nothing -> isItalic rPr' + , isSmallCaps = case isSmallCaps rPr of + Just bool -> Just bool + Nothing -> isSmallCaps rPr' + , isStrike = case isStrike rPr of + Just bool -> Just bool + Nothing -> isStrike rPr' + , rVertAlign = case rVertAlign rPr of + Just valign -> Just valign + Nothing -> rVertAlign rPr' + , rUnderline = case rUnderline rPr of + Just ulstyle -> Just ulstyle + Nothing -> rUnderline rPr' + , rStyle = rStyle rPr } + | otherwise = return rPr + +runStyleToTransform :: PandocMonad m => RunStyle -> DocxContext m (Inlines -> Inlines) runStyleToTransform rPr | Just (s, _) <- rStyle rPr - , s `elem` spansToKeep = - let rPr' = rPr{rStyle = Nothing} - in - (spanWith ("", [s], [])) . (runStyleToTransform rPr') - | Just True <- isItalic rPr = - emph . (runStyleToTransform rPr {isItalic = Nothing}) - | Just True <- isBold rPr = - strong . (runStyleToTransform rPr {isBold = Nothing}) - | Just True <- isSmallCaps rPr = - smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing}) - | Just True <- isStrike rPr = - strikeout . (runStyleToTransform rPr {isStrike = Nothing}) - | Just SupScrpt <- rVertAlign rPr = - superscript . (runStyleToTransform rPr {rVertAlign = Nothing}) - | Just SubScrpt <- rVertAlign rPr = - subscript . (runStyleToTransform rPr {rVertAlign = Nothing}) - | Just "single" <- rUnderline rPr = - emph . (runStyleToTransform rPr {rUnderline = Nothing}) - | otherwise = id - -runToInlines :: Run -> DocxContext Inlines + , s `elem` spansToKeep = do + transform <- runStyleToTransform rPr{rStyle = Nothing} + return $ spanWith ("", [s], []) . transform + | Just (s, _) <- rStyle rPr = do + opts <- asks docxOptions + let extraInfo = if isEnabled Ext_styles opts + then spanWith ("", [], [("custom-style", s)]) + else id + transform <- runStyleToTransform rPr{rStyle = Nothing} + return $ extraInfo . transform + | Just True <- isItalic rPr = do + transform <- runStyleToTransform rPr{isItalic = Nothing} + return $ emph . transform + | Just True <- isBold rPr = do + transform <- runStyleToTransform rPr{isBold = Nothing} + return $ strong . transform + | Just True <- isSmallCaps rPr = do + transform <- runStyleToTransform rPr{isSmallCaps = Nothing} + return $ smallcaps . transform + | Just True <- isStrike rPr = do + transform <- runStyleToTransform rPr{isStrike = Nothing} + return $ strikeout . transform + | Just SupScrpt <- rVertAlign rPr = do + transform <- runStyleToTransform rPr{rVertAlign = Nothing} + return $ superscript . transform + | Just SubScrpt <- rVertAlign rPr = do + transform <- runStyleToTransform rPr{rVertAlign = Nothing} + return $ subscript . transform + | Just "single" <- rUnderline rPr = do + transform <- runStyleToTransform rPr{rUnderline = Nothing} + return $ underlineSpan . transform + | otherwise = return id + +runToInlines :: PandocMonad m => Run -> DocxContext m Inlines runToInlines (Run rs runElems) | Just (s, _) <- rStyle rs - , s `elem` codeStyles = - let rPr = resolveDependentRunStyle rs - codeString = code $ concatMap runElemToString runElems - in - return $ case rVertAlign rPr of - Just SupScrpt -> superscript codeString - Just SubScrpt -> subscript codeString - _ -> codeString + , s `elem` codeStyles = do + rPr <- resolveDependentRunStyle rs + let codeString = code $ concatMap runElemToString runElems + return $ case rVertAlign rPr of + Just SupScrpt -> superscript codeString + Just SubScrpt -> subscript codeString + _ -> codeString | otherwise = do - let ils = smushInlines (map runElemToInlines runElems) - return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils + rPr <- resolveDependentRunStyle rs + let ils = smushInlines (map runElemToInlines runElems) + transform <- runStyleToTransform rPr + return $ transform ils runToInlines (Footnote bps) = do - blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> mapM bodyPartToBlocks bps return $ note blksList runToInlines (Endnote bps) = do - blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> mapM bodyPartToBlocks bps return $ note blksList runToInlines (InlineDrawing fp title alt bs ext) = do - mediaBag <- gets docxMediaBag - modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + (lift . lift) $ P.insertMedia fp Nothing bs return $ imageWith (extentToAttr ext) fp title $ text alt runToInlines InlineChart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" @@ -330,20 +351,39 @@ extentToAttr (Just (w, h)) = showDim d = show (d / 914400) ++ "in" extentToAttr _ = nullAttr -blocksToInlinesWarn :: String -> Blocks -> DocxContext Inlines +blocksToInlinesWarn :: PandocMonad m => String -> Blocks -> DocxContext m Inlines blocksToInlinesWarn cmtId blks = do let blkList = toList blks notParaOrPlain :: Block -> Bool - notParaOrPlain (Para _) = False + notParaOrPlain (Para _) = False notParaOrPlain (Plain _) = False - notParaOrPlain _ = True - when (not $ null $ filter notParaOrPlain blkList) - (addDocxWarning $ "Docx comment " ++ cmtId ++ " will not retain formatting") - return $ fromList $ blocksToInlines blkList - -parPartToInlines :: ParPart -> DocxContext Inlines -parPartToInlines (PlainRun r) = runToInlines r -parPartToInlines (Insertion _ author date runs) = do + notParaOrPlain _ = True + unless ( not (any notParaOrPlain blkList)) $ + lift $ P.report $ DocxParserWarning $ + "Docx comment " ++ cmtId ++ " will not retain formatting" + return $ blocksToInlines' blkList + +-- The majority of work in this function is done in the primed +-- subfunction `partPartToInlines'`. We make this wrapper so that we +-- don't have to modify `docxImmedPrevAnchor` state after every function. +parPartToInlines :: PandocMonad m => ParPart -> DocxContext m Inlines +parPartToInlines parPart = + case parPart of + (BookMark _ anchor) | notElem anchor dummyAnchors -> do + inHdrBool <- asks docxInHeaderBlock + ils <- parPartToInlines' parPart + immedPrevAnchor <- gets docxImmedPrevAnchor + unless (isJust immedPrevAnchor || inHdrBool) + (modify $ \s -> s{ docxImmedPrevAnchor = Just anchor}) + return ils + _ -> do + ils <- parPartToInlines' parPart + modify $ \s -> s{ docxImmedPrevAnchor = Nothing} + return ils + +parPartToInlines' :: PandocMonad m => ParPart -> DocxContext m Inlines +parPartToInlines' (PlainRun r) = runToInlines r +parPartToInlines' (ChangedRuns (TrackedChange Insertion (ChangeInfo _ author date)) runs) = do opts <- asks docxOptions case readerTrackChanges opts of AcceptChanges -> smushInlines <$> mapM runToInlines runs @@ -352,7 +392,7 @@ parPartToInlines (Insertion _ author date runs) = do ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["insertion"], [("author", author), ("date", date)]) return $ spanWith attr ils -parPartToInlines (Deletion _ author date runs) = do +parPartToInlines' (ChangedRuns (TrackedChange Deletion (ChangeInfo _ author date)) runs) = do opts <- asks docxOptions case readerTrackChanges opts of AcceptChanges -> return mempty @@ -361,7 +401,7 @@ parPartToInlines (Deletion _ author date runs) = do ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["deletion"], [("author", author), ("date", date)]) return $ spanWith attr ils -parPartToInlines (CommentStart cmtId author date bodyParts) = do +parPartToInlines' (CommentStart cmtId author date bodyParts) = do opts <- asks docxOptions case readerTrackChanges opts of AllChanges -> do @@ -370,16 +410,16 @@ parPartToInlines (CommentStart cmtId author date bodyParts) = do let attr = ("", ["comment-start"], [("id", cmtId), ("author", author), ("date", date)]) return $ spanWith attr ils _ -> return mempty -parPartToInlines (CommentEnd cmtId) = do +parPartToInlines' (CommentEnd cmtId) = do opts <- asks docxOptions case readerTrackChanges opts of AllChanges -> do let attr = ("", ["comment-end"], [("id", cmtId)]) return $ spanWith attr mempty _ -> return mempty -parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = +parPartToInlines' (BookMark _ anchor) | anchor `elem` dummyAnchors = return mempty -parPartToInlines (BookMark _ anchor) = +parPartToInlines' (BookMark _ anchor) = -- We record these, so we can make sure not to overwrite -- user-defined anchor links with header auto ids. do @@ -395,27 +435,40 @@ parPartToInlines (BookMark _ anchor) = -- of rewriting user-defined anchor links. However, since these -- are not defined in pandoc, it seems like a necessary evil to -- avoid an extra pass. - let newAnchor = - if not inHdrBool && anchor `elem` (M.elems anchorMap) - then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) - else anchor - unless inHdrBool - (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) - return $ spanWith (newAnchor, ["anchor"], []) mempty -parPartToInlines (Drawing fp title alt bs ext) = do - mediaBag <- gets docxMediaBag - modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + immedPrevAnchor <- gets docxImmedPrevAnchor + case immedPrevAnchor of + Just prevAnchor -> do + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor prevAnchor anchorMap}) + return mempty + Nothing -> do + let newAnchor = + if not inHdrBool && anchor `elem` M.elems anchorMap + then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) + else anchor + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) + return $ spanWith (newAnchor, ["anchor"], []) mempty +parPartToInlines' (Drawing fp title alt bs ext) = do + (lift . lift) $ P.insertMedia fp Nothing bs return $ imageWith (extentToAttr ext) fp title $ text alt -parPartToInlines Chart = do +parPartToInlines' Chart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" -parPartToInlines (InternalHyperLink anchor runs) = do +parPartToInlines' (InternalHyperLink anchor runs) = do ils <- smushInlines <$> mapM runToInlines runs return $ link ('#' : anchor) "" ils -parPartToInlines (ExternalHyperLink target runs) = do +parPartToInlines' (ExternalHyperLink target runs) = do ils <- smushInlines <$> mapM runToInlines runs return $ link target "" ils -parPartToInlines (PlainOMath exps) = do +parPartToInlines' (PlainOMath exps) = return $ math $ writeTeX exps +parPartToInlines' (SmartTag runs) = + smushInlines <$> mapM runToInlines runs +parPartToInlines' (Field info runs) = + case info of + HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs + UnknownField -> smushInlines <$> mapM runToInlines runs +parPartToInlines' NullParPart = return mempty isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (_, classes, kvs) _) = @@ -426,10 +479,10 @@ isAnchorSpan _ = False dummyAnchors :: [String] dummyAnchors = ["_GoBack"] -makeHeaderAnchor :: Blocks -> DocxContext Blocks +makeHeaderAnchor :: PandocMonad m => Blocks -> DocxContext m Blocks makeHeaderAnchor bs = traverse makeHeaderAnchor' bs -makeHeaderAnchor' :: Block -> DocxContext Block +makeHeaderAnchor' :: PandocMonad m => Block -> DocxContext m Block -- If there is an anchor already there (an anchor span in the header, -- to be exact), we rename and associate the new id with the old one. makeHeaderAnchor' (Header n (ident, classes, kvs) ils) @@ -458,126 +511,184 @@ makeHeaderAnchor' blk = return blk -- Rewrite a standalone paragraph block as a plain singleParaToPlain :: Blocks -> Blocks singleParaToPlain blks - | (Para (ils) :< seeq) <- viewl $ unMany blks + | (Para ils :< seeq) <- viewl $ unMany blks , Seq.null seeq = singleton $ Plain ils singleParaToPlain blks = blks -cellToBlocks :: Cell -> DocxContext Blocks +cellToBlocks :: PandocMonad m => Cell -> DocxContext m Blocks cellToBlocks (Cell bps) = do blks <- smushBlocks <$> mapM bodyPartToBlocks bps return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks -rowToBlocksList :: Row -> DocxContext [Blocks] +rowToBlocksList :: PandocMonad m => Row -> DocxContext m [Blocks] rowToBlocksList (Row cells) = do blksList <- mapM cellToBlocks cells return $ map singleParaToPlain blksList -trimLineBreaks :: [Inline] -> [Inline] -trimLineBreaks [] = [] -trimLineBreaks (LineBreak : ils) = trimLineBreaks ils -trimLineBreaks ils - | (LineBreak : ils') <- reverse ils = trimLineBreaks (reverse ils') -trimLineBreaks ils = ils +-- like trimInlines, but also take out linebreaks +trimSps :: Inlines -> Inlines +trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils + where isSp Space = True + isSp SoftBreak = True + isSp LineBreak = True + isSp _ = False -parStyleToTransform :: ParagraphStyle -> (Blocks -> Blocks) +parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) parStyleToTransform pPr | (c:cs) <- pStyle pPr - , c `elem` divsToKeep = - let pPr' = pPr { pStyle = cs } - in - (divWith ("", [c], [])) . (parStyleToTransform pPr') + , c `elem` divsToKeep = do + let pPr' = pPr { pStyle = cs } + transform <- parStyleToTransform pPr' + return $ divWith ("", [c], []) . transform | (c:cs) <- pStyle pPr, - c `elem` listParagraphDivs = + c `elem` listParagraphDivs = do let pPr' = pPr { pStyle = cs, indentation = Nothing} - in - (divWith ("", [c], [])) . (parStyleToTransform pPr') - | (_:cs) <- pStyle pPr - , Just True <- pBlockQuote pPr = - let pPr' = pPr { pStyle = cs } - in - blockQuote . (parStyleToTransform pPr') - | (_:cs) <- pStyle pPr = + transform <- parStyleToTransform pPr' + return $ divWith ("", [c], []) . transform + | (c:cs) <- pStyle pPr + , Just True <- pBlockQuote pPr = do + opts <- asks docxOptions + let pPr' = pPr { pStyle = cs } + transform <- parStyleToTransform pPr' + let extraInfo = if isEnabled Ext_styles opts + then divWith ("", [], [("custom-style", c)]) + else id + return $ extraInfo . blockQuote . transform + | (c:cs) <- pStyle pPr = do + opts <- asks docxOptions let pPr' = pPr { pStyle = cs} - in - parStyleToTransform pPr' + transform <- parStyleToTransform pPr' + let extraInfo = if isEnabled Ext_styles opts + then divWith ("", [], [("custom-style", c)]) + else id + return $ extraInfo . transform | null (pStyle pPr) , Just left <- indentation pPr >>= leftParIndent - , Just hang <- indentation pPr >>= hangingParIndent = + , Just hang <- indentation pPr >>= hangingParIndent = do let pPr' = pPr { indentation = Nothing } - in - case (left - hang) > 0 of - True -> blockQuote . (parStyleToTransform pPr') - False -> parStyleToTransform pPr' + transform <- parStyleToTransform pPr' + return $ case (left - hang) > 0 of + True -> blockQuote . transform + False -> transform | null (pStyle pPr), - Just left <- indentation pPr >>= leftParIndent = + Just left <- indentation pPr >>= leftParIndent = do let pPr' = pPr { indentation = Nothing } - in - case left > 0 of - True -> blockQuote . (parStyleToTransform pPr') - False -> parStyleToTransform pPr' -parStyleToTransform _ = id + transform <- parStyleToTransform pPr' + return $ case left > 0 of + True -> blockQuote . transform + False -> transform +parStyleToTransform _ = return id -bodyPartToBlocks :: BodyPart -> DocxContext Blocks +bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) - | not $ null $ codeDivs `intersect` (pStyle pPr) = - return - $ parStyleToTransform pPr - $ codeBlock - $ concatMap parPartToString parparts + | not $ null $ codeDivs `intersect` (pStyle pPr) = do + transform <- parStyleToTransform pPr + return $ + transform $ + codeBlock $ + concatMap parPartToString parparts | Just (style, n) <- pHeading pPr = do - ils <- local (\s-> s{docxInHeaderBlock=True}) $ + ils <-local (\s-> s{docxInHeaderBlock=True}) (smushInlines <$> mapM parPartToInlines parparts) makeHeaderAnchor $ headerWith ("", delete style (pStyle pPr), []) n ils | otherwise = do - ils <- smushInlines <$> mapM parPartToInlines parparts >>= - (return . fromList . trimLineBreaks . normalizeSpaces . toList) + ils <- (trimSps . smushInlines) <$> mapM parPartToInlines parparts + prevParaIls <- gets docxPrevPara dropIls <- gets docxDropCap let ils' = dropIls <> ils if dropCap pPr then do modify $ \s -> s { docxDropCap = ils' } return mempty else do modify $ \s -> s { docxDropCap = mempty } - return $ case isNull ils' of - True -> mempty - _ -> parStyleToTransform pPr $ para ils' + let ils'' = prevParaIls <> + (if isNull prevParaIls then mempty else space) <> + ils' + opts <- asks docxOptions + case () of + + _ | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) -> + return mempty + _ | Just (TrackedChange Insertion _) <- pChange pPr + , AcceptChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = mempty} + transform <- parStyleToTransform pPr + return $ transform $ para ils'' + _ | Just (TrackedChange Insertion _) <- pChange pPr + , RejectChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = ils''} + return mempty + _ | Just (TrackedChange Insertion cInfo) <- pChange pPr + , AllChanges <- readerTrackChanges opts + , ChangeInfo _ cAuthor cDate <- cInfo -> do + let attr = ("", ["paragraph-insertion"], [("author", cAuthor), ("date", cDate)]) + insertMark = spanWith attr mempty + transform <- parStyleToTransform pPr + return $ transform $ + para $ ils'' <> insertMark + _ | Just (TrackedChange Deletion _) <- pChange pPr + , AcceptChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = ils''} + return mempty + _ | Just (TrackedChange Deletion _) <- pChange pPr + , RejectChanges <- readerTrackChanges opts -> do + modify $ \s -> s {docxPrevPara = mempty} + transform <- parStyleToTransform pPr + return $ transform $ para ils'' + _ | Just (TrackedChange Deletion cInfo) <- pChange pPr + , AllChanges <- readerTrackChanges opts + , ChangeInfo _ cAuthor cDate <- cInfo -> do + let attr = ("", ["paragraph-deletion"], [("author", cAuthor), ("date", cDate)]) + insertMark = spanWith attr mempty + transform <- parStyleToTransform pPr + return $ transform $ + para $ ils'' <> insertMark + _ | otherwise -> do + modify $ \s -> s {docxPrevPara = mempty} + transform <- parStyleToTransform pPr + return $ transform $ para ils'' bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do - let - kvs = case levelInfo of - (_, fmt, txt, Just start) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - , ("start", (show start)) - ] - - (_, fmt, txt, Nothing) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - ] + -- We check whether this current numId has previously been used, + -- since Docx expects us to pick up where we left off. + listState <- gets docxListState + let startFromState = M.lookup (numId, lvl) listState + (_, fmt,txt, startFromLevelInfo) = levelInfo + start = case startFromState of + Just n -> n + 1 + Nothing -> fromMaybe 1 startFromLevelInfo + kvs = [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", show start) + ] + modify $ \st -> st{ docxListState = M.insert (numId, lvl) start listState} blks <- bodyPartToBlocks (Paragraph pPr parparts) return $ divWith ("", ["list-item"], kvs) blks -bodyPartToBlocks (ListItem pPr _ _ _ parparts) = - let pPr' = pPr {pStyle = "ListParagraph": (pStyle pPr)} +bodyPartToBlocks (ListItem pPr _ _ _ parparts) = + let pPr' = pPr {pStyle = "ListParagraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap _ look (r:rs)) = do +bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do let caption = text cap (hdr, rows) = case firstRowFormatting look of True | null rs -> (Nothing, [r]) | otherwise -> (Just r, rs) False -> (Nothing, r:rs) - cells <- mapM rowToBlocksList rows + cells <- mapM rowToBlocksList rows - let width = case cells of - r':_ -> length r' - -- shouldn't happen - [] -> 0 + let width = maybe 0 maximum $ nonEmpty $ map rowLength parts + -- Data.List.NonEmpty is not available with ghc 7.10 so we roll out + -- our own, see + -- https://github.com/jgm/pandoc/pull/4361#issuecomment-365416155 + nonEmpty [] = Nothing + nonEmpty l = Just l + rowLength :: Row -> Int + rowLength (Row c) = length c hdrCells <- case hdr of Just r' -> rowToBlocksList r' @@ -592,36 +703,54 @@ bodyPartToBlocks (Tbl cap _ look (r:rs)) = do widths = replicate width 0 :: [Double] return $ table caption (zip alignments widths) hdrCells cells -bodyPartToBlocks (OMathPara e) = do +bodyPartToBlocks (OMathPara e) = return $ para $ displayMath (writeTeX e) -- replace targets with generated anchors. -rewriteLink' :: Inline -> DocxContext Inline +rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline rewriteLink' l@(Link attr ils ('#':target, title)) = do anchorMap <- gets docxAnchorMap - return $ case M.lookup target anchorMap of - Just newTarget -> (Link attr ils ('#':newTarget, title)) - Nothing -> l + case M.lookup target anchorMap of + Just newTarget -> do + modify $ \s -> s{docxAnchorSet = Set.insert newTarget (docxAnchorSet s)} + return $ Link attr ils ('#':newTarget, title) + Nothing -> do + modify $ \s -> s{docxAnchorSet = Set.insert target (docxAnchorSet s)} + return l rewriteLink' il = return il -rewriteLinks :: [Block] -> DocxContext [Block] +rewriteLinks :: PandocMonad m => [Block] -> DocxContext m [Block] rewriteLinks = mapM (walkM rewriteLink') -bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag, [String]) +removeOrphanAnchors'' :: PandocMonad m => Inline -> DocxContext m [Inline] +removeOrphanAnchors'' s@(Span (ident, classes, _) ils) + | "anchor" `elem` classes = do + anchorSet <- gets docxAnchorSet + return $ if ident `Set.member` anchorSet + then [s] + else ils +removeOrphanAnchors'' il = return [il] + +removeOrphanAnchors' :: PandocMonad m => [Inline] -> DocxContext m [Inline] +removeOrphanAnchors' ils = liftM concat $ mapM removeOrphanAnchors'' ils + +removeOrphanAnchors :: PandocMonad m => [Block] -> DocxContext m [Block] +removeOrphanAnchors = mapM (walkM removeOrphanAnchors') + +bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks - mediaBag <- gets docxMediaBag - warnings <- gets docxWarnings - return $ (meta, - blks', - mediaBag, - warnings) - -docxToOutput :: ReaderOptions -> Docx -> Either PandocError (Meta, [Block], MediaBag, [String]) + blks'' <- removeOrphanAnchors blks' + return (meta, blks'') + +docxToOutput :: PandocMonad m + => ReaderOptions + -> Docx + -> m (Meta, [Block]) docxToOutput opts (Docx (Document _ body)) = let dEnv = def { docxOptions = opts} in evalDocxContext (bodyToOutput body) dEnv def |