summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Docx.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx.hs')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs631
1 files changed, 380 insertions, 251 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 595c805bf..e58b0a905 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -1,7 +1,8 @@
-{-# LANGUAGE PatternGuards, OverloadedStrings, CPP #-}
-
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE PatternGuards #-}
{-
-Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu>
+Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -20,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Docx
- Copyright : Copyright (C) 2014-2016 Jesse Rosenthal
+ Copyright : Copyright (C) 2014-2018 Jesse Rosenthal
License : GNU GPL, version 2 or above
Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
@@ -50,12 +51,13 @@ implemented, [-] means partially implemented):
* Inlines
- [X] Str
- - [X] Emph (italics and underline both read as Emph)
+ - [X] Emph
- [X] Strong
- [X] Strikeout
- [X] Superscript
- [X] Subscript
- [X] SmallCaps
+ - [-] Underline (was previously converted to Emph)
- [ ] Quoted
- [ ] Cite
- [X] Code (styled with `VerbatimChar`)
@@ -64,88 +66,91 @@ implemented, [-] means partially implemented):
- [X] Math
- [X] Link (links to an arbitrary bookmark create a span with the target as
id and "anchor" class)
- - [X] Image
+ - [X] Image
- [X] Note (Footnotes and Endnotes are silently combined.)
-}
module Text.Pandoc.Readers.Docx
- ( readDocxWithWarnings
- , readDocx
+ ( readDocx
) where
import Codec.Archive.Zip
-import Text.Pandoc.Definition
-import Text.Pandoc.Options
+import Control.Monad.Reader
+import Control.Monad.State.Strict
+import qualified Data.ByteString.Lazy as B
+import Data.Default (Default)
+import Data.List (delete, intersect)
+import qualified Data.Map as M
+import Data.Maybe (isJust, fromMaybe)
+import Data.Sequence (ViewL (..), viewl)
+import qualified Data.Sequence as Seq
+import qualified Data.Set as Set
import Text.Pandoc.Builder
-import Text.Pandoc.Walk
-import Text.Pandoc.Readers.Docx.Parse
-import Text.Pandoc.Readers.Docx.Lists
+-- import Text.Pandoc.Definition
+import Text.Pandoc.MediaBag (MediaBag)
+import Text.Pandoc.Options
import Text.Pandoc.Readers.Docx.Combine
+import Text.Pandoc.Readers.Docx.Lists
+import Text.Pandoc.Readers.Docx.Parse
import Text.Pandoc.Shared
-import Text.Pandoc.MediaBag (insertMedia, MediaBag)
-import Data.List (delete, intersect)
+import Text.Pandoc.Walk
import Text.TeXMath (writeTeX)
-import Data.Default (Default)
-import qualified Data.ByteString.Lazy as B
-import qualified Data.Map as M
-import qualified Data.Set as Set
-import Control.Monad.Reader
-import Control.Monad.State
-import Data.Sequence (ViewL(..), viewl)
-import qualified Data.Sequence as Seq (null)
#if !(MIN_VERSION_base(4,8,0))
import Data.Traversable (traverse)
#endif
-
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class (PandocMonad)
+import qualified Text.Pandoc.Class as P
import Text.Pandoc.Error
-import Control.Monad.Except
+import Text.Pandoc.Logging
-readDocxWithWarnings :: ReaderOptions
- -> B.ByteString
- -> Either PandocError (Pandoc, MediaBag, [String])
-readDocxWithWarnings opts bytes
+readDocx :: PandocMonad m
+ => ReaderOptions
+ -> B.ByteString
+ -> m Pandoc
+readDocx opts bytes
| Right archive <- toArchiveOrFail bytes
, Right (docx, parserWarnings) <- archiveToDocxWithWarnings archive = do
- (meta, blks, mediaBag, warnings) <- docxToOutput opts docx
- return (Pandoc meta blks, mediaBag, parserWarnings ++ warnings)
-readDocxWithWarnings _ _ =
- Left (ParseFailure "couldn't parse docx file")
-
-readDocx :: ReaderOptions
- -> B.ByteString
- -> Either PandocError (Pandoc, MediaBag)
-readDocx opts bytes = do
- (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes
- return (pandoc, mediaBag)
+ mapM_ (P.report . DocxParserWarning) parserWarnings
+ (meta, blks) <- docxToOutput opts docx
+ return $ Pandoc meta blks
+readDocx _ _ =
+ throwError $ PandocSomeError "couldn't parse docx file"
data DState = DState { docxAnchorMap :: M.Map String String
- , docxMediaBag :: MediaBag
- , docxDropCap :: Inlines
- , docxWarnings :: [String]
+ , docxAnchorSet :: Set.Set String
+ , docxImmedPrevAnchor :: Maybe String
+ , docxMediaBag :: MediaBag
+ , docxDropCap :: Inlines
+ , docxWarnings :: [String]
+ -- keep track of (numId, lvl) values for
+ -- restarting
+ , docxListState :: M.Map (String, String) Integer
+ , docxPrevPara :: Inlines
}
instance Default DState where
def = DState { docxAnchorMap = M.empty
+ , docxAnchorSet = mempty
+ , docxImmedPrevAnchor = Nothing
, docxMediaBag = mempty
, docxDropCap = mempty
, docxWarnings = []
+ , docxListState = M.empty
+ , docxPrevPara = mempty
}
-data DEnv = DEnv { docxOptions :: ReaderOptions
- , docxInHeaderBlock :: Bool }
+data DEnv = DEnv { docxOptions :: ReaderOptions
+ , docxInHeaderBlock :: Bool
+ }
instance Default DEnv where
def = DEnv def False
-type DocxContext = ExceptT PandocError (ReaderT DEnv (State DState))
-
-evalDocxContext :: DocxContext a -> DEnv -> DState -> Either PandocError a
-evalDocxContext ctx env st = flip evalState st . flip runReaderT env . runExceptT $ ctx
+type DocxContext m = ReaderT DEnv (StateT DState m)
-addDocxWarning :: String -> DocxContext ()
-addDocxWarning msg = do
- warnings <- gets docxWarnings
- modify $ \s -> s {docxWarnings = msg : warnings}
+evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a
+evalDocxContext ctx env st = flip evalStateT st $flip runReaderT env ctx
-- This is empty, but we put it in for future-proofing.
spansToKeep :: [String]
@@ -162,7 +167,7 @@ metaStyles = M.fromList [ ("Title", "title")
, ("Abstract", "abstract")]
sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
-sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp))
+sepBodyParts = span (\bp -> isMetaPar bp || isEmptyPar bp)
isMetaPar :: BodyPart -> Bool
isMetaPar (Paragraph pPr _) =
@@ -174,28 +179,28 @@ isEmptyPar (Paragraph _ parParts) =
all isEmptyParPart parParts
where
isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
- isEmptyParPart _ = False
+ isEmptyParPart _ = False
isEmptyElem (TextRun s) = trim s == ""
isEmptyElem _ = True
isEmptyPar _ = False
-bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue)
+bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String MetaValue)
bodyPartsToMeta' [] = return M.empty
bodyPartsToMeta' (bp : bps)
| (Paragraph pPr parParts) <- bp
- , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
+ , (c : _)<- (pStyle pPr) `intersect` (M.keys metaStyles)
, (Just metaField) <- M.lookup c metaStyles = do
inlines <- smushInlines <$> mapM parPartToInlines parParts
remaining <- bodyPartsToMeta' bps
let
f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
- f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks)
+ f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks (Para ils : blks)
f m (MetaList mv) = MetaList (m : mv)
f m n = MetaList [m, n]
return $ M.insertWith f metaField (MetaInlines (toList inlines)) remaining
bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps
-bodyPartsToMeta :: [BodyPart] -> DocxContext Meta
+bodyPartsToMeta :: PandocMonad m => [BodyPart] -> DocxContext m Meta
bodyPartsToMeta bps = do
mp <- bodyPartsToMeta' bps
let mp' =
@@ -208,7 +213,7 @@ fixAuthors :: MetaValue -> MetaValue
fixAuthors (MetaBlocks blks) =
MetaList $ map g $ filter f blks
where f (Para _) = True
- f _ = False
+ f _ = False
g (Para ils) = MetaInlines ils
g _ = MetaInlines []
fixAuthors mv = mv
@@ -220,106 +225,122 @@ codeDivs :: [String]
codeDivs = ["SourceCode"]
runElemToInlines :: RunElem -> Inlines
-runElemToInlines (TextRun s) = text s
-runElemToInlines (LnBrk) = linebreak
-runElemToInlines (Tab) = space
-runElemToInlines (SoftHyphen) = text "\xad"
-runElemToInlines (NoBreakHyphen) = text "\x2011"
+runElemToInlines (TextRun s) = text s
+runElemToInlines LnBrk = linebreak
+runElemToInlines Tab = space
+runElemToInlines SoftHyphen = text "\xad"
+runElemToInlines NoBreakHyphen = text "\x2011"
runElemToString :: RunElem -> String
-runElemToString (TextRun s) = s
-runElemToString (LnBrk) = ['\n']
-runElemToString (Tab) = ['\t']
-runElemToString (SoftHyphen) = ['\xad']
-runElemToString (NoBreakHyphen) = ['\x2011']
+runElemToString (TextRun s) = s
+runElemToString LnBrk = ['\n']
+runElemToString Tab = ['\t']
+runElemToString SoftHyphen = ['\xad']
+runElemToString NoBreakHyphen = ['\x2011']
runToString :: Run -> String
runToString (Run _ runElems) = concatMap runElemToString runElems
-runToString _ = ""
+runToString _ = ""
parPartToString :: ParPart -> String
-parPartToString (PlainRun run) = runToString run
+parPartToString (PlainRun run) = runToString run
parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
-parPartToString _ = ""
+parPartToString _ = ""
blacklistedCharStyles :: [String]
blacklistedCharStyles = ["Hyperlink"]
-resolveDependentRunStyle :: RunStyle -> RunStyle
+resolveDependentRunStyle :: PandocMonad m => RunStyle -> DocxContext m RunStyle
resolveDependentRunStyle rPr
| Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles =
- rPr
- | Just (_, cs) <- rStyle rPr =
- let rPr' = resolveDependentRunStyle cs
- in
- RunStyle { isBold = case isBold rPr of
- Just bool -> Just bool
- Nothing -> isBold rPr'
- , isItalic = case isItalic rPr of
- Just bool -> Just bool
- Nothing -> isItalic rPr'
- , isSmallCaps = case isSmallCaps rPr of
- Just bool -> Just bool
- Nothing -> isSmallCaps rPr'
- , isStrike = case isStrike rPr of
- Just bool -> Just bool
- Nothing -> isStrike rPr'
- , rVertAlign = case rVertAlign rPr of
- Just valign -> Just valign
- Nothing -> rVertAlign rPr'
- , rUnderline = case rUnderline rPr of
- Just ulstyle -> Just ulstyle
- Nothing -> rUnderline rPr'
- , rStyle = rStyle rPr }
- | otherwise = rPr
-
-runStyleToTransform :: RunStyle -> (Inlines -> Inlines)
+ return rPr
+ | Just (_, cs) <- rStyle rPr = do
+ opts <- asks docxOptions
+ if isEnabled Ext_styles opts
+ then return rPr
+ else do rPr' <- resolveDependentRunStyle cs
+ return $
+ RunStyle { isBold = case isBold rPr of
+ Just bool -> Just bool
+ Nothing -> isBold rPr'
+ , isItalic = case isItalic rPr of
+ Just bool -> Just bool
+ Nothing -> isItalic rPr'
+ , isSmallCaps = case isSmallCaps rPr of
+ Just bool -> Just bool
+ Nothing -> isSmallCaps rPr'
+ , isStrike = case isStrike rPr of
+ Just bool -> Just bool
+ Nothing -> isStrike rPr'
+ , rVertAlign = case rVertAlign rPr of
+ Just valign -> Just valign
+ Nothing -> rVertAlign rPr'
+ , rUnderline = case rUnderline rPr of
+ Just ulstyle -> Just ulstyle
+ Nothing -> rUnderline rPr'
+ , rStyle = rStyle rPr }
+ | otherwise = return rPr
+
+runStyleToTransform :: PandocMonad m => RunStyle -> DocxContext m (Inlines -> Inlines)
runStyleToTransform rPr
| Just (s, _) <- rStyle rPr
- , s `elem` spansToKeep =
- let rPr' = rPr{rStyle = Nothing}
- in
- (spanWith ("", [s], [])) . (runStyleToTransform rPr')
- | Just True <- isItalic rPr =
- emph . (runStyleToTransform rPr {isItalic = Nothing})
- | Just True <- isBold rPr =
- strong . (runStyleToTransform rPr {isBold = Nothing})
- | Just True <- isSmallCaps rPr =
- smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing})
- | Just True <- isStrike rPr =
- strikeout . (runStyleToTransform rPr {isStrike = Nothing})
- | Just SupScrpt <- rVertAlign rPr =
- superscript . (runStyleToTransform rPr {rVertAlign = Nothing})
- | Just SubScrpt <- rVertAlign rPr =
- subscript . (runStyleToTransform rPr {rVertAlign = Nothing})
- | Just "single" <- rUnderline rPr =
- emph . (runStyleToTransform rPr {rUnderline = Nothing})
- | otherwise = id
-
-runToInlines :: Run -> DocxContext Inlines
+ , s `elem` spansToKeep = do
+ transform <- runStyleToTransform rPr{rStyle = Nothing}
+ return $ spanWith ("", [s], []) . transform
+ | Just (s, _) <- rStyle rPr = do
+ opts <- asks docxOptions
+ let extraInfo = if isEnabled Ext_styles opts
+ then spanWith ("", [], [("custom-style", s)])
+ else id
+ transform <- runStyleToTransform rPr{rStyle = Nothing}
+ return $ extraInfo . transform
+ | Just True <- isItalic rPr = do
+ transform <- runStyleToTransform rPr{isItalic = Nothing}
+ return $ emph . transform
+ | Just True <- isBold rPr = do
+ transform <- runStyleToTransform rPr{isBold = Nothing}
+ return $ strong . transform
+ | Just True <- isSmallCaps rPr = do
+ transform <- runStyleToTransform rPr{isSmallCaps = Nothing}
+ return $ smallcaps . transform
+ | Just True <- isStrike rPr = do
+ transform <- runStyleToTransform rPr{isStrike = Nothing}
+ return $ strikeout . transform
+ | Just SupScrpt <- rVertAlign rPr = do
+ transform <- runStyleToTransform rPr{rVertAlign = Nothing}
+ return $ superscript . transform
+ | Just SubScrpt <- rVertAlign rPr = do
+ transform <- runStyleToTransform rPr{rVertAlign = Nothing}
+ return $ subscript . transform
+ | Just "single" <- rUnderline rPr = do
+ transform <- runStyleToTransform rPr{rUnderline = Nothing}
+ return $ underlineSpan . transform
+ | otherwise = return id
+
+runToInlines :: PandocMonad m => Run -> DocxContext m Inlines
runToInlines (Run rs runElems)
| Just (s, _) <- rStyle rs
- , s `elem` codeStyles =
- let rPr = resolveDependentRunStyle rs
- codeString = code $ concatMap runElemToString runElems
- in
- return $ case rVertAlign rPr of
- Just SupScrpt -> superscript codeString
- Just SubScrpt -> subscript codeString
- _ -> codeString
+ , s `elem` codeStyles = do
+ rPr <- resolveDependentRunStyle rs
+ let codeString = code $ concatMap runElemToString runElems
+ return $ case rVertAlign rPr of
+ Just SupScrpt -> superscript codeString
+ Just SubScrpt -> subscript codeString
+ _ -> codeString
| otherwise = do
- let ils = smushInlines (map runElemToInlines runElems)
- return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils
+ rPr <- resolveDependentRunStyle rs
+ let ils = smushInlines (map runElemToInlines runElems)
+ transform <- runStyleToTransform rPr
+ return $ transform ils
runToInlines (Footnote bps) = do
- blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps)
+ blksList <- smushBlocks <$> mapM bodyPartToBlocks bps
return $ note blksList
runToInlines (Endnote bps) = do
- blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps)
+ blksList <- smushBlocks <$> mapM bodyPartToBlocks bps
return $ note blksList
runToInlines (InlineDrawing fp title alt bs ext) = do
- mediaBag <- gets docxMediaBag
- modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ (lift . lift) $ P.insertMedia fp Nothing bs
return $ imageWith (extentToAttr ext) fp title $ text alt
runToInlines InlineChart = return $ spanWith ("", ["chart"], []) $ text "[CHART]"
@@ -330,20 +351,39 @@ extentToAttr (Just (w, h)) =
showDim d = show (d / 914400) ++ "in"
extentToAttr _ = nullAttr
-blocksToInlinesWarn :: String -> Blocks -> DocxContext Inlines
+blocksToInlinesWarn :: PandocMonad m => String -> Blocks -> DocxContext m Inlines
blocksToInlinesWarn cmtId blks = do
let blkList = toList blks
notParaOrPlain :: Block -> Bool
- notParaOrPlain (Para _) = False
+ notParaOrPlain (Para _) = False
notParaOrPlain (Plain _) = False
- notParaOrPlain _ = True
- when (not $ null $ filter notParaOrPlain blkList)
- (addDocxWarning $ "Docx comment " ++ cmtId ++ " will not retain formatting")
- return $ fromList $ blocksToInlines blkList
-
-parPartToInlines :: ParPart -> DocxContext Inlines
-parPartToInlines (PlainRun r) = runToInlines r
-parPartToInlines (Insertion _ author date runs) = do
+ notParaOrPlain _ = True
+ unless ( not (any notParaOrPlain blkList)) $
+ lift $ P.report $ DocxParserWarning $
+ "Docx comment " ++ cmtId ++ " will not retain formatting"
+ return $ blocksToInlines' blkList
+
+-- The majority of work in this function is done in the primed
+-- subfunction `partPartToInlines'`. We make this wrapper so that we
+-- don't have to modify `docxImmedPrevAnchor` state after every function.
+parPartToInlines :: PandocMonad m => ParPart -> DocxContext m Inlines
+parPartToInlines parPart =
+ case parPart of
+ (BookMark _ anchor) | notElem anchor dummyAnchors -> do
+ inHdrBool <- asks docxInHeaderBlock
+ ils <- parPartToInlines' parPart
+ immedPrevAnchor <- gets docxImmedPrevAnchor
+ unless (isJust immedPrevAnchor || inHdrBool)
+ (modify $ \s -> s{ docxImmedPrevAnchor = Just anchor})
+ return ils
+ _ -> do
+ ils <- parPartToInlines' parPart
+ modify $ \s -> s{ docxImmedPrevAnchor = Nothing}
+ return ils
+
+parPartToInlines' :: PandocMonad m => ParPart -> DocxContext m Inlines
+parPartToInlines' (PlainRun r) = runToInlines r
+parPartToInlines' (ChangedRuns (TrackedChange Insertion (ChangeInfo _ author date)) runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AcceptChanges -> smushInlines <$> mapM runToInlines runs
@@ -352,7 +392,7 @@ parPartToInlines (Insertion _ author date runs) = do
ils <- smushInlines <$> mapM runToInlines runs
let attr = ("", ["insertion"], [("author", author), ("date", date)])
return $ spanWith attr ils
-parPartToInlines (Deletion _ author date runs) = do
+parPartToInlines' (ChangedRuns (TrackedChange Deletion (ChangeInfo _ author date)) runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AcceptChanges -> return mempty
@@ -361,7 +401,7 @@ parPartToInlines (Deletion _ author date runs) = do
ils <- smushInlines <$> mapM runToInlines runs
let attr = ("", ["deletion"], [("author", author), ("date", date)])
return $ spanWith attr ils
-parPartToInlines (CommentStart cmtId author date bodyParts) = do
+parPartToInlines' (CommentStart cmtId author date bodyParts) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AllChanges -> do
@@ -370,16 +410,16 @@ parPartToInlines (CommentStart cmtId author date bodyParts) = do
let attr = ("", ["comment-start"], [("id", cmtId), ("author", author), ("date", date)])
return $ spanWith attr ils
_ -> return mempty
-parPartToInlines (CommentEnd cmtId) = do
+parPartToInlines' (CommentEnd cmtId) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AllChanges -> do
let attr = ("", ["comment-end"], [("id", cmtId)])
return $ spanWith attr mempty
_ -> return mempty
-parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors =
+parPartToInlines' (BookMark _ anchor) | anchor `elem` dummyAnchors =
return mempty
-parPartToInlines (BookMark _ anchor) =
+parPartToInlines' (BookMark _ anchor) =
-- We record these, so we can make sure not to overwrite
-- user-defined anchor links with header auto ids.
do
@@ -395,27 +435,40 @@ parPartToInlines (BookMark _ anchor) =
-- of rewriting user-defined anchor links. However, since these
-- are not defined in pandoc, it seems like a necessary evil to
-- avoid an extra pass.
- let newAnchor =
- if not inHdrBool && anchor `elem` (M.elems anchorMap)
- then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
- else anchor
- unless inHdrBool
- (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
- return $ spanWith (newAnchor, ["anchor"], []) mempty
-parPartToInlines (Drawing fp title alt bs ext) = do
- mediaBag <- gets docxMediaBag
- modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ immedPrevAnchor <- gets docxImmedPrevAnchor
+ case immedPrevAnchor of
+ Just prevAnchor -> do
+ unless inHdrBool
+ (modify $ \s -> s { docxAnchorMap = M.insert anchor prevAnchor anchorMap})
+ return mempty
+ Nothing -> do
+ let newAnchor =
+ if not inHdrBool && anchor `elem` M.elems anchorMap
+ then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
+ else anchor
+ unless inHdrBool
+ (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
+ return $ spanWith (newAnchor, ["anchor"], []) mempty
+parPartToInlines' (Drawing fp title alt bs ext) = do
+ (lift . lift) $ P.insertMedia fp Nothing bs
return $ imageWith (extentToAttr ext) fp title $ text alt
-parPartToInlines Chart = do
+parPartToInlines' Chart =
return $ spanWith ("", ["chart"], []) $ text "[CHART]"
-parPartToInlines (InternalHyperLink anchor runs) = do
+parPartToInlines' (InternalHyperLink anchor runs) = do
ils <- smushInlines <$> mapM runToInlines runs
return $ link ('#' : anchor) "" ils
-parPartToInlines (ExternalHyperLink target runs) = do
+parPartToInlines' (ExternalHyperLink target runs) = do
ils <- smushInlines <$> mapM runToInlines runs
return $ link target "" ils
-parPartToInlines (PlainOMath exps) = do
+parPartToInlines' (PlainOMath exps) =
return $ math $ writeTeX exps
+parPartToInlines' (SmartTag runs) =
+ smushInlines <$> mapM runToInlines runs
+parPartToInlines' (Field info runs) =
+ case info of
+ HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs
+ UnknownField -> smushInlines <$> mapM runToInlines runs
+parPartToInlines' NullParPart = return mempty
isAnchorSpan :: Inline -> Bool
isAnchorSpan (Span (_, classes, kvs) _) =
@@ -426,10 +479,10 @@ isAnchorSpan _ = False
dummyAnchors :: [String]
dummyAnchors = ["_GoBack"]
-makeHeaderAnchor :: Blocks -> DocxContext Blocks
+makeHeaderAnchor :: PandocMonad m => Blocks -> DocxContext m Blocks
makeHeaderAnchor bs = traverse makeHeaderAnchor' bs
-makeHeaderAnchor' :: Block -> DocxContext Block
+makeHeaderAnchor' :: PandocMonad m => Block -> DocxContext m Block
-- If there is an anchor already there (an anchor span in the header,
-- to be exact), we rename and associate the new id with the old one.
makeHeaderAnchor' (Header n (ident, classes, kvs) ils)
@@ -458,126 +511,184 @@ makeHeaderAnchor' blk = return blk
-- Rewrite a standalone paragraph block as a plain
singleParaToPlain :: Blocks -> Blocks
singleParaToPlain blks
- | (Para (ils) :< seeq) <- viewl $ unMany blks
+ | (Para ils :< seeq) <- viewl $ unMany blks
, Seq.null seeq =
singleton $ Plain ils
singleParaToPlain blks = blks
-cellToBlocks :: Cell -> DocxContext Blocks
+cellToBlocks :: PandocMonad m => Cell -> DocxContext m Blocks
cellToBlocks (Cell bps) = do
blks <- smushBlocks <$> mapM bodyPartToBlocks bps
return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
-rowToBlocksList :: Row -> DocxContext [Blocks]
+rowToBlocksList :: PandocMonad m => Row -> DocxContext m [Blocks]
rowToBlocksList (Row cells) = do
blksList <- mapM cellToBlocks cells
return $ map singleParaToPlain blksList
-trimLineBreaks :: [Inline] -> [Inline]
-trimLineBreaks [] = []
-trimLineBreaks (LineBreak : ils) = trimLineBreaks ils
-trimLineBreaks ils
- | (LineBreak : ils') <- reverse ils = trimLineBreaks (reverse ils')
-trimLineBreaks ils = ils
+-- like trimInlines, but also take out linebreaks
+trimSps :: Inlines -> Inlines
+trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils
+ where isSp Space = True
+ isSp SoftBreak = True
+ isSp LineBreak = True
+ isSp _ = False
-parStyleToTransform :: ParagraphStyle -> (Blocks -> Blocks)
+parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks)
parStyleToTransform pPr
| (c:cs) <- pStyle pPr
- , c `elem` divsToKeep =
- let pPr' = pPr { pStyle = cs }
- in
- (divWith ("", [c], [])) . (parStyleToTransform pPr')
+ , c `elem` divsToKeep = do
+ let pPr' = pPr { pStyle = cs }
+ transform <- parStyleToTransform pPr'
+ return $ divWith ("", [c], []) . transform
| (c:cs) <- pStyle pPr,
- c `elem` listParagraphDivs =
+ c `elem` listParagraphDivs = do
let pPr' = pPr { pStyle = cs, indentation = Nothing}
- in
- (divWith ("", [c], [])) . (parStyleToTransform pPr')
- | (_:cs) <- pStyle pPr
- , Just True <- pBlockQuote pPr =
- let pPr' = pPr { pStyle = cs }
- in
- blockQuote . (parStyleToTransform pPr')
- | (_:cs) <- pStyle pPr =
+ transform <- parStyleToTransform pPr'
+ return $ divWith ("", [c], []) . transform
+ | (c:cs) <- pStyle pPr
+ , Just True <- pBlockQuote pPr = do
+ opts <- asks docxOptions
+ let pPr' = pPr { pStyle = cs }
+ transform <- parStyleToTransform pPr'
+ let extraInfo = if isEnabled Ext_styles opts
+ then divWith ("", [], [("custom-style", c)])
+ else id
+ return $ extraInfo . blockQuote . transform
+ | (c:cs) <- pStyle pPr = do
+ opts <- asks docxOptions
let pPr' = pPr { pStyle = cs}
- in
- parStyleToTransform pPr'
+ transform <- parStyleToTransform pPr'
+ let extraInfo = if isEnabled Ext_styles opts
+ then divWith ("", [], [("custom-style", c)])
+ else id
+ return $ extraInfo . transform
| null (pStyle pPr)
, Just left <- indentation pPr >>= leftParIndent
- , Just hang <- indentation pPr >>= hangingParIndent =
+ , Just hang <- indentation pPr >>= hangingParIndent = do
let pPr' = pPr { indentation = Nothing }
- in
- case (left - hang) > 0 of
- True -> blockQuote . (parStyleToTransform pPr')
- False -> parStyleToTransform pPr'
+ transform <- parStyleToTransform pPr'
+ return $ case (left - hang) > 0 of
+ True -> blockQuote . transform
+ False -> transform
| null (pStyle pPr),
- Just left <- indentation pPr >>= leftParIndent =
+ Just left <- indentation pPr >>= leftParIndent = do
let pPr' = pPr { indentation = Nothing }
- in
- case left > 0 of
- True -> blockQuote . (parStyleToTransform pPr')
- False -> parStyleToTransform pPr'
-parStyleToTransform _ = id
+ transform <- parStyleToTransform pPr'
+ return $ case left > 0 of
+ True -> blockQuote . transform
+ False -> transform
+parStyleToTransform _ = return id
-bodyPartToBlocks :: BodyPart -> DocxContext Blocks
+bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks
bodyPartToBlocks (Paragraph pPr parparts)
- | not $ null $ codeDivs `intersect` (pStyle pPr) =
- return
- $ parStyleToTransform pPr
- $ codeBlock
- $ concatMap parPartToString parparts
+ | not $ null $ codeDivs `intersect` (pStyle pPr) = do
+ transform <- parStyleToTransform pPr
+ return $
+ transform $
+ codeBlock $
+ concatMap parPartToString parparts
| Just (style, n) <- pHeading pPr = do
- ils <- local (\s-> s{docxInHeaderBlock=True}) $
+ ils <-local (\s-> s{docxInHeaderBlock=True})
(smushInlines <$> mapM parPartToInlines parparts)
makeHeaderAnchor $
headerWith ("", delete style (pStyle pPr), []) n ils
| otherwise = do
- ils <- smushInlines <$> mapM parPartToInlines parparts >>=
- (return . fromList . trimLineBreaks . normalizeSpaces . toList)
+ ils <- (trimSps . smushInlines) <$> mapM parPartToInlines parparts
+ prevParaIls <- gets docxPrevPara
dropIls <- gets docxDropCap
let ils' = dropIls <> ils
if dropCap pPr
then do modify $ \s -> s { docxDropCap = ils' }
return mempty
else do modify $ \s -> s { docxDropCap = mempty }
- return $ case isNull ils' of
- True -> mempty
- _ -> parStyleToTransform pPr $ para ils'
+ let ils'' = prevParaIls <>
+ (if isNull prevParaIls then mempty else space) <>
+ ils'
+ opts <- asks docxOptions
+ case () of
+
+ _ | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) ->
+ return mempty
+ _ | Just (TrackedChange Insertion _) <- pChange pPr
+ , AcceptChanges <- readerTrackChanges opts -> do
+ modify $ \s -> s {docxPrevPara = mempty}
+ transform <- parStyleToTransform pPr
+ return $ transform $ para ils''
+ _ | Just (TrackedChange Insertion _) <- pChange pPr
+ , RejectChanges <- readerTrackChanges opts -> do
+ modify $ \s -> s {docxPrevPara = ils''}
+ return mempty
+ _ | Just (TrackedChange Insertion cInfo) <- pChange pPr
+ , AllChanges <- readerTrackChanges opts
+ , ChangeInfo _ cAuthor cDate <- cInfo -> do
+ let attr = ("", ["paragraph-insertion"], [("author", cAuthor), ("date", cDate)])
+ insertMark = spanWith attr mempty
+ transform <- parStyleToTransform pPr
+ return $ transform $
+ para $ ils'' <> insertMark
+ _ | Just (TrackedChange Deletion _) <- pChange pPr
+ , AcceptChanges <- readerTrackChanges opts -> do
+ modify $ \s -> s {docxPrevPara = ils''}
+ return mempty
+ _ | Just (TrackedChange Deletion _) <- pChange pPr
+ , RejectChanges <- readerTrackChanges opts -> do
+ modify $ \s -> s {docxPrevPara = mempty}
+ transform <- parStyleToTransform pPr
+ return $ transform $ para ils''
+ _ | Just (TrackedChange Deletion cInfo) <- pChange pPr
+ , AllChanges <- readerTrackChanges opts
+ , ChangeInfo _ cAuthor cDate <- cInfo -> do
+ let attr = ("", ["paragraph-deletion"], [("author", cAuthor), ("date", cDate)])
+ insertMark = spanWith attr mempty
+ transform <- parStyleToTransform pPr
+ return $ transform $
+ para $ ils'' <> insertMark
+ _ | otherwise -> do
+ modify $ \s -> s {docxPrevPara = mempty}
+ transform <- parStyleToTransform pPr
+ return $ transform $ para ils''
bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do
- let
- kvs = case levelInfo of
- (_, fmt, txt, Just start) -> [ ("level", lvl)
- , ("num-id", numId)
- , ("format", fmt)
- , ("text", txt)
- , ("start", (show start))
- ]
-
- (_, fmt, txt, Nothing) -> [ ("level", lvl)
- , ("num-id", numId)
- , ("format", fmt)
- , ("text", txt)
- ]
+ -- We check whether this current numId has previously been used,
+ -- since Docx expects us to pick up where we left off.
+ listState <- gets docxListState
+ let startFromState = M.lookup (numId, lvl) listState
+ (_, fmt,txt, startFromLevelInfo) = levelInfo
+ start = case startFromState of
+ Just n -> n + 1
+ Nothing -> fromMaybe 1 startFromLevelInfo
+ kvs = [ ("level", lvl)
+ , ("num-id", numId)
+ , ("format", fmt)
+ , ("text", txt)
+ , ("start", show start)
+ ]
+ modify $ \st -> st{ docxListState = M.insert (numId, lvl) start listState}
blks <- bodyPartToBlocks (Paragraph pPr parparts)
return $ divWith ("", ["list-item"], kvs) blks
-bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
- let pPr' = pPr {pStyle = "ListParagraph": (pStyle pPr)}
+bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
+ let pPr' = pPr {pStyle = "ListParagraph": pStyle pPr}
in
bodyPartToBlocks $ Paragraph pPr' parparts
bodyPartToBlocks (Tbl _ _ _ []) =
return $ para mempty
-bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
+bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do
let caption = text cap
(hdr, rows) = case firstRowFormatting look of
True | null rs -> (Nothing, [r])
| otherwise -> (Just r, rs)
False -> (Nothing, r:rs)
- cells <- mapM rowToBlocksList rows
+ cells <- mapM rowToBlocksList rows
- let width = case cells of
- r':_ -> length r'
- -- shouldn't happen
- [] -> 0
+ let width = maybe 0 maximum $ nonEmpty $ map rowLength parts
+ -- Data.List.NonEmpty is not available with ghc 7.10 so we roll out
+ -- our own, see
+ -- https://github.com/jgm/pandoc/pull/4361#issuecomment-365416155
+ nonEmpty [] = Nothing
+ nonEmpty l = Just l
+ rowLength :: Row -> Int
+ rowLength (Row c) = length c
hdrCells <- case hdr of
Just r' -> rowToBlocksList r'
@@ -592,36 +703,54 @@ bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
widths = replicate width 0 :: [Double]
return $ table caption (zip alignments widths) hdrCells cells
-bodyPartToBlocks (OMathPara e) = do
+bodyPartToBlocks (OMathPara e) =
return $ para $ displayMath (writeTeX e)
-- replace targets with generated anchors.
-rewriteLink' :: Inline -> DocxContext Inline
+rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline
rewriteLink' l@(Link attr ils ('#':target, title)) = do
anchorMap <- gets docxAnchorMap
- return $ case M.lookup target anchorMap of
- Just newTarget -> (Link attr ils ('#':newTarget, title))
- Nothing -> l
+ case M.lookup target anchorMap of
+ Just newTarget -> do
+ modify $ \s -> s{docxAnchorSet = Set.insert newTarget (docxAnchorSet s)}
+ return $ Link attr ils ('#':newTarget, title)
+ Nothing -> do
+ modify $ \s -> s{docxAnchorSet = Set.insert target (docxAnchorSet s)}
+ return l
rewriteLink' il = return il
-rewriteLinks :: [Block] -> DocxContext [Block]
+rewriteLinks :: PandocMonad m => [Block] -> DocxContext m [Block]
rewriteLinks = mapM (walkM rewriteLink')
-bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag, [String])
+removeOrphanAnchors'' :: PandocMonad m => Inline -> DocxContext m [Inline]
+removeOrphanAnchors'' s@(Span (ident, classes, _) ils)
+ | "anchor" `elem` classes = do
+ anchorSet <- gets docxAnchorSet
+ return $ if ident `Set.member` anchorSet
+ then [s]
+ else ils
+removeOrphanAnchors'' il = return [il]
+
+removeOrphanAnchors' :: PandocMonad m => [Inline] -> DocxContext m [Inline]
+removeOrphanAnchors' ils = liftM concat $ mapM removeOrphanAnchors'' ils
+
+removeOrphanAnchors :: PandocMonad m => [Block] -> DocxContext m [Block]
+removeOrphanAnchors = mapM (walkM removeOrphanAnchors')
+
+bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block])
bodyToOutput (Body bps) = do
let (metabps, blkbps) = sepBodyParts bps
meta <- bodyPartsToMeta metabps
blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
- mediaBag <- gets docxMediaBag
- warnings <- gets docxWarnings
- return $ (meta,
- blks',
- mediaBag,
- warnings)
-
-docxToOutput :: ReaderOptions -> Docx -> Either PandocError (Meta, [Block], MediaBag, [String])
+ blks'' <- removeOrphanAnchors blks'
+ return (meta, blks'')
+
+docxToOutput :: PandocMonad m
+ => ReaderOptions
+ -> Docx
+ -> m (Meta, [Block])
docxToOutput opts (Docx (Document _ body)) =
let dEnv = def { docxOptions = opts} in
evalDocxContext (bodyToOutput body) dEnv def