summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJesse Rosenthal <jrosenthal@jhu.edu>2014-06-26 16:48:41 -0400
committerJesse Rosenthal <jrosenthal@jhu.edu>2014-06-28 04:00:16 -0400
commit4248f25152d5715ad99f9d8dda8bf83f33f650ff (patch)
tree3628de4d30cce6e4ee027ebbc29519a2c38a0d44 /src
parentb1a8f1fa1ad8a6083f0e00cf786eaeff5f10c3be (diff)
Move Docx reader to DocxContext monad
This is a ReaderT State stack, which keeps track of some environment info, such as the options and the docx doc. The state will come in handy in the future, for a couple of planned features (rewriting the section anchors as auto_idents, and hopefully smart-quoting).
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs247
1 files changed, 140 insertions, 107 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index cb0735e31..5773027f2 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -88,6 +88,9 @@ import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import Data.ByteString.Base64 (encode)
import System.FilePath (combine)
+import qualified Data.Map as M
+import Control.Monad.Reader
+import Control.Monad.State
readDocx :: ReaderOptions
-> B.ByteString
@@ -97,11 +100,24 @@ readDocx opts bytes =
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
Nothing -> error $ "couldn't parse docx file"
-spansToKeep :: [String]
-spansToKeep = []
+data DState = DState { docxHdrLinks :: M.Map String String }
+
+data DEnv = DEnv { docxOptions :: ReaderOptions
+ , docxDocument :: Docx}
+
+type DocxContext = ReaderT DEnv (State DState)
+
+evalDocxContext :: DocxContext a -> DEnv -> DState -> a
+evalDocxContext ctx env st = evalState (runReaderT ctx env) st
+
+concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
+concatMapM f xs = liftM concat (mapM f xs)
-- This is empty, but we put it in for future-proofing.
+spansToKeep :: [String]
+spansToKeep = []
+
divsToKeep :: [String]
divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
@@ -213,57 +229,69 @@ inlineCodeContainer (Container f) = case f [] of
inlineCodeContainer _ = False
-runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
-runToInlines _ _ (Run rs runElems)
+runToInlines :: Run -> DocxContext [Inline]
+runToInlines (Run rs runElems)
| any inlineCodeContainer (runStyleToContainers rs) =
+ return $
rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]
| otherwise =
+ return $
rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
-runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) =
+runToInlines (Footnote fnId) = do
+ (Docx _ notes _ _ _ ) <- asks docxDocument
case (getFootNote fnId notes) of
- Just bodyParts ->
- [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
- Nothing ->
- [Note []]
-runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) =
+ Just bodyParts -> do
+ blks <- concatMapM bodyPartToBlocks bodyParts
+ return $ [Note blks]
+ Nothing -> return [Note []]
+runToInlines (Endnote fnId) = do
+ (Docx _ notes _ _ _ ) <- asks docxDocument
case (getEndNote fnId notes) of
- Just bodyParts ->
- [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
- Nothing ->
- [Note []]
-
-parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline]
-parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
-parPartToInlines opts docx (Insertion _ author date runs) =
+ Just bodyParts -> do
+ blks <- concatMapM bodyPartToBlocks bodyParts
+ return $ [Note blks]
+ Nothing -> return [Note []]
+
+parPartToInlines :: ParPart -> DocxContext [Inline]
+parPartToInlines (PlainRun r) = runToInlines r
+parPartToInlines (Insertion _ author date runs) = do
+ opts <- asks docxOptions
case readerTrackChanges opts of
- AcceptChanges -> concatMap (runToInlines opts docx) runs
- RejectChanges -> []
- AllChanges ->
- [Span
- ("", ["insertion"], [("author", author), ("date", date)])
- (concatMap (runToInlines opts docx) runs)]
-parPartToInlines opts docx (Deletion _ author date runs) =
+ AcceptChanges -> concatMapM runToInlines runs >>= return
+ RejectChanges -> return []
+ AllChanges -> do
+ ils <- (concatMapM runToInlines runs)
+ return [Span
+ ("", ["insertion"], [("author", author), ("date", date)])
+ ils]
+parPartToInlines (Deletion _ author date runs) = do
+ opts <- asks docxOptions
case readerTrackChanges opts of
- AcceptChanges -> []
- RejectChanges -> concatMap (runToInlines opts docx) runs
- AllChanges ->
- [Span
- ("", ["deletion"], [("author", author), ("date", date)])
- (concatMap (runToInlines opts docx) runs)]
-parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = []
-parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []]
-parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) =
- case lookupRelationship relid rels of
+ AcceptChanges -> return []
+ RejectChanges -> concatMapM runToInlines runs >>= return
+ AllChanges -> do
+ ils <- concatMapM runToInlines runs
+ return [Span
+ ("", ["deletion"], [("author", author), ("date", date)])
+ ils]
+parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return []
+parPartToInlines (BookMark _ anchor) = return [Span (anchor, ["anchor"], []) []]
+parPartToInlines (Drawing relid) = do
+ (Docx _ _ _ rels _) <- asks docxDocument
+ return $ case lookupRelationship relid rels of
Just target -> [Image [] (combine "word" target, "")]
Nothing -> [Image [] ("", "")]
-parPartToInlines opts docx (InternalHyperLink anchor runs) =
- [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")]
-parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) =
- case lookupRelationship relid rels of
+parPartToInlines (InternalHyperLink anchor runs) = do
+ ils <- concatMapM runToInlines runs
+ return [Link ils ('#' : anchor, "")]
+parPartToInlines (ExternalHyperLink relid runs) = do
+ (Docx _ _ _ rels _) <- asks docxDocument
+ rs <- concatMapM runToInlines runs
+ return $ case lookupRelationship relid rels of
Just target ->
- [Link (concatMap (runToInlines opts docx) runs) (target, "")]
+ [Link rs (target, "")]
Nothing ->
- [Link (concatMap (runToInlines opts docx) runs) ("", "")]
+ [Link rs ("", "")]
isAnchorSpan :: Inline -> Bool
isAnchorSpan (Span (ident, classes, kvs) ils) =
@@ -287,25 +315,18 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
_ -> h
makeHeaderAnchors blk = blk
-parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline]
-parPartsToInlines opts docx parparts =
- --
- -- We're going to skip data-uri's for now. It should be an option,
- -- not mandatory.
- --
- (if False -- TODO depend on option
- then walk (makeImagesSelfContained docx)
- else id) $
- -- bottomUp spanTrim $
- -- bottomUp spanCorrect $
- -- bottomUp spanReduce $
- reduceList $ concatMap (parPartToInlines opts docx) parparts
-
-cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block]
-cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps
-
-rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
-rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
+parPartsToInlines :: [ParPart] -> DocxContext [Inline]
+parPartsToInlines parparts = do
+ ils <- concatMapM parPartToInlines parparts >>=
+ -- TODO: Option for self-containted images
+ (if False then (walkM makeImagesSelfContained) else return)
+ return $ reduceList $ ils
+
+cellToBlocks :: Cell -> DocxContext [Block]
+cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps
+
+rowToBlocksList :: Row -> DocxContext [[Block]]
+rowToBlocksList (Row cells) = mapM cellToBlocks cells
blockCodeContainer :: Container Block -> Bool
blockCodeContainer (Container f) = case f [] of
@@ -313,27 +334,32 @@ blockCodeContainer (Container f) = case f [] of
_ -> False
blockCodeContainer _ = False
-bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block]
-bodyPartToBlocks _ _ (Paragraph pPr parparts)
+bodyPartToBlocks :: BodyPart -> DocxContext [Block]
+bodyPartToBlocks (Paragraph pPr parparts)
| any blockCodeContainer (parStyleToContainers pPr) =
let
otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr)
in
+ return $
rebuild
otherConts
[CodeBlock ("", [], []) (concatMap parPartToString parparts)]
-bodyPartToBlocks opts docx (Paragraph pPr parparts) =
- case parPartsToInlines opts docx parparts of
- [] ->
- []
- _ ->
- let parContents = parPartsToInlines opts docx parparts
- trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents
- in
+bodyPartToBlocks (Paragraph pPr parparts) = do
+ ils <- parPartsToInlines parparts
+ case ils of
+ [] -> return []
+ _ -> do
+ parContents <- parPartsToInlines parparts
+ let trimmedContents = reverse $
+ dropWhile (Space ==) $
+ reverse $
+ dropWhile (Space ==) parContents
+ return $
rebuild
(parStyleToContainers pPr)
[Para trimmedContents]
-bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
+bodyPartToBlocks (ListItem pPr numId lvl parparts) = do
+ (Docx _ _ numbering _ _) <- asks docxDocument
let
kvs = case lookupLevel numId lvl numbering of
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
@@ -349,23 +375,22 @@ bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parp
, ("text", txt)
]
Nothing -> []
- in
- [Div
- ("", ["list-item"], kvs)
- (bodyPartToBlocks opts docx (Paragraph pPr parparts))]
-bodyPartToBlocks _ _ (Tbl _ _ _ []) =
- [Para []]
-bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
+ blks <- bodyPartToBlocks (Paragraph pPr parparts)
+ return $ [Div ("", ["list-item"], kvs) blks]
+bodyPartToBlocks (Tbl _ _ _ []) =
+ return [Para []]
+bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
let caption = strToInlines cap
(hdr, rows) = case firstRowFormatting look of
True -> (Just r, rs)
False -> (Nothing, r:rs)
- hdrCells = case hdr of
- Just r' -> rowToBlocksList opts docx r'
- Nothing -> []
- cells = map (rowToBlocksList opts docx) rows
+ hdrCells <- case hdr of
+ Just r' -> rowToBlocksList r'
+ Nothing -> return []
+
+ cells <- mapM rowToBlocksList rows
- size = case null hdrCells of
+ let size = case null hdrCells of
True -> length $ head cells
False -> length $ hdrCells
--
@@ -374,34 +399,42 @@ bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
-- moment. Width information is in the TblGrid field of the Tbl,
-- so should be possible. Alignment might be more difficult,
-- since there doesn't seem to be a column entity in docx.
- alignments = take size (repeat AlignDefault)
- widths = take size (repeat 0) :: [Double]
- in
- [Table caption alignments widths hdrCells cells]
-
-
-makeImagesSelfContained :: Docx -> Inline -> Inline
-makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) =
- case lookup uri media of
- Just bs -> case getMimeType uri of
- Just mime -> let data_uri =
- "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs)
- in
- Image alt (data_uri, title)
- Nothing -> i
+ alignments = replicate size AlignDefault
+ widths = replicate size 0 :: [Double]
+
+ return [Table caption alignments widths hdrCells cells]
+
+
+makeImagesSelfContained :: Inline -> DocxContext Inline
+makeImagesSelfContained i@(Image alt (uri, title)) = do
+ (Docx _ _ _ _ media) <- asks docxDocument
+ return $ case lookup uri media of
+ Just bs ->
+ case getMimeType uri of
+ Just mime ->
+ let data_uri = "data:" ++ mime ++ ";base64," ++
+ toString (encode $ BS.concat $ B.toChunks bs)
+ in
+ Image alt (data_uri, title)
+ Nothing -> i
Nothing -> i
-makeImagesSelfContained _ inline = inline
+makeImagesSelfContained inline = return inline
-bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
-bodyToBlocks opts docx (Body bps) =
- map (makeHeaderAnchors) $
- blocksToDefinitions $
- blocksToBullets $
- concatMap (bodyPartToBlocks opts docx) bps
+bodyToBlocks :: Body -> DocxContext [Block]
+bodyToBlocks (Body bps) = do
+ blks <- concatMapM bodyPartToBlocks bps
+ return $
+ map (makeHeaderAnchors) $
+ blocksToDefinitions $
+ blocksToBullets $ blks
docxToBlocks :: ReaderOptions -> Docx -> [Block]
-docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
-
+docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) =
+ let dState = DState { docxHdrLinks = M.empty }
+ dEnv = DEnv { docxOptions = opts
+ , docxDocument = d}
+ in
+ evalDocxContext (bodyToBlocks body) dEnv dState
ilToCode :: Inline -> String
ilToCode (Str s) = s