From 4248f25152d5715ad99f9d8dda8bf83f33f650ff Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Thu, 26 Jun 2014 16:48:41 -0400 Subject: Move Docx reader to DocxContext monad This is a ReaderT State stack, which keeps track of some environment info, such as the options and the docx doc. The state will come in handy in the future, for a couple of planned features (rewriting the section anchors as auto_idents, and hopefully smart-quoting). --- src/Text/Pandoc/Readers/Docx.hs | 247 +++++++++++++++++++++++----------------- 1 file changed, 140 insertions(+), 107 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index cb0735e31..5773027f2 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -88,6 +88,9 @@ import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B import Data.ByteString.Base64 (encode) import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State readDocx :: ReaderOptions -> B.ByteString @@ -97,11 +100,24 @@ readDocx opts bytes = Just docx -> Pandoc nullMeta (docxToBlocks opts docx) Nothing -> error $ "couldn't parse docx file" -spansToKeep :: [String] -spansToKeep = [] +data DState = DState { docxHdrLinks :: M.Map String String } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxDocument :: Docx} + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) -- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + divsToKeep :: [String] divsToKeep = ["list-item", "Definition", "DefinitionTerm"] @@ -213,57 +229,69 @@ inlineCodeContainer (Container f) = case f [] of inlineCodeContainer _ = False -runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] -runToInlines _ _ (Run rs runElems) +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems) | any inlineCodeContainer (runStyleToContainers rs) = + return $ rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems] | otherwise = + return $ rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) -runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) = +runToInlines (Footnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument case (getFootNote fnId notes) of - Just bodyParts -> - [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] - Nothing -> - [Note []] -runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) = + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] +runToInlines (Endnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument case (getEndNote fnId notes) of - Just bodyParts -> - [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] - Nothing -> - [Note []] - -parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline] -parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r -parPartToInlines opts docx (Insertion _ author date runs) = + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> concatMap (runToInlines opts docx) runs - RejectChanges -> [] - AllChanges -> - [Span - ("", ["insertion"], [("author", author), ("date", date)]) - (concatMap (runToInlines opts docx) runs)] -parPartToInlines opts docx (Deletion _ author date runs) = + AcceptChanges -> concatMapM runToInlines runs >>= return + RejectChanges -> return [] + AllChanges -> do + ils <- (concatMapM runToInlines runs) + return [Span + ("", ["insertion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> [] - RejectChanges -> concatMap (runToInlines opts docx) runs - AllChanges -> - [Span - ("", ["deletion"], [("author", author), ("date", date)]) - (concatMap (runToInlines opts docx) runs)] -parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = [] -parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []] -parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) = - case lookupRelationship relid rels of + AcceptChanges -> return [] + RejectChanges -> concatMapM runToInlines runs >>= return + AllChanges -> do + ils <- concatMapM runToInlines runs + return [Span + ("", ["deletion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = return [Span (anchor, ["anchor"], []) []] +parPartToInlines (Drawing relid) = do + (Docx _ _ _ rels _) <- asks docxDocument + return $ case lookupRelationship relid rels of Just target -> [Image [] (combine "word" target, "")] Nothing -> [Image [] ("", "")] -parPartToInlines opts docx (InternalHyperLink anchor runs) = - [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] -parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) = - case lookupRelationship relid rels of +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatMapM runToInlines runs + return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink relid runs) = do + (Docx _ _ _ rels _) <- asks docxDocument + rs <- concatMapM runToInlines runs + return $ case lookupRelationship relid rels of Just target -> - [Link (concatMap (runToInlines opts docx) runs) (target, "")] + [Link rs (target, "")] Nothing -> - [Link (concatMap (runToInlines opts docx) runs) ("", "")] + [Link rs ("", "")] isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (ident, classes, kvs) ils) = @@ -287,25 +315,18 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = _ -> h makeHeaderAnchors blk = blk -parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline] -parPartsToInlines opts docx parparts = - -- - -- We're going to skip data-uri's for now. It should be an option, - -- not mandatory. - -- - (if False -- TODO depend on option - then walk (makeImagesSelfContained docx) - else id) $ - -- bottomUp spanTrim $ - -- bottomUp spanCorrect $ - -- bottomUp spanReduce $ - reduceList $ concatMap (parPartToInlines opts docx) parparts - -cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block] -cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps - -rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] -rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do + ils <- concatMapM parPartToInlines parparts >>= + -- TODO: Option for self-containted images + (if False then (walkM makeImagesSelfContained) else return) + return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells blockCodeContainer :: Container Block -> Bool blockCodeContainer (Container f) = case f [] of @@ -313,27 +334,32 @@ blockCodeContainer (Container f) = case f [] of _ -> False blockCodeContainer _ = False -bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block] -bodyPartToBlocks _ _ (Paragraph pPr parparts) +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts) | any blockCodeContainer (parStyleToContainers pPr) = let otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr) in + return $ rebuild otherConts [CodeBlock ("", [], []) (concatMap parPartToString parparts)] -bodyPartToBlocks opts docx (Paragraph pPr parparts) = - case parPartsToInlines opts docx parparts of - [] -> - [] - _ -> - let parContents = parPartsToInlines opts docx parparts - trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents - in +bodyPartToBlocks (Paragraph pPr parparts) = do + ils <- parPartsToInlines parparts + case ils of + [] -> return [] + _ -> do + parContents <- parPartsToInlines parparts + let trimmedContents = reverse $ + dropWhile (Space ==) $ + reverse $ + dropWhile (Space ==) parContents + return $ rebuild (parStyleToContainers pPr) [Para trimmedContents] -bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) = +bodyPartToBlocks (ListItem pPr numId lvl parparts) = do + (Docx _ _ numbering _ _) <- asks docxDocument let kvs = case lookupLevel numId lvl numbering of Just (_, fmt, txt, Just start) -> [ ("level", lvl) @@ -349,23 +375,22 @@ bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parp , ("text", txt) ] Nothing -> [] - in - [Div - ("", ["list-item"], kvs) - (bodyPartToBlocks opts docx (Paragraph pPr parparts))] -bodyPartToBlocks _ _ (Tbl _ _ _ []) = - [Para []] -bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) = + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = + return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do let caption = strToInlines cap (hdr, rows) = case firstRowFormatting look of True -> (Just r, rs) False -> (Nothing, r:rs) - hdrCells = case hdr of - Just r' -> rowToBlocksList opts docx r' - Nothing -> [] - cells = map (rowToBlocksList opts docx) rows + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] + + cells <- mapM rowToBlocksList rows - size = case null hdrCells of + let size = case null hdrCells of True -> length $ head cells False -> length $ hdrCells -- @@ -374,34 +399,42 @@ bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) = -- moment. Width information is in the TblGrid field of the Tbl, -- so should be possible. Alignment might be more difficult, -- since there doesn't seem to be a column entity in docx. - alignments = take size (repeat AlignDefault) - widths = take size (repeat 0) :: [Double] - in - [Table caption alignments widths hdrCells cells] - - -makeImagesSelfContained :: Docx -> Inline -> Inline -makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) = - case lookup uri media of - Just bs -> case getMimeType uri of - Just mime -> let data_uri = - "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) - in - Image alt (data_uri, title) - Nothing -> i + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return [Table caption alignments widths hdrCells cells] + + +makeImagesSelfContained :: Inline -> DocxContext Inline +makeImagesSelfContained i@(Image alt (uri, title)) = do + (Docx _ _ _ _ media) <- asks docxDocument + return $ case lookup uri media of + Just bs -> + case getMimeType uri of + Just mime -> + let data_uri = "data:" ++ mime ++ ";base64," ++ + toString (encode $ BS.concat $ B.toChunks bs) + in + Image alt (data_uri, title) + Nothing -> i Nothing -> i -makeImagesSelfContained _ inline = inline +makeImagesSelfContained inline = return inline -bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] -bodyToBlocks opts docx (Body bps) = - map (makeHeaderAnchors) $ - blocksToDefinitions $ - blocksToBullets $ - concatMap (bodyPartToBlocks opts docx) bps +bodyToBlocks :: Body -> DocxContext [Block] +bodyToBlocks (Body bps) = do + blks <- concatMapM bodyPartToBlocks bps + return $ + map (makeHeaderAnchors) $ + blocksToDefinitions $ + blocksToBullets $ blks docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body - +docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = + let dState = DState { docxHdrLinks = M.empty } + dEnv = DEnv { docxOptions = opts + , docxDocument = d} + in + evalDocxContext (bodyToBlocks body) dEnv dState ilToCode :: Inline -> String ilToCode (Str s) = s -- cgit v1.2.3