From 4248f25152d5715ad99f9d8dda8bf83f33f650ff Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Thu, 26 Jun 2014 16:48:41 -0400 Subject: Move Docx reader to DocxContext monad This is a ReaderT State stack, which keeps track of some environment info, such as the options and the docx doc. The state will come in handy in the future, for a couple of planned features (rewriting the section anchors as auto_idents, and hopefully smart-quoting). --- src/Text/Pandoc/Readers/Docx.hs | 247 +++++++++++++++++++++++----------------- 1 file changed, 140 insertions(+), 107 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index cb0735e31..5773027f2 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -88,6 +88,9 @@ import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B import Data.ByteString.Base64 (encode) import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State readDocx :: ReaderOptions -> B.ByteString @@ -97,11 +100,24 @@ readDocx opts bytes = Just docx -> Pandoc nullMeta (docxToBlocks opts docx) Nothing -> error $ "couldn't parse docx file" -spansToKeep :: [String] -spansToKeep = [] +data DState = DState { docxHdrLinks :: M.Map String String } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxDocument :: Docx} + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) -- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + divsToKeep :: [String] divsToKeep = ["list-item", "Definition", "DefinitionTerm"] @@ -213,57 +229,69 @@ inlineCodeContainer (Container f) = case f [] of inlineCodeContainer _ = False -runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] -runToInlines _ _ (Run rs runElems) +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems) | any inlineCodeContainer (runStyleToContainers rs) = + return $ rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems] | otherwise = + return $ rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) -runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) = +runToInlines (Footnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument case (getFootNote fnId notes) of - Just bodyParts -> - [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] - Nothing -> - [Note []] -runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) = + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] +runToInlines (Endnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument case (getEndNote fnId notes) of - Just bodyParts -> - [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] - Nothing -> - [Note []] - -parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline] -parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r -parPartToInlines opts docx (Insertion _ author date runs) = + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> concatMap (runToInlines opts docx) runs - RejectChanges -> [] - AllChanges -> - [Span - ("", ["insertion"], [("author", author), ("date", date)]) - (concatMap (runToInlines opts docx) runs)] -parPartToInlines opts docx (Deletion _ author date runs) = + AcceptChanges -> concatMapM runToInlines runs >>= return + RejectChanges -> return [] + AllChanges -> do + ils <- (concatMapM runToInlines runs) + return [Span + ("", ["insertion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> [] - RejectChanges -> concatMap (runToInlines opts docx) runs - AllChanges -> - [Span - ("", ["deletion"], [("author", author), ("date", date)]) - (concatMap (runToInlines opts docx) runs)] -parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = [] -parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []] -parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) = - case lookupRelationship relid rels of + AcceptChanges -> return [] + RejectChanges -> concatMapM runToInlines runs >>= return + AllChanges -> do + ils <- concatMapM runToInlines runs + return [Span + ("", ["deletion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = return [Span (anchor, ["anchor"], []) []] +parPartToInlines (Drawing relid) = do + (Docx _ _ _ rels _) <- asks docxDocument + return $ case lookupRelationship relid rels of Just target -> [Image [] (combine "word" target, "")] Nothing -> [Image [] ("", "")] -parPartToInlines opts docx (InternalHyperLink anchor runs) = - [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] -parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) = - case lookupRelationship relid rels of +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatMapM runToInlines runs + return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink relid runs) = do + (Docx _ _ _ rels _) <- asks docxDocument + rs <- concatMapM runToInlines runs + return $ case lookupRelationship relid rels of Just target -> - [Link (concatMap (runToInlines opts docx) runs) (target, "")] + [Link rs (target, "")] Nothing -> - [Link (concatMap (runToInlines opts docx) runs) ("", "")] + [Link rs ("", "")] isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (ident, classes, kvs) ils) = @@ -287,25 +315,18 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = _ -> h makeHeaderAnchors blk = blk -parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline] -parPartsToInlines opts docx parparts = - -- - -- We're going to skip data-uri's for now. It should be an option, - -- not mandatory. - -- - (if False -- TODO depend on option - then walk (makeImagesSelfContained docx) - else id) $ - -- bottomUp spanTrim $ - -- bottomUp spanCorrect $ - -- bottomUp spanReduce $ - reduceList $ concatMap (parPartToInlines opts docx) parparts - -cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block] -cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps - -rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] -rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do + ils <- concatMapM parPartToInlines parparts >>= + -- TODO: Option for self-containted images + (if False then (walkM makeImagesSelfContained) else return) + return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells blockCodeContainer :: Container Block -> Bool blockCodeContainer (Container f) = case f [] of @@ -313,27 +334,32 @@ blockCodeContainer (Container f) = case f [] of _ -> False blockCodeContainer _ = False -bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block] -bodyPartToBlocks _ _ (Paragraph pPr parparts) +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts) | any blockCodeContainer (parStyleToContainers pPr) = let otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr) in + return $ rebuild otherConts [CodeBlock ("", [], []) (concatMap parPartToString parparts)] -bodyPartToBlocks opts docx (Paragraph pPr parparts) = - case parPartsToInlines opts docx parparts of - [] -> - [] - _ -> - let parContents = parPartsToInlines opts docx parparts - trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents - in +bodyPartToBlocks (Paragraph pPr parparts) = do + ils <- parPartsToInlines parparts + case ils of + [] -> return [] + _ -> do + parContents <- parPartsToInlines parparts + let trimmedContents = reverse $ + dropWhile (Space ==) $ + reverse $ + dropWhile (Space ==) parContents + return $ rebuild (parStyleToContainers pPr) [Para trimmedContents] -bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) = +bodyPartToBlocks (ListItem pPr numId lvl parparts) = do + (Docx _ _ numbering _ _) <- asks docxDocument let kvs = case lookupLevel numId lvl numbering of Just (_, fmt, txt, Just start) -> [ ("level", lvl) @@ -349,23 +375,22 @@ bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parp , ("text", txt) ] Nothing -> [] - in - [Div - ("", ["list-item"], kvs) - (bodyPartToBlocks opts docx (Paragraph pPr parparts))] -bodyPartToBlocks _ _ (Tbl _ _ _ []) = - [Para []] -bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) = + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = + return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do let caption = strToInlines cap (hdr, rows) = case firstRowFormatting look of True -> (Just r, rs) False -> (Nothing, r:rs) - hdrCells = case hdr of - Just r' -> rowToBlocksList opts docx r' - Nothing -> [] - cells = map (rowToBlocksList opts docx) rows + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] + + cells <- mapM rowToBlocksList rows - size = case null hdrCells of + let size = case null hdrCells of True -> length $ head cells False -> length $ hdrCells -- @@ -374,34 +399,42 @@ bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) = -- moment. Width information is in the TblGrid field of the Tbl, -- so should be possible. Alignment might be more difficult, -- since there doesn't seem to be a column entity in docx. - alignments = take size (repeat AlignDefault) - widths = take size (repeat 0) :: [Double] - in - [Table caption alignments widths hdrCells cells] - - -makeImagesSelfContained :: Docx -> Inline -> Inline -makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) = - case lookup uri media of - Just bs -> case getMimeType uri of - Just mime -> let data_uri = - "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) - in - Image alt (data_uri, title) - Nothing -> i + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return [Table caption alignments widths hdrCells cells] + + +makeImagesSelfContained :: Inline -> DocxContext Inline +makeImagesSelfContained i@(Image alt (uri, title)) = do + (Docx _ _ _ _ media) <- asks docxDocument + return $ case lookup uri media of + Just bs -> + case getMimeType uri of + Just mime -> + let data_uri = "data:" ++ mime ++ ";base64," ++ + toString (encode $ BS.concat $ B.toChunks bs) + in + Image alt (data_uri, title) + Nothing -> i Nothing -> i -makeImagesSelfContained _ inline = inline +makeImagesSelfContained inline = return inline -bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] -bodyToBlocks opts docx (Body bps) = - map (makeHeaderAnchors) $ - blocksToDefinitions $ - blocksToBullets $ - concatMap (bodyPartToBlocks opts docx) bps +bodyToBlocks :: Body -> DocxContext [Block] +bodyToBlocks (Body bps) = do + blks <- concatMapM bodyPartToBlocks bps + return $ + map (makeHeaderAnchors) $ + blocksToDefinitions $ + blocksToBullets $ blks docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body - +docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = + let dState = DState { docxHdrLinks = M.empty } + dEnv = DEnv { docxOptions = opts + , docxDocument = d} + in + evalDocxContext (bodyToBlocks body) dEnv dState ilToCode :: Inline -> String ilToCode (Str s) = s -- cgit v1.2.3 From db187348cd8bb17ce66d2d4c1db6a5ff46a1ffbc Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Thu, 26 Jun 2014 23:10:11 -0400 Subject: Docx rdr: Avoid mapping makeHeaderAnchors globally It only applies to headers, so we can just apply it when we make a header. --- src/Text/Pandoc/Readers/Docx.hs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 5773027f2..42352a845 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -152,6 +152,7 @@ divAttrToContainers (c:cs) _ | isJust (isHeaderClass c) = let n = fromJust (isHeaderClass c) in [(Container $ \blks -> + makeHeaderAnchor $ Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))] divAttrToContainers (c:cs) kvs | c `elem` divsToKeep = (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs) @@ -304,8 +305,8 @@ isAnchorSpan _ = False dummyAnchors :: [String] dummyAnchors = ["_GoBack"] -makeHeaderAnchors :: Block -> Block -makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = +makeHeaderAnchor :: Block -> Block +makeHeaderAnchor h@(Header n (_, classes, kvs) ils) = case filter isAnchorSpan ils of [] -> h (x@(Span (ident, _, _) _) : xs) -> @@ -313,7 +314,7 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = True -> h False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) _ -> h -makeHeaderAnchors blk = blk +makeHeaderAnchor blk = blk parPartsToInlines :: [ParPart] -> DocxContext [Inline] parPartsToInlines parparts = do @@ -424,7 +425,6 @@ bodyToBlocks :: Body -> DocxContext [Block] bodyToBlocks (Body bps) = do blks <- concatMapM bodyPartToBlocks bps return $ - map (makeHeaderAnchors) $ blocksToDefinitions $ blocksToBullets $ blks -- cgit v1.2.3 From ab76bbebbe7afd3acdf3218b88f02482c885cc87 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Fri, 27 Jun 2014 11:35:50 -0400 Subject: Docx Reader: Clean up guards Use PatternGuards to get rid of need for `isJust`, `fromJust` altogether. --- src/Text/Pandoc/Readers/Docx.hs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 42352a845..0c52b1acb 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE PatternGuards #-} + {- Copyright (C) 2014 Jesse Rosenthal @@ -82,7 +84,7 @@ import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists import Text.Pandoc.Readers.Docx.Reducible -import Data.Maybe (mapMaybe, isJust, fromJust) +import Data.Maybe (mapMaybe) import Data.List (delete, isPrefixOf, (\\)) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B @@ -148,12 +150,10 @@ runStyleToContainers rPr = divAttrToContainers :: [String] -> [(String, String)] -> [Container Block] -divAttrToContainers (c:cs) _ | isJust (isHeaderClass c) = - let n = fromJust (isHeaderClass c) - in - [(Container $ \blks -> - makeHeaderAnchor $ - Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))] +divAttrToContainers (c:cs) _ | Just n <- isHeaderClass c = + [(Container $ \blks -> + makeHeaderAnchor $ + Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))] divAttrToContainers (c:cs) kvs | c `elem` divsToKeep = (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs) divAttrToContainers (c:cs) kvs | c `elem` codeDivs = @@ -167,10 +167,10 @@ divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs = divAttrToContainers (c:cs) kvs | c `elem` blockQuoteDivs = (Container BlockQuote) : (divAttrToContainers (cs \\ blockQuoteDivs) kvs) divAttrToContainers (_:cs) kvs = divAttrToContainers cs kvs -divAttrToContainers [] kvs | isJust (lookup "indent" kvs) = +divAttrToContainers [] kvs | Just numString <- lookup "indent" kvs = let kvs' = filter (\(k,_) -> k /= "indent") kvs in - case fromJust (lookup "indent" kvs) of + case numString of "0" -> divAttrToContainers [] kvs' ('-' : _) -> divAttrToContainers [] kvs' _ -> (Container BlockQuote) : divAttrToContainers [] kvs' -- cgit v1.2.3 From 1de8d4d08788ef24f69f9f90266604854996080e Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Fri, 27 Jun 2014 11:45:06 -0400 Subject: Docx Reader: Simplify makeHeaderAnchors Using pattern guard, in preparation for doing some more complicated stuff with it (recording header anchors, so we can change them to auto ids.) --- src/Text/Pandoc/Readers/Docx.hs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 0c52b1acb..9aaf1d340 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -306,14 +306,11 @@ dummyAnchors :: [String] dummyAnchors = ["_GoBack"] makeHeaderAnchor :: Block -> Block -makeHeaderAnchor h@(Header n (_, classes, kvs) ils) = - case filter isAnchorSpan ils of - [] -> h - (x@(Span (ident, _, _) _) : xs) -> - case ident `elem` dummyAnchors of - True -> h - False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) - _ -> h +makeHeaderAnchor (Header n (_, classes, kvs) ils) + | (x : xs) <- filter isAnchorSpan ils + , (Span (ident, _, _) _) <- x + , notElem ident dummyAnchors = + Header n (ident, classes, kvs) (ils \\ (x:xs)) makeHeaderAnchor blk = blk parPartsToInlines :: [ParPart] -> DocxContext [Inline] -- cgit v1.2.3 From 5969baf5b97c0926384b1619be3c4be6d92b277b Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sat, 28 Jun 2014 02:47:40 -0400 Subject: Rewrote header generation. In preparation for auto ids. --- src/Text/Pandoc/Readers/Docx.hs | 50 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 9aaf1d340..bbe770f6e 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -84,6 +84,7 @@ import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Shared import Data.Maybe (mapMaybe) import Data.List (delete, isPrefixOf, (\\)) import qualified Data.ByteString as BS @@ -151,9 +152,8 @@ runStyleToContainers rPr = divAttrToContainers :: [String] -> [(String, String)] -> [Container Block] divAttrToContainers (c:cs) _ | Just n <- isHeaderClass c = - [(Container $ \blks -> - makeHeaderAnchor $ - Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))] + [Container $ \_ -> + Header n ("", delete ("Heading" ++ show n) cs, []) []] divAttrToContainers (c:cs) kvs | c `elem` divsToKeep = (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs) divAttrToContainers (c:cs) kvs | c `elem` codeDivs = @@ -305,13 +305,14 @@ isAnchorSpan _ = False dummyAnchors :: [String] dummyAnchors = ["_GoBack"] -makeHeaderAnchor :: Block -> Block +makeHeaderAnchor :: Block -> DocxContext Block makeHeaderAnchor (Header n (_, classes, kvs) ils) | (x : xs) <- filter isAnchorSpan ils , (Span (ident, _, _) _) <- x , notElem ident dummyAnchors = - Header n (ident, classes, kvs) (ils \\ (x:xs)) -makeHeaderAnchor blk = blk + return $ Header n (ident, classes, kvs) (ils \\ (x:xs)) +makeHeaderAnchor blk = return blk + parPartsToInlines :: [ParPart] -> DocxContext [Inline] parPartsToInlines parparts = do @@ -326,36 +327,40 @@ cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps rowToBlocksList :: Row -> DocxContext [[Block]] rowToBlocksList (Row cells) = mapM cellToBlocks cells -blockCodeContainer :: Container Block -> Bool -blockCodeContainer (Container f) = case f [] of - CodeBlock _ _ -> True - _ -> False -blockCodeContainer _ = False +isBlockCodeContainer :: Container Block -> Bool +isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True +isBlockCodeContainer _ = False + +isHeaderContainer :: Container Block -> Bool +isHeaderContainer (Container f) | Header _ _ _ <- f [] = True +isHeaderContainer _ = False bodyPartToBlocks :: BodyPart -> DocxContext [Block] bodyPartToBlocks (Paragraph pPr parparts) - | any blockCodeContainer (parStyleToContainers pPr) = + | any isBlockCodeContainer (parStyleToContainers pPr) = let - otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr) + otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr) in return $ rebuild otherConts [CodeBlock ("", [], []) (concatMap parPartToString parparts)] +bodyPartToBlocks (Paragraph pPr parparts) + | any isHeaderContainer (parStyleToContainers pPr) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr) + Header n attr _ = hdrFun [] + hdr <- makeHeaderAnchor $ Header n attr ils + return [hdr] bodyPartToBlocks (Paragraph pPr parparts) = do - ils <- parPartsToInlines parparts + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) case ils of [] -> return [] _ -> do - parContents <- parPartsToInlines parparts - let trimmedContents = reverse $ - dropWhile (Space ==) $ - reverse $ - dropWhile (Space ==) parContents return $ rebuild (parStyleToContainers pPr) - [Para trimmedContents] + [Para ils] bodyPartToBlocks (ListItem pPr numId lvl parparts) = do (Docx _ _ numbering _ _) <- asks docxDocument let @@ -446,8 +451,3 @@ isHeaderClass s | "Heading" `isPrefixOf` s = ((n, "") : []) -> Just n _ -> Nothing isHeaderClass _ = Nothing - -blksToInlines :: [Block] -> [Inline] -blksToInlines (Para ils : _) = ils -blksToInlines (Plain ils : _) = ils -blksToInlines _ = [] -- cgit v1.2.3 From b89a3ba2b1069205a308ad0f444457d595e5a77f Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sat, 28 Jun 2014 03:04:34 -0400 Subject: make makeHeaderAnchors make an auto id Record relationship between original id and auto id, so we can fix links after. --- src/Text/Pandoc/Readers/Docx.hs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index bbe770f6e..a3053b72a 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -104,7 +104,7 @@ readDocx opts bytes = Nothing -> error $ "couldn't parse docx file" -data DState = DState { docxHdrLinks :: M.Map String String } +data DState = DState { docxHeaderAnchors :: M.Map String String } data DEnv = DEnv { docxOptions :: ReaderOptions , docxDocument :: Docx} @@ -310,7 +310,11 @@ makeHeaderAnchor (Header n (_, classes, kvs) ils) | (x : xs) <- filter isAnchorSpan ils , (Span (ident, _, _) _) <- x , notElem ident dummyAnchors = - return $ Header n (ident, classes, kvs) (ils \\ (x:xs)) + do + hdrIDMap <- gets docxHeaderAnchors + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + put DState{docxHeaderAnchors = M.insert ident newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) makeHeaderAnchor blk = return blk @@ -432,7 +436,7 @@ bodyToBlocks (Body bps) = do docxToBlocks :: ReaderOptions -> Docx -> [Block] docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = - let dState = DState { docxHdrLinks = M.empty } + let dState = DState { docxHeaderAnchors = M.empty } dEnv = DEnv { docxOptions = opts , docxDocument = d} in -- cgit v1.2.3 From dce360e1e6fee089e849c07785d8e21961fefb9b Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sat, 28 Jun 2014 03:54:58 -0400 Subject: Docx Reader: Introduce link rewriting. --- src/Text/Pandoc/Readers/Docx.hs | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index a3053b72a..0607aac7f 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -104,7 +104,7 @@ readDocx opts bytes = Nothing -> error $ "couldn't parse docx file" -data DState = DState { docxHeaderAnchors :: M.Map String String } +data DState = DState { docxAnchorMap :: M.Map String String } data DEnv = DEnv { docxOptions :: ReaderOptions , docxDocument :: Docx} @@ -276,7 +276,22 @@ parPartToInlines (Deletion _ author date runs) = do ("", ["deletion"], [("author", author), ("date", date)]) ils] parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] -parPartToInlines (BookMark _ anchor) = return [Span (anchor, ["anchor"], []) []] +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- Check to see if the id is already in there. Rewrite if + -- necessary. This will have the possible effect of rewriting + -- user-defined anchor links. However, since these are not defined + -- in pandoc, it seems like a necessary evil to avoid an extra + -- pass. + let newAnchor = case anchor `elem` (M.elems anchorMap) of + True -> uniqueIdent [Str anchor] (M.elems anchorMap) + False -> anchor + put DState{ docxAnchorMap = M.insert anchor newAnchor anchorMap} + return [Span (anchor, ["anchor"], []) []] parPartToInlines (Drawing relid) = do (Docx _ _ _ rels _) <- asks docxDocument return $ case lookupRelationship relid rels of @@ -311,9 +326,9 @@ makeHeaderAnchor (Header n (_, classes, kvs) ils) , (Span (ident, _, _) _) <- x , notElem ident dummyAnchors = do - hdrIDMap <- gets docxHeaderAnchors + hdrIDMap <- gets docxAnchorMap let newIdent = uniqueIdent ils (M.elems hdrIDMap) - put DState{docxHeaderAnchors = M.insert ident newIdent hdrIDMap} + put DState{docxAnchorMap = M.insert ident newIdent hdrIDMap} return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) makeHeaderAnchor blk = return blk @@ -411,6 +426,14 @@ bodyPartToBlocks (Tbl cap _ look (r:rs)) = do return [Table caption alignments widths hdrCells cells] +-- replace targets with generated anchors. +rewriteLink :: Inline -> DocxContext Inline +rewriteLink l@(Link ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link ils ('#':newTarget, title)) + Nothing -> l +rewriteLink il = return il makeImagesSelfContained :: Inline -> DocxContext Inline makeImagesSelfContained i@(Image alt (uri, title)) = do @@ -429,14 +452,15 @@ makeImagesSelfContained inline = return inline bodyToBlocks :: Body -> DocxContext [Block] bodyToBlocks (Body bps) = do - blks <- concatMapM bodyPartToBlocks bps + blks <- concatMapM bodyPartToBlocks bps >>= + walkM rewriteLink return $ blocksToDefinitions $ blocksToBullets $ blks docxToBlocks :: ReaderOptions -> Docx -> [Block] docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = - let dState = DState { docxHeaderAnchors = M.empty } + let dState = DState { docxAnchorMap = M.empty } dEnv = DEnv { docxOptions = opts , docxDocument = d} in @@ -447,7 +471,6 @@ ilToCode (Str s) = s ilToCode Space = " " ilToCode _ = "" - isHeaderClass :: String -> Maybe Int isHeaderClass s | "Heading" `isPrefixOf` s = case reads (drop (length "Heading") s) :: [(Int, String)] of -- cgit v1.2.3