diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx/Parse.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 526 |
1 files changed, 333 insertions, 193 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index deb2caccf..1f7f07e36 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,7 +1,9 @@ -{-# LANGUAGE PatternGuards, ViewPatterns, FlexibleInstances #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE ViewPatterns #-} {- -Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> +Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +22,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Docx.Parse - Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal License : GNU GPL, version 2 or above Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> @@ -49,28 +51,34 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ParagraphStyle(..) , Row(..) , Cell(..) + , TrackedChange(..) + , ChangeType(..) + , ChangeInfo(..) + , FieldInfo(..) , archiveToDocx , archiveToDocxWithWarnings ) where import Codec.Archive.Zip -import Text.XML.Light -import Data.Maybe -import Data.List -import System.FilePath +import Control.Applicative ((<|>)) +import Control.Monad.Except +import Control.Monad.Reader +import Control.Monad.State.Strict import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B -import qualified Text.Pandoc.UTF8 as UTF8 -import Control.Monad.Reader -import Control.Monad.State -import Control.Applicative ((<|>)) +import Data.Char (chr, ord, readLitChar) +import Data.List import qualified Data.Map as M -import Control.Monad.Except -import Text.Pandoc.Shared (safeRead, filteredFilesFromArchive) -import Text.TeXMath.Readers.OMML (readOMML) -import Text.TeXMath.Unicode.Fonts (getUnicode, stringToFont, Font(..)) -import Text.TeXMath (Exp) +import Data.Maybe +import System.FilePath import Text.Pandoc.Readers.Docx.Util -import Data.Char (readLitChar, ord, chr, isDigit) +import Text.Pandoc.Readers.Docx.Fields +import Text.Pandoc.Shared (filteredFilesFromArchive, safeRead) +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.TeXMath (Exp) +import Text.TeXMath.Readers.OMML (readOMML) +import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, stringToFont) +import Text.XML.Light +import qualified Text.XML.Light.Cursor as XMLC data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -84,10 +92,19 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes } deriving Show -data ReaderState = ReaderState { stateWarnings :: [String] } +data ReaderState = ReaderState { stateWarnings :: [String] + , stateFldCharState :: FldCharState + } deriving Show -data DocxError = DocxError | WrongElem +data FldCharState = FldCharOpen + | FldCharFieldInfo FieldInfo + | FldCharContent FieldInfo [Run] + | FldCharClosed + deriving (Show) + +data DocxError = DocxError + | WrongElem deriving Show type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState)) @@ -97,7 +114,7 @@ runD dx re rs = runState (runReaderT (runExceptT dx) re) rs maybeToD :: Maybe a -> D a maybeToD (Just a) = return a -maybeToD Nothing = throwError DocxError +maybeToD Nothing = throwError DocxError eitherToD :: Either a b -> D b eitherToD (Right b) = return b @@ -115,6 +132,36 @@ mapD f xs = in concatMapM handler xs +unwrapSDT :: NameSpaces -> Content -> [Content] +unwrapSDT ns (Elem element) + | isElem ns "w" "sdt" element + , Just sdtContent <- findChildByName ns "w" "sdtContent" element + = concatMap (unwrapSDT ns) $ map Elem $ elChildren sdtContent +unwrapSDT _ content = [content] + +unwrapSDTchild :: NameSpaces -> Content -> Content +unwrapSDTchild ns (Elem element) = + Elem $ element { elContent = concatMap (unwrapSDT ns) (elContent element) } +unwrapSDTchild _ content = content + +walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor +walkDocument' ns cur = + let modifiedCur = XMLC.modifyContent (unwrapSDTchild ns) cur + in + case XMLC.nextDF modifiedCur of + Just cur' -> walkDocument' ns cur' + Nothing -> XMLC.root modifiedCur + +walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument ns element = + let cur = XMLC.fromContent (Elem element) + cur' = walkDocument' ns cur + in + case XMLC.toTree cur' of + Elem element' -> Just element' + _ -> Nothing + + data Docx = Docx Document deriving Show @@ -160,17 +207,27 @@ data Notes = Notes NameSpaces data Comments = Comments NameSpaces (M.Map String Element) deriving Show -data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer - , rightParIndent :: Maybe Integer +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer , hangingParIndent :: Maybe Integer} deriving Show -data ParagraphStyle = ParagraphStyle { pStyle :: [String] +data ChangeType = Insertion | Deletion + deriving Show + +data ChangeInfo = ChangeInfo ChangeId Author ChangeDate + deriving Show + +data TrackedChange = TrackedChange ChangeType ChangeInfo + deriving Show + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] , indentation :: Maybe ParIndentation , dropCap :: Bool , pHeading :: Maybe (String, Int) , pNumInfo :: Maybe (String, String) , pBlockQuote :: Maybe Bool + , pChange :: Maybe TrackedChange } deriving Show @@ -181,6 +238,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] , pHeading = Nothing , pNumInfo = Nothing , pBlockQuote = Nothing + , pChange = Nothing } @@ -208,8 +266,7 @@ data Cell = Cell [BodyPart] type Extent = Maybe (Double, Double) data ParPart = PlainRun Run - | Insertion ChangeId Author ChangeDate [Run] - | Deletion ChangeId Author ChangeDate [Run] + | ChangedRuns TrackedChange [Run] | CommentStart CommentId Author CommentDate [BodyPart] | CommentEnd CommentId | BookMark BookMarkId Anchor @@ -218,6 +275,10 @@ data ParPart = PlainRun Run | Drawing FilePath String String B.ByteString Extent -- title, alt | Chart -- placeholder for now | PlainOMath [Exp] + | SmartTag [Run] + | Field FieldInfo [Run] + | NullParPart -- when we need to return nothing, but + -- not because of an error. deriving Show data Run = Run RunStyle [RunElem] @@ -233,19 +294,19 @@ data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen data VertAlign = BaseLn | SupScrpt | SubScrpt deriving Show -data RunStyle = RunStyle { isBold :: Maybe Bool - , isItalic :: Maybe Bool +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool , isSmallCaps :: Maybe Bool - , isStrike :: Maybe Bool - , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String - , rStyle :: Maybe CharStyle} + , isStrike :: Maybe Bool + , rVertAlign :: Maybe VertAlign + , rUnderline :: Maybe String + , rStyle :: Maybe CharStyle} deriving Show -data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) +data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) , isBlockQuote :: Maybe Bool - , numInfo :: Maybe (String, String) - , psStyle :: Maybe ParStyle} + , numInfo :: Maybe (String, String) + , psStyle :: Maybe ParStyle} deriving Show defaultRunStyle :: RunStyle @@ -281,7 +342,9 @@ archiveToDocxWithWarnings archive = do (styles, parstyles) = archiveToStyles archive rEnv = ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument - rState = ReaderState { stateWarnings = [] } + rState = ReaderState { stateWarnings = [] + , stateFldCharState = FldCharClosed + } (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState case eitherDoc of Right doc -> Right (Docx doc, stateWarnings st) @@ -294,14 +357,14 @@ archiveToDocument zf = do entry <- maybeToD $ findEntryByPath "word/document.xml" zf docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry let namespaces = elemToNameSpaces docElem - bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem - body <- elemToBody namespaces bodyElem + bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem + let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) + body <- elemToBody namespaces bodyElem' return $ Document namespaces body elemToBody :: NameSpaces -> Element -> D Body elemToBody ns element | isElem ns "w" "body" element = - mapD (elemToBodyPart ns) (elChildren element) >>= - (\bps -> return $ Body bps) + fmap Body (mapD (elemToBodyPart ns) (elChildren element)) elemToBody _ _ = throwError WrongElem archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) @@ -322,15 +385,15 @@ archiveToStyles zf = isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle | isElem ns "w" "style" element - , Just styleType <- findAttr (elemName ns "w" "type") element + , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle - , Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>= - findAttr (elemName ns "w" "val") - , Just ps <- parentStyle = (basedOnVal == getStyleId ps) + , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= + findAttrByName ns "w" "val" + , Just ps <- parentStyle = basedOnVal == getStyleId ps | isElem ns "w" "style" element - , Just styleType <- findAttr (elemName ns "w" "type") element + , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle - , Nothing <- findChild (elemName ns "w" "basedOn") element + , Nothing <- findChildByName ns "w" "basedOn" element , Nothing <- parentStyle = True | otherwise = False @@ -343,8 +406,8 @@ instance ElemToStyle CharStyle where cStyleType _ = "character" elemToStyle ns element parentStyle | isElem ns "w" "style" element - , Just "character" <- findAttr (elemName ns "w" "type") element - , Just styleId <- findAttr (elemName ns "w" "styleId") element = + , Just "character" <- findAttrByName ns "w" "type" element + , Just styleId <- findAttrByName ns "w" "styleId" element = Just (styleId, elemToRunStyle ns element parentStyle) | otherwise = Nothing getStyleId s = fst s @@ -353,8 +416,8 @@ instance ElemToStyle ParStyle where cStyleType _ = "paragraph" elemToStyle ns element parentStyle | isElem ns "w" "style" element - , Just "paragraph" <- findAttr (elemName ns "w" "type") element - , Just styleId <- findAttr (elemName ns "w" "styleId") element = + , Just "paragraph" <- findAttrByName ns "w" "type" element + , Just styleId <- findAttrByName ns "w" "styleId" element = Just (styleId, elemToParStyleData ns element parentStyle) | otherwise = Nothing getStyleId s = fst s @@ -368,10 +431,10 @@ getStyleChildren ns element parentStyle buildBasedOnList :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] buildBasedOnList ns element rootStyle = - case (getStyleChildren ns element rootStyle) of + case getStyleChildren ns element rootStyle of [] -> [] stys -> stys ++ - (concatMap (\s -> buildBasedOnList ns element (Just s)) stys) + concatMap (buildBasedOnList ns element . Just) stys archiveToNotes :: Archive -> Notes archiveToNotes zf = @@ -380,14 +443,14 @@ archiveToNotes zf = enElem = findEntryByPath "word/endnotes.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) fn_namespaces = case fnElem of - Just e -> elemToNameSpaces e + Just e -> elemToNameSpaces e Nothing -> [] en_namespaces = case enElem of - Just e -> elemToNameSpaces e + Just e -> elemToNameSpaces e Nothing -> [] ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= (elemToNotes ns "footnote") - en = enElem >>= (elemToNotes ns "endnote") + fn = fnElem >>= elemToNotes ns "footnote" + en = enElem >>= elemToNotes ns "endnote" in Notes ns fn en @@ -396,19 +459,19 @@ archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) cmts_namespaces = case cmtsElem of - Just e -> elemToNameSpaces e + Just e -> elemToNameSpaces e Nothing -> [] - cmts = (elemToComments cmts_namespaces) <$> cmtsElem + cmts = elemToComments cmts_namespaces <$> cmtsElem in case cmts of - Just c -> Comments cmts_namespaces c + Just c -> Comments cmts_namespaces c Nothing -> Comments cmts_namespaces M.empty filePathToRelType :: FilePath -> Maybe DocumentLocation -filePathToRelType "word/_rels/document.xml.rels" = Just InDocument +filePathToRelType "word/_rels/document.xml.rels" = Just InDocument filePathToRelType "word/_rels/footnotes.xml.rels" = Just InFootnote -filePathToRelType "word/_rels/endnotes.xml.rels" = Just InEndnote -filePathToRelType _ = Nothing +filePathToRelType "word/_rels/endnotes.xml.rels" = Just InEndnote +filePathToRelType _ = Nothing relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship relElemToRelationship relType element | qName (elName element) == "Relationship" = @@ -439,24 +502,23 @@ lookupLevel :: String -> String -> Numbering -> Maybe Level lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs - lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls - return lvl + lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | isElem ns "w" "num" element = do - numId <- findAttr (elemName ns "w" "numId") element - absNumId <- findChild (elemName ns "w" "abstractNumId") element - >>= findAttr (elemName ns "w" "val") + numId <- findAttrByName ns "w" "numId" element + absNumId <- findChildByName ns "w" "abstractNumId" element + >>= findAttrByName ns "w" "val" return $ Numb numId absNumId numElemToNum _ _ = Nothing absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb absNumElemToAbsNum ns element | isElem ns "w" "abstractNum" element = do - absNumId <- findAttr (elemName ns "w" "abstractNumId") element - let levelElems = findChildren (elemName ns "w" "lvl") element + absNumId <- findAttrByName ns "w" "abstractNumId" element + let levelElems = findChildrenByName ns "w" "lvl" element levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels absNumElemToAbsNum _ _ = Nothing @@ -464,26 +526,26 @@ absNumElemToAbsNum _ _ = Nothing levelElemToLevel :: NameSpaces -> Element -> Maybe Level levelElemToLevel ns element | isElem ns "w" "lvl" element = do - ilvl <- findAttr (elemName ns "w" "ilvl") element - fmt <- findChild (elemName ns "w" "numFmt") element - >>= findAttr (elemName ns "w" "val") - txt <- findChild (elemName ns "w" "lvlText") element - >>= findAttr (elemName ns "w" "val") - let start = findChild (elemName ns "w" "start") element - >>= findAttr (elemName ns "w" "val") + ilvl <- findAttrByName ns "w" "ilvl" element + fmt <- findChildByName ns "w" "numFmt" element + >>= findAttrByName ns "w" "val" + txt <- findChildByName ns "w" "lvlText" element + >>= findAttrByName ns "w" "val" + let start = findChildByName ns "w" "start" element + >>= findAttrByName ns "w" "val" >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) return (ilvl, fmt, txt, start) levelElemToLevel _ _ = Nothing archiveToNumbering' :: Archive -> Maybe Numbering -archiveToNumbering' zf = do +archiveToNumbering' zf = case findEntryByPath "word/numbering.xml" zf of Nothing -> Just $ Numbering [] [] [] Just entry -> do numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry let namespaces = elemToNameSpaces numberingElem - numElems = findChildren (elemName namespaces "w" "num") numberingElem - absNumElems = findChildren (elemName namespaces "w" "abstractNum") numberingElem + numElems = findChildrenByName namespaces "w" "num" numberingElem + absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem nums = mapMaybe (numElemToNum namespaces) numElems absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems return $ Numbering namespaces nums absNums @@ -496,22 +558,23 @@ elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element) elemToNotes ns notetype element | isElem ns "w" (notetype ++ "s") element = let pairs = mapMaybe - (\e -> findAttr (elemName ns "w" "id") e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) - (findChildren (elemName ns "w" notetype) element) + (findChildrenByName ns "w" notetype element) in - Just $ M.fromList $ pairs + Just $ + M.fromList pairs elemToNotes _ _ _ = Nothing elemToComments :: NameSpaces -> Element -> M.Map String Element elemToComments ns element | isElem ns "w" "comments" element = let pairs = mapMaybe - (\e -> findAttr (elemName ns "w" "id") e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) - (findChildren (elemName ns "w" "comment") element) + (findChildrenByName ns "w" "comment" element) in - M.fromList $ pairs + M.fromList pairs elemToComments _ _ = M.empty @@ -520,16 +583,16 @@ elemToComments _ _ = M.empty elemToTblGrid :: NameSpaces -> Element -> D TblGrid elemToTblGrid ns element | isElem ns "w" "tblGrid" element = - let cols = findChildren (elemName ns "w" "gridCol") element + let cols = findChildrenByName ns "w" "gridCol" element in - mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger)) + mapD (\e -> maybeToD (findAttrByName ns "w" "val" e >>= stringToInteger)) cols elemToTblGrid _ _ = throwError WrongElem elemToTblLook :: NameSpaces -> Element -> D TblLook elemToTblLook ns element | isElem ns "w" "tblLook" element = - let firstRow = findAttr (elemName ns "w" "firstRow") element - val = findAttr (elemName ns "w" "val") element + let firstRow = findAttrByName ns "w" "firstRow" element + val = findAttrByName ns "w" "val" element firstRowFmt = case firstRow of Just "1" -> True @@ -538,13 +601,13 @@ elemToTblLook ns element | isElem ns "w" "tblLook" element = Just bitMask -> testBitMask bitMask 0x020 Nothing -> False in - return $ TblLook{firstRowFormatting = firstRowFmt} + return TblLook{firstRowFormatting = firstRowFmt} elemToTblLook _ _ = throwError WrongElem elemToRow :: NameSpaces -> Element -> D Row elemToRow ns element | isElem ns "w" "tr" element = do - let cellElems = findChildren (elemName ns "w" "tc") element + let cellElems = findChildrenByName ns "w" "tc" element cells <- mapD (elemToCell ns) cellElems return $ Row cells elemToRow _ _ = throwError WrongElem @@ -558,15 +621,15 @@ elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation elemToParIndentation ns element | isElem ns "w" "ind" element = - Just $ ParIndentation { + Just ParIndentation { leftParIndent = - findAttr (elemName ns "w" "left") element >>= + findAttrByName ns "w" "left" element >>= stringToInteger , rightParIndent = - findAttr (elemName ns "w" "right") element >>= + findAttrByName ns "w" "right" element >>= stringToInteger , hangingParIndent = - findAttr (elemName ns "w" "hanging") element >>= + findAttrByName ns "w" "hanging" element >>= stringToInteger} elemToParIndentation _ _ = Nothing @@ -574,7 +637,7 @@ testBitMask :: String -> Int -> Bool testBitMask bitMaskS n = case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of [] -> False - ((n', _) : _) -> ((n' .|. n) /= 0) + ((n', _) : _) -> (n' .|. n) /= 0 stringToInteger :: String -> Maybe Integer stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) @@ -582,7 +645,7 @@ stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element - , (c:_) <- findChildren (elemName ns "m" "oMathPara") element = + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do expsLst <- eitherToD $ readOMML $ showElement c return $ OMathPara expsLst @@ -610,17 +673,17 @@ elemToBodyPart ns element _ -> return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do - let caption' = findChild (elemName ns "w" "tblPr") element - >>= findChild (elemName ns "w" "tblCaption") - >>= findAttr (elemName ns "w" "val") + let caption' = findChildByName ns "w" "tblPr" element + >>= findChildByName ns "w" "tblCaption" + >>= findAttrByName ns "w" "val" caption = (fromMaybe "" caption') - grid' = case findChild (elemName ns "w" "tblGrid") element of - Just g -> elemToTblGrid ns g + grid' = case findChildByName ns "w" "tblGrid" element of + Just g -> elemToTblGrid ns g Nothing -> return [] - tblLook' = case findChild (elemName ns "w" "tblPr") element >>= - findChild (elemName ns "w" "tblLook") + tblLook' = case findChildByName ns "w" "tblPr" element >>= + findChildByName ns "w" "tblLook" of - Just l -> elemToTblLook ns l + Just l -> elemToTblLook ns l Nothing -> return defaultTblLook grid <- grid' @@ -649,26 +712,22 @@ expandDrawingId s = do getTitleAndAlt :: NameSpaces -> Element -> (String, String) getTitleAndAlt ns element = - let mbDocPr = findChild (elemName ns "wp" "inline") element >>= - findChild (elemName ns "wp" "docPr") - title = case mbDocPr >>= findAttr (elemName ns "" "title") of - Just title' -> title' - Nothing -> "" - alt = case mbDocPr >>= findAttr (elemName ns "" "descr") of - Just alt' -> alt' - Nothing -> "" + let mbDocPr = findChildByName ns "wp" "inline" element >>= + findChildByName ns "wp" "docPr" + title = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "title") + alt = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "descr") in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart elemToParPart ns element | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element + , Just drawingElem <- findChildByName ns "w" "drawing" element , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem = let (title, alt) = getTitleAndAlt ns drawingElem a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttr (elemName ns "r" "embed") + >>= findAttrByName ns "r" "embed" in case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) @@ -676,9 +735,9 @@ elemToParPart ns element -- The below is an attempt to deal with images in deprecated vml format. elemToParPart ns element | isElem ns "w" "r" element - , Just _ <- findChild (elemName ns "w" "pict") element = + , Just _ <- findChildByName ns "w" "pict" element = let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttr (elemName ns "r" "id") + >>= findAttrByName ns "r" "id" in case drawing of -- Todo: check out title and attr for deprecated format. @@ -687,86 +746,148 @@ elemToParPart ns element -- Chart elemToParPart ns element | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element + , Just drawingElem <- findChildByName ns "w" "drawing" element , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem - = return Chart + = return Chart +{- +The next one is a bit complicated. fldChar fields work by first +having a <w:fldChar fldCharType="begin"> in a run, then a run with +<w:instrText>, then a <w:fldChar fldCharType="separate"> run, then the +content runs, and finally a <w:fldChar fldCharType="end"> run. For +example (omissions and my comments in brackets): + + <w:r> + [...] + <w:fldChar w:fldCharType="begin"/> + </w:r> + <w:r> + [...] + <w:instrText xml:space="preserve"> HYPERLINK [hyperlink url] </w:instrText> + </w:r> + <w:r> + [...] + <w:fldChar w:fldCharType="separate"/> + </w:r> + <w:r w:rsidRPr=[...]> + [...] + <w:t>Foundations of Analysis, 2nd Edition</w:t> + </w:r> + <w:r> + [...] + <w:fldChar w:fldCharType="end"/> + </w:r> + +So we do this in a number of steps. If we encounter the fldchar begin +tag, we start open a fldchar state variable (see state above). We add +the instrtext to it as FieldInfo. Then we close that and start adding +the runs when we get to separate. Then when we get to end, we produce +the Field type with approriate FieldInfo and Runs. +-} elemToParPart ns element - | isElem ns "w" "r" element = - elemToRun ns element >>= (\r -> return $ PlainRun r) + | isElem ns "w" "r" element + , Just fldChar <- findChildByName ns "w" "fldChar" element + , Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharClosed | fldCharType == "begin" -> do + modify $ \st -> st {stateFldCharState = FldCharOpen} + return NullParPart + FldCharFieldInfo info | fldCharType == "separate" -> do + modify $ \st -> st {stateFldCharState = FldCharContent info []} + return NullParPart + FldCharContent info runs | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = FldCharClosed} + return $ Field info $ reverse runs + _ -> throwError WrongElem elemToParPart ns element - | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element - , Just cId <- findAttr (elemName ns "w" "id") element - , Just cAuthor <- findAttr (elemName ns "w" "author") element - , Just cDate <- findAttr (elemName ns "w" "date") element = do - runs <- mapD (elemToRun ns) (elChildren element) - return $ Insertion cId cAuthor cDate runs + | isElem ns "w" "r" element + , Just instrText <- findChildByName ns "w" "instrText" element = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharOpen -> do + info <- eitherToD $ parseFieldInfo $ strContent instrText + modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} + return NullParPart + _ -> return NullParPart elemToParPart ns element - | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element - , Just cId <- findAttr (elemName ns "w" "id") element - , Just cAuthor <- findAttr (elemName ns "w" "author") element - , Just cDate <- findAttr (elemName ns "w" "date") element = do + | isElem ns "w" "r" element = do + run <- elemToRun ns element + -- we check to see if we have an open FldChar in state that we're + -- recording. + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharContent info runs -> do + modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)} + return NullParPart + _ -> return $ PlainRun run +elemToParPart ns element + | Just change <- getTrackedChange ns element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ ChangedRuns change runs +elemToParPart ns element + | isElem ns "w" "smartTag" element = do runs <- mapD (elemToRun ns) (elChildren element) - return $ Deletion cId cAuthor cDate runs + return $ SmartTag runs elemToParPart ns element | isElem ns "w" "bookmarkStart" element - , Just bmId <- findAttr (elemName ns "w" "id") element - , Just bmName <- findAttr (elemName ns "w" "name") element = + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttr (elemName ns "r" "id") element = do + , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of - Just target -> do - case findAttr (elemName ns "w" "anchor") element of + Just target -> + case findAttrByName ns "w" "anchor" element of Just anchor -> return $ ExternalHyperLink (target ++ '#':anchor) runs Nothing -> return $ ExternalHyperLink target runs Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttr (elemName ns "w" "anchor") element = do + , Just anchor <- findAttrByName ns "w" "anchor" element = do runs <- mapD (elemToRun ns) (elChildren element) return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "w" "commentRangeStart" element - , Just cmtId <- findAttr (elemName ns "w" "id") element = do + , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem - Nothing -> throwError WrongElem + Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "commentRangeEnd" element - , Just cmtId <- findAttr (elemName ns "w" "id") element = + , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId elemToParPart ns element | isElem ns "m" "oMath" element = - (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath) + fmap PlainOMath (eitherToD $ readOMML $ showElement element) elemToParPart _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element | isElem ns "w" "comment" element - , Just cmtId <- findAttr (elemName ns "w" "id") element - , Just cmtAuthor <- findAttr (elemName ns "w" "author") element - , Just cmtDate <- findAttr (elemName ns "w" "date") element = do + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , Just cmtDate <- findAttrByName ns "w" "date" element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps elemToCommentStart _ _ = throwError WrongElem lookupFootnote :: String -> Notes -> Maybe Element -lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) +lookupFootnote s (Notes _ fns _) = fns >>= M.lookup s lookupEndnote :: String -> Notes -> Maybe Element -lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s) +lookupEndnote s (Notes _ _ ens) = ens >>= M.lookup s elemToExtent :: Element -> Extent elemToExtent drawingElem = case (getDim "cx", getDim "cy") of (Just w, Just h) -> Just (w, h) - _ -> Nothing + _ -> Nothing where wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem @@ -794,7 +915,7 @@ childElemToRun ns element = return InlineChart childElemToRun ns element | isElem ns "w" "footnoteReference" element - , Just fnId <- findAttr (elemName ns "w" "id") element = do + , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -802,7 +923,7 @@ childElemToRun ns element Nothing -> return $ Footnote [] childElemToRun ns element | isElem ns "w" "endnoteReference" element - , Just enId <- findAttr (elemName ns "w" "id") element = do + , Just enId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -813,8 +934,8 @@ childElemToRun _ _ = throwError WrongElem elemToRun :: NameSpaces -> Element -> D Run elemToRun ns element | isElem ns "w" "r" element - , Just altCont <- findChild (elemName ns "mc" "AlternateContent") element = - do let choices = findChildren (elemName ns "mc" "Choice") altCont + , Just altCont <- findChildByName ns "mc" "AlternateContent" element = + do let choices = findChildrenByName ns "mc" "Choice" altCont choiceChildren = map head $ filter (not . null) $ map elChildren choices outputs <- mapD (childElemToRun ns) choiceChildren case outputs of @@ -822,15 +943,15 @@ elemToRun ns element [] -> throwError WrongElem elemToRun ns element | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + , Just drawingElem <- findChildByName ns "w" "drawing" element = childElemToRun ns drawingElem elemToRun ns element | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "footnoteReference") element = + , Just ref <- findChildByName ns "w" "footnoteReference" element = childElemToRun ns ref elemToRun ns element | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "endnoteReference") element = + , Just ref <- findChildByName ns "w" "endnoteReference" element = childElemToRun ns ref elemToRun ns element | isElem ns "w" "r" element = do @@ -854,22 +975,37 @@ getParStyleField field stylemap styles = Just y getParStyleField _ _ _ = Nothing +getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange +getTrackedChange ns element + | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , Just cDate <- findAttrByName ns "w" "date" element = + Just $ TrackedChange Insertion (ChangeInfo cId cAuthor cDate) +getTrackedChange ns element + | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , Just cDate <- findAttrByName ns "w" "date" element = + Just $ TrackedChange Deletion (ChangeInfo cId cAuthor cDate) +getTrackedChange _ _ = Nothing + elemToParagraphStyle :: NameSpaces -> Element -> ParStyleMap -> ParagraphStyle elemToParagraphStyle ns element sty - | Just pPr <- findChild (elemName ns "w" "pPr") element = + | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (findAttr (elemName ns "w" "val")) - (findChildren (elemName ns "w" "pStyle") pPr) + (findAttrByName ns "w" "val") + (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle {pStyle = style , indentation = - findChild (elemName ns "w" "ind") pPr >>= + findChildByName ns "w" "ind" pPr >>= elemToParIndentation ns , dropCap = case - findChild (elemName ns "w" "framePr") pPr >>= - findAttr (elemName ns "w" "dropCap") + findChildByName ns "w" "framePr" pPr >>= + findAttrByName ns "w" "dropCap" of Just "none" -> False Just _ -> True @@ -877,13 +1013,20 @@ elemToParagraphStyle ns element sty , pHeading = getParStyleField headingLev sty style , pNumInfo = getParStyleField numInfo sty style , pBlockQuote = getParStyleField isBlockQuote sty style + , pChange = findChildByName ns "w" "rPr" pPr >>= + filterChild (\e -> isElem ns "w" "ins" e || + isElem ns "w" "moveTo" e || + isElem ns "w" "del" e || + isElem ns "w" "moveFrom" e + ) >>= + getTrackedChange ns } elemToParagraphStyle _ _ _ = defaultParagraphStyle checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag | Just t <- findChild tag rPr - , Just val <- findAttr (elemName ns "w" "val") t = + , Just val <- findAttrByName ns "w" "val" t = Just $ case val of "true" -> True "false" -> False @@ -897,11 +1040,11 @@ checkOnOff _ _ _ = Nothing elemToRunStyleD :: NameSpaces -> Element -> D RunStyle elemToRunStyleD ns element - | Just rPr <- findChild (elemName ns "w" "rPr") element = do + | Just rPr <- findChildByName ns "w" "rPr" element = do charStyles <- asks envCharStyles let parentSty = case - findChild (elemName ns "w" "rStyle") rPr >>= - findAttr (elemName ns "w" "val") + findChildByName ns "w" "rStyle" rPr >>= + findAttrByName ns "w" "val" of Just styName | Just style <- M.lookup styName charStyles -> Just (styName, style) @@ -911,7 +1054,7 @@ elemToRunStyleD _ _ = return defaultRunStyle elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle elemToRunStyle ns element parentStyle - | Just rPr <- findChild (elemName ns "w" "rPr") element = + | Just rPr <- findChildByName ns "w" "rPr" element = RunStyle { isBold = checkOnOff ns rPr (elemName ns "w" "b") @@ -919,32 +1062,31 @@ elemToRunStyle ns element parentStyle , isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps") , isStrike = checkOnOff ns rPr (elemName ns "w" "strike") , rVertAlign = - findChild (elemName ns "w" "vertAlign") rPr >>= - findAttr (elemName ns "w" "val") >>= + findChildByName ns "w" "vertAlign" rPr >>= + findAttrByName ns "w" "val" >>= \v -> Just $ case v of "superscript" -> SupScrpt "subscript" -> SubScrpt _ -> BaseLn , rUnderline = - findChild (elemName ns "w" "u") rPr >>= - findAttr (elemName ns "w" "val") + findChildByName ns "w" "u" rPr >>= + findAttrByName ns "w" "val" , rStyle = parentStyle } elemToRunStyle _ _ _ = defaultRunStyle -isNumericNotNull :: String -> Bool -isNumericNotNull str = (str /= []) && (all isDigit str) - getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) getHeaderLevel ns element - | Just styleId <- findAttr (elemName ns "w" "styleId") element + | Just styleId <- findAttrByName ns "w" "styleId" element , Just index <- stripPrefix "Heading" styleId - , isNumericNotNull index = Just (styleId, read index) - | Just styleId <- findAttr (elemName ns "w" "styleId") element - , Just index <- findChild (elemName ns "w" "name") element >>= - findAttr (elemName ns "w" "val") >>= + , Just n <- stringToInteger index + , n > 0 = Just (styleId, fromInteger n) + | Just styleId <- findAttrByName ns "w" "styleId" element + , Just index <- findChildByName ns "w" "name" element >>= + findAttrByName ns "w" "val" >>= stripPrefix "heading " - , isNumericNotNull index = Just (styleId, read index) + , Just n <- stringToInteger index + , n > 0 = Just (styleId, fromInteger n) getHeaderLevel _ _ = Nothing blockQuoteStyleIds :: [String] @@ -955,23 +1097,23 @@ blockQuoteStyleNames = ["Quote", "Block Text"] getBlockQuote :: NameSpaces -> Element -> Maybe Bool getBlockQuote ns element - | Just styleId <- findAttr (elemName ns "w" "styleId") element + | Just styleId <- findAttrByName ns "w" "styleId" element , styleId `elem` blockQuoteStyleIds = Just True - | Just styleName <- findChild (elemName ns "w" "name") element >>= - findAttr (elemName ns "w" "val") + | Just styleName <- findChildByName ns "w" "name" element >>= + findAttrByName ns "w" "val" , styleName `elem` blockQuoteStyleNames = Just True getBlockQuote _ _ = Nothing getNumInfo :: NameSpaces -> Element -> Maybe (String, String) getNumInfo ns element = do - let numPr = findChild (elemName ns "w" "pPr") element >>= - findChild (elemName ns "w" "numPr") + let numPr = findChildByName ns "w" "pPr" element >>= + findChildByName ns "w" "numPr" lvl = fromMaybe "0" (numPr >>= - findChild (elemName ns "w" "ilvl") >>= - findAttr (elemName ns "w" "val")) + findChildByName ns "w" "ilvl" >>= + findAttrByName ns "w" "val") numId <- numPr >>= - findChild (elemName ns "w" "numId") >>= - findAttr (elemName ns "w" "val") + findChildByName ns "w" "numId" >>= + findAttrByName ns "w" "val" return (numId, lvl) @@ -1015,10 +1157,10 @@ getSymChar ns element let [(char, _)] = readLitChar ("\\x" ++ s) in TextRun . maybe "" (:[]) $ getUnicode font char where - getCodepoint = findAttr (elemName ns "w" "char") element - getFont = stringToFont =<< findAttr (elemName ns "w" "font") element + getCodepoint = findAttrByName ns "w" "char" element + getFont = stringToFont =<< findAttrByName ns "w" "font" element lowerFromPrivate ('F':xs) = '0':xs - lowerFromPrivate xs = xs + lowerFromPrivate xs = xs getSymChar _ _ = TextRun "" elemToRunElems :: NameSpaces -> Element -> D [RunElem] @@ -1029,11 +1171,9 @@ elemToRunElems ns element let font = do fontElem <- findElement (qualName "rFonts") element stringToFont =<< - (foldr (<|>) Nothing $ - map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"]) + foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem setFont :: Maybe Font -> ReaderEnv -> ReaderEnv setFont f s = s{envFont = f} - |