Merge branch 'master' of git://github.com/jgm/pandoc into dokuwiki

author: Clare Macrae <github@cfmacrae.fastmail.co.uk> 2014-07-01 22:10:08 +0100
committer: Clare Macrae <github@cfmacrae.fastmail.co.uk> 2014-07-01 22:10:08 +0100
commit: 3cb76d956028c4c529dc95aab8cc4dce93f8e886 (patch)
tree: 58f5d31b79db338d8646ee9d2e26fa6801193eba /src/Text
parent: 244c4eee7487e386e3e6ff7cf78146385eef9d1f (diff)
parent: 264e366f1a973efa56fc32079927fc51cc1936ca (diff)
4 files changed, 207 insertions, 101 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 71baa5dde..61c17156e 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -110,6 +110,11 @@ data DEnv = DEnv { docxOptions  :: ReaderOptions
 
 type DocxContext = ReaderT DEnv (State DState)
 
+updateDState :: (DState -> DState) -> DocxContext ()
+updateDState f = do
+  st <- get
+  put $ f st
+
 evalDocxContext :: DocxContext a -> DEnv -> DState -> a
 evalDocxContext ctx env st = evalState (runReaderT ctx env) st
 
@@ -148,42 +153,48 @@ runStyleToContainers rPr =
   in
    classContainers ++ formatters
 
-
-divAttrToContainers :: [String] -> [(String, String)] -> [Container Block]
-divAttrToContainers (c:cs) _ | Just n <- isHeaderClass c =
-  [Container $ \_ ->
-     Header n ("", delete ("Heading" ++ show n) cs, []) []]
-divAttrToContainers (c:cs) kvs | c `elem` divsToKeep =
-  (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs)
-divAttrToContainers (c:cs) kvs | c `elem` codeDivs =
+parStyleToContainers :: ParagraphStyle -> [Container Block]
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, Just n <- isHeaderClass c =
+  [Container $ \_ -> Header n ("", delete ("Heading" ++ show n) cs, []) []]
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` divsToKeep =
+  let pPr' = pPr { pStyle = cs }
+  in
+   (Container $ Div ("", [c], [])) : (parStyleToContainers pPr')
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` codeDivs =
   -- This is a bit of a cludge. We make the codeblock from the raw
   -- parparts in bodyPartToBlocks. But we need something to match against.
-  (Container $ \_ -> CodeBlock ("", [], []) "") : (divAttrToContainers cs kvs)
-divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs =
-  let kvs' = filter (\(k,_) -> k /= "indent") kvs
+  let pPr' = pPr { pStyle = cs }
   in
-   (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs')
-divAttrToContainers (c:cs) kvs | c `elem` blockQuoteDivs =
-  (Container BlockQuote) : (divAttrToContainers (cs \\ blockQuoteDivs) kvs)
-divAttrToContainers (_:cs) kvs = divAttrToContainers cs kvs
-divAttrToContainers [] kvs | Just numString <- lookup "indent" kvs =
-  let kvs' = filter (\(k,_) -> k /= "indent") kvs
+   (Container $ \_ -> CodeBlock ("", [], []) "") : (parStyleToContainers pPr')
+parStyleToContainers pPr | (c:cs) <- pStyle pPr,  c `elem` listParagraphDivs =
+  let pPr' = pPr { pStyle = cs, indentation = Nothing}
   in
-   case numString of
-     "0"       -> divAttrToContainers [] kvs'
-     ('-' : _) -> divAttrToContainers [] kvs'
-     _         -> (Container BlockQuote) : divAttrToContainers [] kvs'
-divAttrToContainers _ _ = []
-
+   (Container $ Div ("", [c], [])) : (parStyleToContainers pPr')
 
-parStyleToContainers :: ParagraphStyle -> [Container Block]
-parStyleToContainers pPr =
-  let classes = pStyle pPr
-      kvs = case indent pPr of
-        Just n -> [("indent", show n)]
-        Nothing -> []
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` blockQuoteDivs =
+  let pPr' = pPr { pStyle = cs \\ blockQuoteDivs }
+  in
+   (Container BlockQuote) : (parStyleToContainers pPr')
+parStyleToContainers pPr | (_:cs) <- pStyle pPr =
+  let pPr' = pPr { pStyle = cs}
+  in
+    parStyleToContainers pPr'                                 
+parStyleToContainers pPr | null (pStyle pPr),
+                          Just left <- indentation pPr >>= leftParIndent,
+                          Just hang <- indentation pPr >>= hangingParIndent =
+  let pPr' = pPr { indentation = Nothing }
+  in
+   case (left - hang) > 0 of
+     True -> (Container BlockQuote) : (parStyleToContainers pPr')
+     False -> parStyleToContainers pPr'
+parStyleToContainers pPr | null (pStyle pPr),
+                          Just left <- indentation pPr >>= leftParIndent =
+  let pPr' = pPr { indentation = Nothing }
   in
-   divAttrToContainers classes kvs
+   case left > 0 of
+     True -> (Container BlockQuote) : (parStyleToContainers pPr')
+     False -> parStyleToContainers pPr'
+parStyleToContainers _ = []
   
 
 strToInlines :: String -> [Inline]
@@ -289,7 +300,7 @@ parPartToInlines (BookMark _ anchor) =
     let newAnchor = case anchor `elem` (M.elems anchorMap) of
           True -> uniqueIdent [Str anchor] (M.elems anchorMap)
           False -> anchor
-    put DState{ docxAnchorMap = M.insert anchor newAnchor anchorMap}
+    updateDState $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}
     return [Span (anchor, ["anchor"], []) []]
 parPartToInlines (Drawing relid) = do
   (Docx _ _ _ rels _) <- asks docxDocument
@@ -329,7 +340,7 @@ makeHeaderAnchor (Header n (_, classes, kvs) ils)
     do
       hdrIDMap <- gets docxAnchorMap
       let newIdent = uniqueIdent ils (M.elems hdrIDMap)
-      put DState{docxAnchorMap = M.insert ident newIdent hdrIDMap}
+      updateDState $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap}
       return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs))
 -- Otherwise we just give it a name, and register that name (associate
 -- it with itself.)
@@ -337,7 +348,7 @@ makeHeaderAnchor (Header n (_, classes, kvs) ils) =
   do
     hdrIDMap <- gets docxAnchorMap
     let newIdent = uniqueIdent ils (M.elems hdrIDMap)
-    put DState{docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
+    updateDState $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
     return $ Header n (newIdent, classes, kvs) ils
 makeHeaderAnchor blk = return blk
 
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 07f34450d..537c5c272 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -42,6 +42,7 @@ module Text.Pandoc.Readers.Docx.Parse (  Docx(..)
                                        , Relationship
                                        , Media
                                        , RunStyle(..)
+                                       , ParIndentation(..)
                                        , ParagraphStyle(..)
                                        , Row(..)
                                        , Cell(..)
@@ -341,16 +342,37 @@ testBitMask bitMaskS n =
     []            -> False
     ((n', _) : _) -> ((n' .|. n) /= 0)
 
+data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer
+                                     , rightParIndent :: Maybe Integer
+                                     , hangingParIndent :: Maybe Integer}
+                      deriving Show
+
 data ParagraphStyle = ParagraphStyle { pStyle :: [String]
-                                     , indent :: Maybe Integer
+                                     , indentation :: Maybe ParIndentation
                                      }
                       deriving Show
 
 defaultParagraphStyle :: ParagraphStyle
 defaultParagraphStyle = ParagraphStyle { pStyle = []
-                                       , indent = Nothing
+                                       , indentation = Nothing
                                        }
 
+elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
+elemToParIndentation ns element
+  | qName (elName element) == "ind" &&
+    qURI (elName element) == (lookup "w" ns) =
+      Just $ ParIndentation {
+        leftParIndent =
+           findAttr (QName "left" (lookup "w" ns) (Just "w")) element >>=
+           stringToInteger
+        , rightParIndent = 
+          findAttr (QName "right" (lookup "w" ns) (Just "w")) element >>=
+          stringToInteger
+        , hangingParIndent = 
+          findAttr (QName "hanging" (lookup "w" ns) (Just "w")) element >>=
+          stringToInteger}
+elemToParIndentation _ _ = Nothing
+
 elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
 elemToParagraphStyle ns element =
   case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of
@@ -360,10 +382,9 @@ elemToParagraphStyle ns element =
           mapMaybe
           (findAttr (QName "val" (lookup "w" ns) (Just "w")))
           (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr)
-      , indent =
+      , indentation =
         findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>=
-        findAttr (QName "left" (lookup "w" ns) (Just "w")) >>=
-        stringToInteger
+        elemToParIndentation ns
         }
     Nothing -> defaultParagraphStyle
 
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index 5b0d9b6b4..4a536330d 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -55,6 +55,8 @@ module Text.Pandoc.Shared (
                      normalizeSpaces,
                      extractSpaces,
                      normalize,
+                     normalizeInlines,
+                     normalizeBlocks,
                      stringify,
                      compactify,
                      compactify',
@@ -84,7 +86,6 @@ module Text.Pandoc.Shared (
 
 import Text.Pandoc.Definition
 import Text.Pandoc.Walk
-import Text.Pandoc.Generic
 import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..))
 import qualified Text.Pandoc.Builder as B
 import qualified Text.Pandoc.UTF8 as UTF8
@@ -350,72 +351,142 @@ extractSpaces f is =
 -- | Normalize @Pandoc@ document, consolidating doubled 'Space's,
 -- combining adjacent 'Str's and 'Emph's, remove 'Null's and
 -- empty elements, etc.
-normalize :: (Eq a, Data a) => a -> a
-normalize = topDown removeEmptyBlocks .
-            topDown consolidateInlines .
-            bottomUp (removeEmptyInlines . removeTrailingInlineSpaces)
-
-removeEmptyBlocks :: [Block] -> [Block]
-removeEmptyBlocks (Null : xs) = removeEmptyBlocks xs
-removeEmptyBlocks (BulletList [] : xs) = removeEmptyBlocks xs
-removeEmptyBlocks (OrderedList _ [] : xs) = removeEmptyBlocks xs
-removeEmptyBlocks (DefinitionList [] : xs) = removeEmptyBlocks xs
-removeEmptyBlocks (RawBlock _ [] : xs) = removeEmptyBlocks xs
-removeEmptyBlocks (x:xs) = x : removeEmptyBlocks xs
-removeEmptyBlocks [] = []
-
-removeEmptyInlines :: [Inline] -> [Inline]
-removeEmptyInlines (Emph [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (Strong [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (Subscript [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (Superscript [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (SmallCaps [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (Strikeout [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (RawInline _ [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (Code _ [] : zs) = removeEmptyInlines zs
-removeEmptyInlines (Str "" : zs) = removeEmptyInlines zs
-removeEmptyInlines (x : xs) = x : removeEmptyInlines xs
-removeEmptyInlines [] = []
-
-removeTrailingInlineSpaces :: [Inline] -> [Inline]
-removeTrailingInlineSpaces = reverse . removeLeadingInlineSpaces . reverse
-
-removeLeadingInlineSpaces :: [Inline] -> [Inline]
-removeLeadingInlineSpaces = dropWhile isSpaceOrEmpty
-
-consolidateInlines :: [Inline] -> [Inline]
-consolidateInlines (Str x : ys) =
+normalize :: Pandoc -> Pandoc
+normalize (Pandoc (Meta meta) blocks) =
+  Pandoc (Meta $ M.map go meta) (normalizeBlocks blocks)
+  where go (MetaInlines xs) = MetaInlines $ normalizeInlines xs
+        go (MetaBlocks xs)  = MetaBlocks  $ normalizeBlocks xs
+        go (MetaList ms)    = MetaList $ map go ms
+        go (MetaMap m)      = MetaMap $ M.map go m
+        go x                = x
+
+normalizeBlocks :: [Block] -> [Block]
+normalizeBlocks (Null : xs) = normalizeBlocks xs
+normalizeBlocks (Div attr bs : xs) =
+  Div attr (normalizeBlocks bs) : normalizeBlocks xs
+normalizeBlocks (BlockQuote bs : xs) =
+  case normalizeBlocks bs of
+       []    -> normalizeBlocks xs
+       bs'   -> BlockQuote bs' : normalizeBlocks xs
+normalizeBlocks (BulletList [] : xs) = normalizeBlocks xs
+normalizeBlocks (BulletList items : xs) =
+  BulletList (map normalizeBlocks items) : normalizeBlocks xs
+normalizeBlocks (OrderedList _ [] : xs) = normalizeBlocks xs
+normalizeBlocks (OrderedList attr items : xs) =
+  OrderedList attr (map normalizeBlocks items) : normalizeBlocks xs
+normalizeBlocks (DefinitionList [] : xs) = normalizeBlocks xs
+normalizeBlocks (DefinitionList items : xs) =
+  DefinitionList (map go items) : normalizeBlocks xs
+  where go (ils, bs) = (normalizeInlines ils, map normalizeBlocks bs)
+normalizeBlocks (RawBlock _ "" : xs) = normalizeBlocks xs
+normalizeBlocks (Para ils : xs) =
+  case normalizeInlines ils of
+       []   -> normalizeBlocks xs
+       ils' -> Para ils' : normalizeBlocks xs
+normalizeBlocks (Plain ils : xs) =
+  case normalizeInlines ils of
+       []   -> normalizeBlocks xs
+       ils' -> Plain ils' : normalizeBlocks xs
+normalizeBlocks (Header lev attr ils : xs) =
+  Header lev attr (normalizeInlines ils) : normalizeBlocks xs
+normalizeBlocks (Table capt aligns widths hdrs rows : xs) =
+  Table (normalizeInlines capt) aligns widths
+    (map normalizeBlocks hdrs) (map (map normalizeBlocks) rows)
+  : normalizeBlocks xs
+normalizeBlocks (x:xs) = x : normalizeBlocks xs
+normalizeBlocks [] = []
+
+normalizeInlines :: [Inline] -> [Inline]
+normalizeInlines (Str x : ys) =
   case concat (x : map fromStr strs) of
-        ""     -> consolidateInlines rest
-        n      -> Str n : consolidateInlines rest
+        ""     -> rest
+        n      -> Str n : rest
    where
-     (strs, rest)  = span isStr ys
+     (strs, rest)  = span isStr $ normalizeInlines ys
      isStr (Str _) = True
      isStr _       = False
      fromStr (Str z) = z
-     fromStr _       = error "consolidateInlines - fromStr - not a Str"
-consolidateInlines (Space : ys) = Space : rest
+     fromStr _       = error "normalizeInlines - fromStr - not a Str"
+normalizeInlines (Space : ys) =
+  if null rest
+     then []
+     else Space : rest
    where isSp Space = True
          isSp _     = False
-         rest       = consolidateInlines $ dropWhile isSp ys
-consolidateInlines (Emph xs : Emph ys : zs) = consolidateInlines $
-  Emph (xs ++ ys) : zs
-consolidateInlines (Strong xs : Strong ys : zs) = consolidateInlines $
-  Strong (xs ++ ys) : zs
-consolidateInlines (Subscript xs : Subscript ys : zs) = consolidateInlines $
-  Subscript (xs ++ ys) : zs
-consolidateInlines (Superscript xs : Superscript ys : zs) = consolidateInlines $
-  Superscript (xs ++ ys) : zs
-consolidateInlines (SmallCaps xs : SmallCaps ys : zs) = consolidateInlines $
-  SmallCaps (xs ++ ys) : zs
-consolidateInlines (Strikeout xs : Strikeout ys : zs) = consolidateInlines $
-  Strikeout (xs ++ ys) : zs
-consolidateInlines (RawInline f x : RawInline f' y : zs) | f == f' =
-  consolidateInlines $ RawInline f (x ++ y) : zs
-consolidateInlines (Code a1 x : Code a2 y : zs) | a1 == a2 =
-  consolidateInlines $ Code a1 (x ++ y) : zs
-consolidateInlines (x : xs) = x : consolidateInlines xs
-consolidateInlines [] = []
+         rest       = dropWhile isSp $ normalizeInlines ys
+normalizeInlines (Emph xs : zs) =
+  case normalizeInlines zs of
+       (Emph ys : rest) -> normalizeInlines $
+         Emph (normalizeInlines $ xs ++ ys) : rest
+       rest -> case normalizeInlines xs of
+                    []  -> rest
+                    xs' -> Emph xs' : rest
+normalizeInlines (Strong xs : zs) =
+  case normalizeInlines zs of
+       (Strong ys : rest) -> normalizeInlines $
+         Strong (normalizeInlines $ xs ++ ys) : rest
+       rest -> case normalizeInlines xs of
+                    []  -> rest
+                    xs' -> Strong xs' : rest
+normalizeInlines (Subscript xs : zs) =
+  case normalizeInlines zs of
+       (Subscript ys : rest) -> normalizeInlines $
+         Subscript (normalizeInlines $ xs ++ ys) : rest
+       rest -> case normalizeInlines xs of
+                    []  -> rest
+                    xs' -> Subscript xs' : rest
+normalizeInlines (Superscript xs : zs) =
+  case normalizeInlines zs of
+       (Superscript ys : rest) -> normalizeInlines $
+         Superscript (normalizeInlines $ xs ++ ys) : rest
+       rest -> case normalizeInlines xs of
+                    []  -> rest
+                    xs' -> Superscript xs' : rest
+normalizeInlines (SmallCaps xs : zs) =
+  case normalizeInlines zs of
+       (SmallCaps ys : rest) -> normalizeInlines $
+         SmallCaps (normalizeInlines $ xs ++ ys) : rest
+       rest -> case normalizeInlines xs of
+                    []  -> rest
+                    xs' -> SmallCaps xs' : rest
+normalizeInlines (Strikeout xs : zs) =
+  case normalizeInlines zs of
+       (Strikeout ys : rest) -> normalizeInlines $
+         Strikeout (normalizeInlines $ xs ++ ys) : rest
+       rest -> case normalizeInlines xs of
+                    []  -> rest
+                    xs' -> Strikeout xs' : rest
+normalizeInlines (RawInline _ [] : ys) = normalizeInlines ys
+normalizeInlines (RawInline f xs : zs) =
+  case normalizeInlines zs of
+       (RawInline f' ys : rest) | f == f' -> normalizeInlines $
+         RawInline f (xs ++ ys) : rest
+       rest -> RawInline f xs : rest
+normalizeInlines (Code _ "" : ys) = normalizeInlines ys
+normalizeInlines (Code attr xs : zs) =
+  case normalizeInlines zs of
+       (Code attr' ys : rest) | attr == attr' -> normalizeInlines $
+         Code attr (xs ++ ys) : rest
+       rest -> Code attr xs : rest
+-- allow empty spans, they may carry identifiers etc.
+-- normalizeInlines (Span _ [] : ys) = normalizeInlines ys
+normalizeInlines (Span attr xs : zs) =
+  case normalizeInlines zs of
+       (Span attr' ys : rest) | attr == attr' -> normalizeInlines $
+         Span attr (normalizeInlines $ xs ++ ys) : rest
+       rest -> Span attr (normalizeInlines xs) : rest
+normalizeInlines (Note bs : ys) = Note (normalizeBlocks bs) :
+  normalizeInlines ys
+normalizeInlines (Quoted qt ils : ys) =
+  Quoted qt (normalizeInlines ils) : normalizeInlines ys
+normalizeInlines (Link ils t : ys) =
+  Link (normalizeInlines ils) t : normalizeInlines ys
+normalizeInlines (Image ils t : ys) =
+  Image (normalizeInlines ils) t : normalizeInlines ys
+normalizeInlines (Cite cs ils : ys) =
+  Cite cs (normalizeInlines ils) : normalizeInlines ys
+normalizeInlines (x : xs) = x : normalizeInlines xs
+normalizeInlines [] = []
 
 -- | Convert pandoc structure to a string with formatting removed.
 -- Footnotes are skipped (since we don't want their contents in link
diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs
index 31e64f14e..4b787b023 100644
--- a/src/Text/Pandoc/Writers/Docx.hs
+++ b/src/Text/Pandoc/Writers/Docx.hs
@@ -514,8 +514,11 @@ blockToOpenXML :: WriterOptions -> Block -> WS [Element]
 blockToOpenXML _ Null = return []
 blockToOpenXML opts (Div _ bs) = blocksToOpenXML opts bs
 blockToOpenXML opts (Header lev (ident,_,_) lst) = do
-  contents <- withParaProp (pStyle $ "Heading" ++ show lev) $
-               blockToOpenXML opts (Para lst)
+
+  paraProps <- withParaProp (pStyle $ "Heading" ++ show lev) $
+               getParaProps False
+  contents <- inlinesToOpenXML opts lst
+
   usedIdents <- gets stSectionIds
   let bookmarkName = if null ident
                         then uniqueIdent lst usedIdents
@@ -525,7 +528,7 @@ blockToOpenXML opts (Header lev (ident,_,_) lst) = do
   let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id')
                                                ,("w:name",bookmarkName)] ()
   let bookmarkEnd = mknode "w:bookmarkEnd" [("w:id", id')] ()
-  return $ [bookmarkStart] ++ contents ++ [bookmarkEnd]
+  return [mknode "w:p" [] (paraProps ++ [bookmarkStart, bookmarkEnd] ++ contents)]
 blockToOpenXML opts (Plain lst) = withParaProp (pStyle "Compact")
   $ blockToOpenXML opts (Para lst)
 -- title beginning with fig: indicates that the image is a figure
author	Clare Macrae <github@cfmacrae.fastmail.co.uk>	2014-07-01 22:10:08 +0100
committer	Clare Macrae <github@cfmacrae.fastmail.co.uk>	2014-07-01 22:10:08 +0100
commit	3cb76d956028c4c529dc95aab8cc4dce93f8e886 (patch)
tree	58f5d31b79db338d8646ee9d2e26fa6801193eba /src/Text
parent	244c4eee7487e386e3e6ff7cf78146385eef9d1f (diff)
parent	264e366f1a973efa56fc32079927fc51cc1936ca (diff)