From 9c7e0dc84b2384347099827999f0e2f7be4f7e51 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sat, 21 Jun 2014 17:53:45 -0400 Subject: Implement new normalization. There were some problems with the old str normalization. This fixes those problems. Also, since it drills down on its own, it only needs to be mapped over the blocks, not walked over the tree. --- src/Text/Pandoc/Readers/Docx.hs | 68 ++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 479a88161..299adf5a8 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -148,25 +148,71 @@ runElemsToString = concatMap runElemToString --- Text.Pandoc.Shared.normalize for reasons of efficiency. For --- whatever reason, `normalize` makes a run take almost twice as --- long. (It does more, but this does what we need) -strNormalize :: [Inline] -> [Inline] -strNormalize [] = [] -strNormalize (Str "" : ils) = strNormalize ils -strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l) -strNormalize (il:ils) = il : (strNormalize ils) +inlineNormalize :: [Inline] -> [Inline] +inlineNormalize [] = [] +inlineNormalize (Str "" : ils) = inlineNormalize ils +inlineNormalize ((Str s) : (Str s') : l) = + inlineNormalize (Str (s++s') : l) +inlineNormalize ((Emph ils) : (Emph ils') : l) = + inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Emph ils) : l) = + Emph (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Strong ils) : (Strong ils') : l) = + inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Strong ils) : l) = + Strong (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) = + inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Strikeout ils) : l) = + Strikeout (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Superscript ils) : (Superscript ils') : l) = + inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Superscript ils) : l) = + Superscript (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Subscript ils) : (Subscript ils') : l) = + inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Subscript ils) : l) = + Subscript (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Space : Space : l)) = + inlineNormalize $ (Space : l) +inlineNormalize ((Quoted qt ils) : l) = + Quoted qt (inlineNormalize ils) : inlineNormalize l +inlineNormalize ((Cite cits ils) : l) = + let + f :: Citation -> Citation + f (Citation s pref suff mode num hash) = + Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash + in + Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Link ils s) : l) = + Link (inlineNormalize ils) s : (inlineNormalize l) +inlineNormalize ((Image ils s) : l) = + Image (inlineNormalize ils) s : (inlineNormalize l) +inlineNormalize ((Note blks) : l) = + Note (map blockNormalize blks) : (inlineNormalize l) +inlineNormalize ((Span attr ils) : l) = + Span attr (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize (il : l) = il : (inlineNormalize l) stripSpaces :: [Inline] -> [Inline] stripSpaces ils = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils blockNormalize :: Block -> Block -blockNormalize (Plain ils) = Plain $ bottomUp strNormalize $ stripSpaces ils -blockNormalize (Para ils) = Para $ bottomUp strNormalize $ stripSpaces ils +blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils +blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils blockNormalize (Header n attr ils) = - Header n attr $ bottomUp strNormalize $ stripSpaces ils + Header n attr $ stripSpaces $ inlineNormalize ils blockNormalize (Table ils align width hdr cells) = - Table (bottomUp strNormalize $ stripSpaces ils) align width hdr cells + Table (stripSpaces $ inlineNormalize ils) align width hdr cells blockNormalize (DefinitionList pairs) = - DefinitionList $ map (\(ils, blklsts) -> (bottomUp strNormalize (stripSpaces ils), blklsts)) pairs + DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs +blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks) +blockNormalize (OrderedList attr blkslst) = + OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst +blockNormalize (BulletList blkslst) = + BulletList $ map (\blks -> map blockNormalize blks) blkslst +blockNormalize (Div attr blks) = Div attr (map blockNormalize blks) blockNormalize blk = blk runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] @@ -315,7 +361,7 @@ makeImagesSelfContained _ inline = inline bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] bodyToBlocks opts docx (Body bps) = bottomUp removeEmptyPars $ - bottomUp blockNormalize $ + map blockNormalize $ bottomUp spanRemove $ bottomUp divRemove $ map (makeHeaderAnchors) $ -- cgit v1.2.3 From ca4add679ce6dd438cc3f6d58f82d04a9ad6305e Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sat, 21 Jun 2014 17:58:32 -0400 Subject: Add normalization test. Add torture-test for new normalization functions. One problem that this test demonstrates is that word has a tendency to turn off formatting at a space, and then turn it back on after. I'm not sure yet whether this is something we should fix. --- tests/Tests/Readers/Docx.hs | 4 ++++ tests/docx.normalize.docx | Bin 0 -> 25994 bytes tests/docx.normalize.native | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 tests/docx.normalize.docx create mode 100644 tests/docx.normalize.native diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index e8fa33241..74184efc6 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -74,6 +74,10 @@ tests = [ testGroup "inlines" "literal tabs" "docx.tabs.docx" "docx.tabs.native" + , testCompare + "normalizing inlines" + "docx.normalize.docx" + "docx.normalize.native" ] , testGroup "blocks" [ testCompare diff --git a/tests/docx.normalize.docx b/tests/docx.normalize.docx new file mode 100644 index 000000000..5e4370a47 Binary files /dev/null and b/tests/docx.normalize.docx differ diff --git a/tests/docx.normalize.native b/tests/docx.normalize.native new file mode 100644 index 000000000..fa34d0581 --- /dev/null +++ b/tests/docx.normalize.native @@ -0,0 +1,2 @@ +[Para [Str "These",Space,Str "are",Space,Str "different",Space,Str "fonts."] +,Para [Strong [Str "These",Space,Emph [Str "are"]],Space,Strong [Emph [Strikeout [Str "different"]],Space,Str "fonts."]]] -- cgit v1.2.3 From 8e5bd9d851aa0f60462015f61e3980b134e3c131 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sun, 22 Jun 2014 01:47:11 -0400 Subject: Docx reader: Fix spacing in formatting. The normalizing tests revealed a problem with unformatted spaces, brought about by `spanTrim`. This fixes by not trimming the spaces out of spans until they are in their final form. --- src/Text/Pandoc/Readers/Docx.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 299adf5a8..09c2330fb 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -286,8 +286,8 @@ parPartsToInlines opts docx parparts = (if False -- TODO depend on option then bottomUp (makeImagesSelfContained docx) else id) $ - bottomUp spanCorrect $ bottomUp spanTrim $ + bottomUp spanCorrect $ bottomUp spanReduce $ concatMap (parPartToInlines opts docx) parparts -- cgit v1.2.3 From b3df3a38611fe4fd03fa2d4e38ba45ae7cf8fe08 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sun, 22 Jun 2014 01:56:33 -0400 Subject: Docx reader tests: Correct normalize test. --- tests/docx.normalize.native | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/docx.normalize.native b/tests/docx.normalize.native index fa34d0581..aeba672c4 100644 --- a/tests/docx.normalize.native +++ b/tests/docx.normalize.native @@ -1,2 +1,2 @@ [Para [Str "These",Space,Str "are",Space,Str "different",Space,Str "fonts."] -,Para [Strong [Str "These",Space,Emph [Str "are"]],Space,Strong [Emph [Strikeout [Str "different"]],Space,Str "fonts."]]] +,Para [Strong [Str "These",Space,Emph [Str "are",Space,Strikeout [Str "different"]],Space,Str "fonts."]]] -- cgit v1.2.3 From ed43513087b514a5240fde04784dbf8709182513 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sun, 22 Jun 2014 01:58:41 -0400 Subject: Docx reader tests: add tests for normalization deep in blocks. --- tests/Tests/Readers/Docx.hs | 4 ++++ tests/docx.deep_normalize.docx | Bin 0 -> 29246 bytes tests/docx.deep_normalize.native | 6 ++++++ 3 files changed, 10 insertions(+) create mode 100644 tests/docx.deep_normalize.docx create mode 100644 tests/docx.deep_normalize.native diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index 74184efc6..ffb079eee 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -78,6 +78,10 @@ tests = [ testGroup "inlines" "normalizing inlines" "docx.normalize.docx" "docx.normalize.native" + , testCompare + "normalizing inlines deep inside blocks" + "docx.deep_normalize.docx" + "docx.deep_normalize.native" ] , testGroup "blocks" [ testCompare diff --git a/tests/docx.deep_normalize.docx b/tests/docx.deep_normalize.docx new file mode 100644 index 000000000..7626d59ce Binary files /dev/null and b/tests/docx.deep_normalize.docx differ diff --git a/tests/docx.deep_normalize.native b/tests/docx.deep_normalize.native new file mode 100644 index 000000000..9b2089ec8 --- /dev/null +++ b/tests/docx.deep_normalize.native @@ -0,0 +1,6 @@ +[OrderedList (1,Decimal,OneParen) + [[Para [Str "This",Space,Str "is",Space,Str "at",Space,Str "the",Space,Str "first",Space,Str "level"] + ,OrderedList (1,LowerAlpha,DefaultDelim) + [[Para [Str "This",Space,Str "is",Space,Str "at",Space,Str "the",Space,Str "second",Space,Str "level"] + ,OrderedList (1,LowerRoman,DefaultDelim) + [[Para [Str "This",Space,Str "is",Space,Emph [Str "at",Space,Strong [Str "the",Space,Str "third",Space,Str "level"],Str ",",Space,Str "and",Space,Str "I",Space,Str "want",Space,Str "to"],Space,Str "test",Space,Str "normalization",Space,Str "here."]]]]]]]] -- cgit v1.2.3