diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/DocBook.hs | 6 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 20 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt.hs | 44 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt/ContentReader.hs | 114 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt/StyleReader.hs | 32 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Org/Blocks.hs | 10 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 9 |
8 files changed, 174 insertions, 65 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index 336b40933..4d8d5ab94 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -592,8 +592,6 @@ checkInMeta p = do when accepts p return mempty - - addMeta :: ToMetaValue a => String -> a -> DB () addMeta field val = modify (setMeta field val) @@ -612,7 +610,7 @@ isBlockElement (Elem e) = qName (elName e) `elem` blocktags "important","caution","note","tip","warning","qandadiv", "question","answer","abstract","itemizedlist","orderedlist", "variablelist","article","book","table","informaltable", - "informalexample", + "informalexample", "linegroup", "screen","programlisting","example","calloutlist"] isBlockElement _ = False @@ -779,6 +777,7 @@ parseBlock (Elem e) = "informaltable" -> parseTable "informalexample" -> divWith ("", ["informalexample"], []) <$> getBlocks e + "linegroup" -> lineBlock <$> lineItems "literallayout" -> codeBlockWithLang "screen" -> codeBlockWithLang "programlisting" -> codeBlockWithLang @@ -900,6 +899,7 @@ parseBlock (Elem e) = let ident = attrValue "id" e modify $ \st -> st{ dbSectionLevel = n - 1 } return $ headerWith (ident,[],[]) n' headerText <> b + lineItems = mapM getInlines $ filterChildren (named "line") e metaBlock = acceptingMetadata (getBlocks e) >> return mempty getInlines :: Element -> DB Inlines diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index b9021ec08..7b9779105 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -65,7 +65,7 @@ import Control.Monad.State import Control.Applicative ((<|>)) import qualified Data.Map as M import Control.Monad.Except -import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Shared (safeRead, filteredFilesFromArchive) import Text.TeXMath.Readers.OMML (readOMML) import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) import Text.TeXMath (Exp) @@ -86,7 +86,6 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes data ReaderState = ReaderState { stateWarnings :: [String] } deriving Show - data DocxError = DocxError | WrongElem deriving Show @@ -276,7 +275,7 @@ archiveToDocxWithWarnings archive = do comments = archiveToComments archive numbering = archiveToNumbering archive rels = archiveToRelationships archive - media = archiveToMedia archive + media = filteredFilesFromArchive archive filePathIsMedia (styles, parstyles) = archiveToStyles archive rEnv = ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument @@ -402,7 +401,6 @@ archiveToComments zf = case cmts of Just c -> Comments cmts_namespaces c Nothing -> Comments cmts_namespaces M.empty - filePathToRelType :: FilePath -> Maybe DocumentLocation filePathToRelType "word/_rels/document.xml.rels" = Just InDocument @@ -424,7 +422,7 @@ filePathToRelationships ar fp | Just relType <- filePathToRelType fp , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry = mapMaybe (relElemToRelationship relType) $ elChildren relElems filePathToRelationships _ _ = [] - + archiveToRelationships :: Archive -> [Relationship] archiveToRelationships archive = concatMap (filePathToRelationships archive) $ filesInArchive archive @@ -435,16 +433,6 @@ filePathIsMedia fp = in (dir == "word/media/") -getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) -getMediaPair zf fp = - case findEntryByPath fp zf of - Just e -> Just (fp, fromEntry e) - Nothing -> Nothing - -archiveToMedia :: Archive -> Media -archiveToMedia zf = - mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) - lookupLevel :: String -> String -> Numbering -> Maybe Level lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs @@ -741,7 +729,7 @@ elemToCommentStart ns element , Just cmtDate <- findAttr (elemName ns "w" "date") element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps -elemToCommentStart _ _ = throwError WrongElem +elemToCommentStart _ _ = throwError WrongElem lookupFootnote :: String -> Notes -> Maybe Element lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 2e95c518d..68bc936b1 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -32,7 +32,7 @@ Conversion of markdown-formatted plain text to 'Pandoc' document. module Text.Pandoc.Readers.Markdown ( readMarkdown, readMarkdownWithWarnings ) where -import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate ) +import Data.List ( transpose, sortBy, findIndex, intercalate ) import qualified Data.Map as M import Data.Scientific (coefficient, base10Exponent) import Data.Ord ( comparing ) @@ -1106,7 +1106,7 @@ lineBlock = try $ do guardEnabled Ext_line_blocks lines' <- lineBlockLines >>= mapM (parseFromString (trimInlinesF . mconcat <$> many inline)) - return $ B.para <$> (mconcat $ intersperse (return B.linebreak) lines') + return $ B.lineBlock <$> sequence lines' -- -- Tables diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 68e89263c..046fb4d6d 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -37,6 +37,8 @@ import qualified Text.XML.Light as XML import qualified Data.ByteString.Lazy as B +import System.FilePath + import Text.Pandoc.Definition import Text.Pandoc.Error import Text.Pandoc.Options @@ -48,39 +50,49 @@ import Text.Pandoc.Readers.Odt.StyleReader import Text.Pandoc.Readers.Odt.Generic.XMLConverter import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Shared (filteredFilesFromArchive) -- readOdt :: ReaderOptions -> B.ByteString -> Either PandocError (Pandoc, MediaBag) -readOdt _ bytes = case bytesToOdt bytes of - Right pandoc -> Right (pandoc , mempty) - Left err -> Left err +readOdt _ bytes = bytesToOdt bytes-- of +-- Right (pandoc, mediaBag) -> Right (pandoc , mediaBag) +-- Left err -> Left err -- -bytesToOdt :: B.ByteString -> Either PandocError Pandoc +bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag) bytesToOdt bytes = case toArchiveOrFail bytes of Right archive -> archiveToOdt archive Left _ -> Left $ ParseFailure "Couldn't parse odt file." -- -archiveToOdt :: Archive -> Either PandocError Pandoc +archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag) archiveToOdt archive - | Just contentEntry <- findEntryByPath "content.xml" archive - , Just stylesEntry <- findEntryByPath "styles.xml" archive - , Just contentElem <- entryToXmlElem contentEntry - , Just stylesElem <- entryToXmlElem stylesEntry - , Right styles <- chooseMax (readStylesAt stylesElem ) - (readStylesAt contentElem) - , startState <- readerState styles - , Right pandoc <- runConverter' read_body - startState - contentElem - = Right pandoc + | Just contentEntry <- findEntryByPath "content.xml" archive + , Just stylesEntry <- findEntryByPath "styles.xml" archive + , Just contentElem <- entryToXmlElem contentEntry + , Just stylesElem <- entryToXmlElem stylesEntry + , Right styles <- chooseMax (readStylesAt stylesElem ) + (readStylesAt contentElem) + , media <- filteredFilesFromArchive archive filePathIsOdtMedia + , startState <- readerState styles media + , Right pandocWithMedia <- runConverter' read_body + startState + contentElem + + = Right pandocWithMedia | otherwise -- Not very detailed, but I don't think more information would be helpful = Left $ ParseFailure "Couldn't parse odt file." + where + filePathIsOdtMedia :: FilePath -> Bool + filePathIsOdtMedia fp = + let (dir, _) = splitFileName fp + in + (dir == "Pictures/") + -- entryToXmlElem :: Entry -> Maybe XML.Element diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index ffd2f61d1..0b152268f 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -42,6 +42,7 @@ module Text.Pandoc.Readers.Odt.ContentReader import Control.Arrow import Control.Applicative hiding ( liftA, liftA2, liftA3 ) +import qualified Data.ByteString.Lazy as B import qualified Data.Map as M import Data.List ( find ) import Data.Maybe @@ -50,6 +51,7 @@ import qualified Text.XML.Light as XML import Text.Pandoc.Definition import Text.Pandoc.Builder +import Text.Pandoc.MediaBag (insertMedia, MediaBag) import Text.Pandoc.Shared import Text.Pandoc.Readers.Odt.Base @@ -68,6 +70,7 @@ import qualified Data.Set as Set -------------------------------------------------------------------------------- type Anchor = String +type Media = [(FilePath, B.ByteString)] data ReaderState = ReaderState { -- | A collection of styles read somewhere else. @@ -87,14 +90,17 @@ data ReaderState -- | A map from internal anchor names to "pretty" ones. -- The mapping is a purely cosmetic one. , bookmarkAnchors :: M.Map Anchor Anchor - + -- | A map of files / binary data from the archive + , envMedia :: Media + -- | Hold binary resources used in the document + , odtMediaBag :: MediaBag -- , sequences -- , trackedChangeIDs } deriving ( Show ) -readerState :: Styles -> ReaderState -readerState styles = ReaderState styles [] 0 Nothing M.empty +readerState :: Styles -> Media -> ReaderState +readerState styles media = ReaderState styles [] 0 Nothing M.empty media mempty -- pushStyle' :: Style -> ReaderState -> ReaderState @@ -134,6 +140,16 @@ putPrettyAnchor ugly pretty state@ReaderState{..} usedAnchors :: ReaderState -> [Anchor] usedAnchors ReaderState{..} = M.elems bookmarkAnchors +getMediaBag :: ReaderState -> MediaBag +getMediaBag ReaderState{..} = odtMediaBag + +getMediaEnv :: ReaderState -> Media +getMediaEnv ReaderState{..} = envMedia + +insertMedia' :: (FilePath, B.ByteString) -> ReaderState -> ReaderState +insertMedia' (fp, bs) state@ReaderState{..} + = state { odtMediaBag = insertMedia fp Nothing bs odtMediaBag } + -------------------------------------------------------------------------------- -- Reader type and associated tools -------------------------------------------------------------------------------- @@ -190,6 +206,22 @@ popStyle = keepingTheValue ( getCurrentListLevel :: OdtReaderSafe _x ListLevel getCurrentListLevel = getExtraState >>^ currentListLevel +-- +updateMediaWithResource :: OdtReaderSafe (FilePath, B.ByteString) (FilePath, B.ByteString) +updateMediaWithResource = keepingTheValue ( + (keepingTheValue getExtraState + >>% insertMedia' + ) + >>> setExtraState + ) + >>^ fst + +lookupResource :: OdtReaderSafe String (FilePath, B.ByteString) +lookupResource = proc target -> do + state <- getExtraState -< () + case lookup target (getMediaEnv state) of + Just bs -> returnV (target, bs) -<< () + Nothing -> returnV ("", B.empty) -< () type AnchorPrefix = String @@ -386,7 +418,7 @@ getListConstructor ListLevelStyle{..} = LltNumbered -> let listNumberStyle = toListNumberStyle listItemFormat listNumberDelim = toListNumberDelim listItemPrefix listItemSuffix - in orderedListWith (1, listNumberStyle, listNumberDelim) + in orderedListWith (listItemStart, listNumberStyle, listNumberDelim) where toListNumberStyle LinfNone = DefaultStyle toListNumberStyle LinfNumber = Decimal @@ -511,6 +543,10 @@ read_plain_text = fst ^&&& read_plain_text' >>% recover extractText (XML.Text cData) = succeedWith (XML.cdData cData) extractText _ = failEmpty +read_text_seq :: InlineMatcher +read_text_seq = matchingElement NsText "sequence" + $ matchChildContent [] read_plain_text + -- specifically. I honor that, although the current implementation of '(<>)' -- for 'Inlines' in "Text.Pandoc.Builder" will collaps them agein. @@ -559,6 +595,8 @@ read_paragraph = matchingElement NsText "p" , read_reference_start , read_bookmark_ref , read_reference_ref + , read_maybe_nested_img_frame + , read_text_seq ] read_plain_text @@ -583,6 +621,7 @@ read_header = matchingElement NsText "h" , read_reference_start , read_bookmark_ref , read_reference_ref + , read_maybe_nested_img_frame ] read_plain_text ) -< blocks anchor <- getHeaderAnchor -< children @@ -688,6 +727,64 @@ read_table_cell = matchingElement NsTable "table-cell" ] ---------------------- +-- Images +---------------------- + +-- +read_maybe_nested_img_frame :: InlineMatcher +read_maybe_nested_img_frame = matchingElement NsDraw "frame" + $ proc blocks -> do + img <- (findChild' NsDraw "image") -< () + case img of + Just _ -> read_frame -< blocks + Nothing -> matchChildContent' [ read_frame_text_box ] -< blocks + +read_frame :: OdtReaderSafe Inlines Inlines +read_frame = + proc blocks -> do + w <- ( findAttr' NsSVG "width" ) -< () + h <- ( findAttr' NsSVG "height" ) -< () + titleNodes <- ( matchChildContent' [ read_frame_title ] ) -< blocks + src <- matchChildContent' [ read_image_src ] -< blocks + resource <- lookupResource -< src + _ <- updateMediaWithResource -< resource + alt <- (matchChildContent [] read_plain_text) -< blocks + arr (uncurry4 imageWith ) -< + (image_attributes w h, src, inlineListToIdentifier (toList titleNodes), alt) + +image_attributes :: Maybe String -> Maybe String -> Attr +image_attributes x y = + ( "", [], (dim "width" x) ++ (dim "height" y)) + where + dim _ (Just "") = [] + dim name (Just v) = [(name, v)] + dim _ Nothing = [] + +read_image_src :: (Namespace, ElementName, OdtReader Anchor Anchor) +read_image_src = matchingElement NsDraw "image" + $ proc _ -> do + imgSrc <- findAttr NsXLink "href" -< () + case imgSrc of + Right src -> returnV src -<< () + Left _ -> returnV "" -< () + +read_frame_title :: InlineMatcher +read_frame_title = matchingElement NsSVG "title" + $ (matchChildContent [] read_plain_text) + +read_frame_text_box :: InlineMatcher +read_frame_text_box = matchingElement NsDraw "text-box" + $ proc blocks -> do + paragraphs <- (matchChildContent' [ read_paragraph ]) -< blocks + arr read_img_with_caption -< toList paragraphs + +read_img_with_caption :: [Block] -> Inlines +read_img_with_caption ((Para ((Image attr _ target) : txt)) : _) = + singleton (Image attr txt target) -- override caption with the text that follows +read_img_with_caption _ = + mempty + +---------------------- -- Internal links ---------------------- @@ -782,8 +879,11 @@ read_text = matchChildContent' [ read_header ] >>^ doc -read_body :: OdtReader _x Pandoc +read_body :: OdtReader _x (Pandoc, MediaBag) read_body = executeIn NsOffice "body" $ executeIn NsOffice "text" - $ liftAsSuccess read_text - + $ liftAsSuccess + $ proc inlines -> do + txt <- read_text -< inlines + state <- getExtraState -< () + returnA -< (txt, getMediaBag state) diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 96cfed0b3..26ba6df82 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -76,8 +76,9 @@ import Control.Applicative hiding ( liftA, liftA2, liftA3 ) import qualified Data.Foldable as F import qualified Data.Map as M import qualified Data.Set as S -import Data.List ( unfoldr ) +import Data.Char ( isDigit ) import Data.Default +import Data.List ( unfoldr ) import Data.Maybe import qualified Text.XML.Light as XML @@ -390,6 +391,7 @@ data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType , listItemPrefix :: Maybe String , listItemSuffix :: Maybe String , listItemFormat :: ListItemNumberFormat + , listItemStart :: Int } deriving ( Eq, Ord ) @@ -578,25 +580,31 @@ readListLevelStyles namespace elementName levelType = readListLevelStyle :: ListLevelType -> StyleReader _x (Int, ListLevelStyle) readListLevelStyle levelType = readAttr NsText "level" >>?! keepingTheValue - ( liftA4 toListLevelStyle - ( returnV levelType ) - ( findAttr' NsStyle "num-prefix" ) - ( findAttr' NsStyle "num-suffix" ) - ( getAttr NsStyle "num-format" ) + ( liftA5 toListLevelStyle + ( returnV levelType ) + ( findAttr' NsStyle "num-prefix" ) + ( findAttr' NsStyle "num-suffix" ) + ( getAttr NsStyle "num-format" ) + ( findAttr' NsText "start-value" ) ) where - toListLevelStyle _ p s LinfNone = ListLevelStyle LltBullet p s LinfNone - toListLevelStyle _ p s f@(LinfString _) = ListLevelStyle LltBullet p s f - toListLevelStyle t p s f = ListLevelStyle t p s f + toListLevelStyle _ p s LinfNone b = ListLevelStyle LltBullet p s LinfNone (startValue b) + toListLevelStyle _ p s f@(LinfString _) b = ListLevelStyle LltBullet p s f (startValue b) + toListLevelStyle t p s f b = ListLevelStyle t p s f (startValue b) + startValue (Just "") = 1 + startValue (Just v) = if all isDigit v + then read v + else 1 + startValue Nothing = 1 -- chooseMostSpecificListLevelStyle :: S.Set ListLevelStyle -> Maybe ListLevelStyle chooseMostSpecificListLevelStyle ls | ls == mempty = Nothing | otherwise = Just ( F.foldr1 select ls ) where - select ( ListLevelStyle t1 p1 s1 f1 ) - ( ListLevelStyle t2 p2 s2 f2 ) - = ListLevelStyle (select' t1 t2) (p1 <|> p2) (s1 <|> s2) (selectLinf f1 f2) + select ( ListLevelStyle t1 p1 s1 f1 b1 ) + ( ListLevelStyle t2 p2 s2 f2 _ ) + = ListLevelStyle (select' t1 t2) (p1 <|> p2) (s1 <|> s2) (selectLinf f1 f2) b1 select' LltNumbered _ = LltNumbered select' _ LltNumbered = LltNumbered select' _ _ = LltBullet diff --git a/src/Text/Pandoc/Readers/Org/Blocks.hs b/src/Text/Pandoc/Readers/Org/Blocks.hs index 82c3a6cbe..61978f79f 100644 --- a/src/Text/Pandoc/Readers/Org/Blocks.hs +++ b/src/Text/Pandoc/Readers/Org/Blocks.hs @@ -50,7 +50,7 @@ import Text.Pandoc.Shared ( compactify', compactify'DL ) import Control.Monad ( foldM, guard, mzero, void ) import Data.Char ( isSpace, toLower, toUpper) -import Data.List ( foldl', intersperse, isPrefixOf ) +import Data.List ( foldl', isPrefixOf ) import Data.Maybe ( fromMaybe, isNothing ) import Data.Monoid ((<>)) @@ -288,9 +288,9 @@ blockAttributes = try $ do let kvAttrs = foldl' (appendValues "ATTR_HTML") Nothing kv let name = lookup "NAME" kv let label = lookup "LABEL" kv - caption' <- maybe (return Nothing) - (fmap Just . parseFromString inlines) - caption + caption' <- case caption of + Nothing -> return Nothing + Just s -> Just <$> parseFromString inlines (s ++ "\n") kvAttrs' <- parseFromString keyValues . (++ "\n") $ fromMaybe mempty kvAttrs return $ BlockAttributes { blockAttrName = name @@ -427,7 +427,7 @@ verseBlock :: String -> OrgParser (F Blocks) verseBlock blockType = try $ do ignHeaders content <- rawBlockContent blockType - fmap B.para . mconcat . intersperse (pure B.linebreak) + fmap B.lineBlock . sequence <$> mapM parseVerseLine (lines content) where -- replace initial spaces with nonbreaking spaces to preserve diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index f181d523a..1b06c6f23 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -39,7 +39,7 @@ import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.Pandoc.Options import Control.Monad ( when, liftM, guard, mzero ) -import Data.List ( findIndex, intersperse, intercalate, +import Data.List ( findIndex, intercalate, transpose, sort, deleteFirstsBy, isSuffixOf , nub, union) import Data.Maybe (fromMaybe) import qualified Data.Map as M @@ -228,7 +228,7 @@ lineBlock :: RSTParser Blocks lineBlock = try $ do lines' <- lineBlockLines lines'' <- mapM parseInlineFromString lines' - return $ B.para (mconcat $ intersperse B.linebreak lines'') + return $ B.lineBlock lines'' -- -- paragraph block @@ -949,7 +949,8 @@ table = gridTable False <|> simpleTable False <|> -- inline :: RSTParser Inlines -inline = choice [ whitespace +inline = choice [ note -- can start with whitespace, so try before ws + , whitespace , link , str , endline @@ -958,7 +959,6 @@ inline = choice [ whitespace , code , subst , interpretedRole - , note , smart , hyphens , escapedChar @@ -1174,6 +1174,7 @@ subst = try $ do note :: RSTParser Inlines note = try $ do + optional whitespace ref <- noteMarker char '_' state <- getState |