diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/EPUB.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/EPUB.hs | 140 |
1 files changed, 66 insertions, 74 deletions
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 4c31bf1ae..3b13bbe13 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -1,55 +1,54 @@ -{-# LANGUAGE - ViewPatterns - , StandaloneDeriving - , TupleSections - , FlexibleContexts #-} +{-# LANGUAGE FlexibleContexts #-} + +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE ViewPatterns #-} module Text.Pandoc.Readers.EPUB (readEPUB) where -import Text.XML.Light +import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry, + toArchiveOrFail) +import Control.DeepSeq (NFData, deepseq) +import Control.Monad (guard, liftM) +import Control.Monad.Except (throwError) +import qualified Data.ByteString.Lazy as BL (ByteString) +import Data.List (isInfixOf, isPrefixOf) +import qualified Data.Map as M (Map, elems, fromList, lookup) +import Data.Maybe (fromMaybe, mapMaybe) +import Data.Monoid ((<>)) +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TL +import Network.URI (unEscapeString) +import System.FilePath (dropFileName, dropFileName, normalise, splitFileName, + takeFileName, (</>)) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad, insertMedia) import Text.Pandoc.Definition hiding (Attr) -import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Error -import Text.Pandoc.Walk (walk, query) -import Text.Pandoc.Options ( ReaderOptions(..), readerTrace) -import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField) -import Network.URI (unEscapeString) -import Text.Pandoc.MediaBag (MediaBag, insertMedia) -import Control.Monad.Except (MonadError, throwError, runExcept, Except) +import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension) import Text.Pandoc.MIME (MimeType) -import qualified Text.Pandoc.Builder as B -import Codec.Archive.Zip ( Archive (..), toArchiveOrFail, fromEntry - , findEntryByPath, Entry) -import qualified Data.ByteString.Lazy as BL (ByteString) -import System.FilePath ( takeFileName, (</>), dropFileName, normalise - , dropFileName - , splitFileName ) +import Text.Pandoc.Options (ReaderOptions (..)) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI) import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) -import Control.Monad (guard, liftM, when) -import Data.List (isPrefixOf, isInfixOf) -import Data.Maybe (mapMaybe, fromMaybe) -import qualified Data.Map as M (Map, lookup, fromList, elems) -import Data.Monoid ((<>)) -import Control.DeepSeq (deepseq, NFData) - -import Debug.Trace (trace) +import Text.Pandoc.Walk (query, walk) +import Text.XML.Light type Items = M.Map String (FilePath, MimeType) -readEPUB :: ReaderOptions -> BL.ByteString -> Either PandocError (Pandoc, MediaBag) +readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc readEPUB opts bytes = case toArchiveOrFail bytes of - Right archive -> runEPUB $ archiveToEPUB opts $ archive - Left _ -> Left $ ParseFailure "Couldn't extract ePub file" + Right archive -> archiveToEPUB opts archive + Left _ -> throwError $ PandocParseError "Couldn't extract ePub file" -runEPUB :: Except PandocError a -> Either PandocError a -runEPUB = runExcept +-- runEPUB :: Except PandocError a -> Either PandocError a +-- runEPUB = runExcept -- Note that internal reference are aggresively normalised so that all ids -- are of the form "filename#id" -- -archiveToEPUB :: (MonadError PandocError m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) +archiveToEPUB :: (PandocMonad m) => ReaderOptions -> Archive -> m Pandoc archiveToEPUB os archive = do -- root is path to folder with manifest file in (root, content) <- getManifest archive @@ -62,40 +61,36 @@ archiveToEPUB os archive = do Pandoc _ bs <- foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine)) `liftM` parseSpineElem root b) mempty spine - let ast = coverDoc <> (Pandoc meta bs) - let mediaBag = fetchImages (M.elems items) root archive ast - return $ (ast, mediaBag) + let ast = coverDoc <> Pandoc meta bs + fetchImages (M.elems items) root archive ast + return ast where - os' = os {readerParseRaw = True} - parseSpineElem :: MonadError PandocError m => FilePath -> (FilePath, MimeType) -> m Pandoc + os' = os {readerExtensions = enableExtension Ext_raw_html (readerExtensions os)} + parseSpineElem :: PandocMonad m => FilePath -> (FilePath, MimeType) -> m Pandoc parseSpineElem (normalise -> r) (normalise -> path, mime) = do - when (readerTrace os) (traceM path) doc <- mimeToReader mime r path let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty return $ docSpan <> doc - mimeToReader :: MonadError PandocError m => MimeType -> FilePath -> FilePath -> m Pandoc + mimeToReader :: PandocMonad m => MimeType -> FilePath -> FilePath -> m Pandoc mimeToReader "application/xhtml+xml" (unEscapeString -> root) (unEscapeString -> path) = do fname <- findEntryByPathE (root </> path) archive - html <- either throwError return . - readHtml os' . - UTF8.toStringLazy $ - fromEntry fname + html <- readHtml os' . TL.toStrict . TL.decodeUtf8 $ fromEntry fname return $ fixInternalReferences path html mimeToReader s _ (unEscapeString -> path) | s `elem` imageMimes = return $ imageToPandoc path - | otherwise = return $ mempty + | otherwise = return mempty -- paths should be absolute when this function is called -- renameImages should do this -fetchImages :: [(FilePath, MimeType)] +fetchImages :: PandocMonad m + => [(FilePath, MimeType)] -> FilePath -- ^ Root -> Archive -> Pandoc - -> MediaBag + -> m () fetchImages mimes root arc (query iq -> links) = - foldr (uncurry3 insertMedia) mempty - (mapMaybe getEntry links) + mapM_ (uncurry3 insertMedia) (mapMaybe getEntry links) where getEntry link = let abslink = normalise (root </> link) in @@ -104,7 +99,7 @@ fetchImages mimes root arc (query iq -> links) = iq :: Inline -> [FilePath] iq (Image _ _ (url, _)) = [url] -iq _ = [] +iq _ = [] -- Remove relative paths renameImages :: FilePath -> Inline -> Inline @@ -121,13 +116,13 @@ imageMimes = ["image/gif", "image/jpeg", "image/png"] type CoverImage = FilePath -parseManifest :: (MonadError PandocError m) => Element -> m (Maybe CoverImage, Items) +parseManifest :: (PandocMonad m) => Element -> m (Maybe CoverImage, Items) parseManifest content = do manifest <- findElementE (dfName "manifest") content let items = findChildren (dfName "item") manifest r <- mapM parseItem items let cover = findAttr (emptyName "href") =<< filterChild findCover manifest - return (cover, (M.fromList r)) + return (cover, M.fromList r) where findCover e = maybe False (isInfixOf "cover-image") (findAttr (emptyName "properties") e) @@ -137,18 +132,18 @@ parseManifest content = do mime <- findAttrE (emptyName "media-type") e return (uid, (href, mime)) -parseSpine :: MonadError PandocError m => Items -> Element -> m [(FilePath, MimeType)] +parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)] parseSpine is e = do spine <- findElementE (dfName "spine") e let itemRefs = findChildren (dfName "itemref") spine - mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + mapM (mkE "parseSpine" . flip M.lookup is) $ mapMaybe parseItemRef itemRefs where parseItemRef ref = do let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) guard linear findAttr (emptyName "idref") ref -parseMeta :: MonadError PandocError m => Element -> m Meta +parseMeta :: PandocMonad m => Element -> m Meta parseMeta content = do meta <- findElementE (dfName "metadata") content let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True @@ -164,29 +159,29 @@ parseMetaItem e@(stripNamespace . elName -> field) meta = renameMeta :: String -> String renameMeta "creator" = "author" -renameMeta s = s +renameMeta s = s -getManifest :: MonadError PandocError m => Archive -> m (String, Element) +getManifest :: PandocMonad m => Archive -> m (String, Element) getManifest archive = do metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry let namespaces = mapMaybe attrToNSPair (elAttribs docElem) ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) - as <- liftM ((map attrToPair) . elAttribs) + as <- fmap (map attrToPair . elAttribs) (findElementE (QName "rootfile" (Just ns) Nothing) docElem) manifestFile <- mkE "Root not found" (lookup "full-path" as) let rootdir = dropFileName manifestFile --mime <- lookup "media-type" as manifest <- findEntryByPathE manifestFile archive - liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + fmap ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) -- Fixup fixInternalReferences :: FilePath -> Pandoc -> Pandoc fixInternalReferences pathToFile = - (walk $ renameImages root) - . (walk $ fixBlockIRs filename) - . (walk $ fixInlineIRs filename) + walk (renameImages root) + . walk (fixBlockIRs filename) + . walk (fixInlineIRs filename) where (root, escapeURI -> filename) = splitFileName pathToFile @@ -221,7 +216,7 @@ fixAttrs :: FilePath -> B.Attr -> B.Attr fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) addHash :: String -> String -> String -addHash _ "" = "" +addHash _ "" = "" addHash s ident = takeFileName s ++ "#" ++ ident removeEPUBAttrs :: [(String, String)] -> [(String, String)] @@ -242,9 +237,6 @@ foldM' f z (x:xs) = do uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d uncurry3 f (a, b, c) = f a b c -traceM :: Monad m => String -> m () -traceM = flip trace (return ()) - -- Utility stripNamespace :: QName -> String @@ -252,7 +244,7 @@ stripNamespace (QName v _ _) = v attrToNSPair :: Attr -> Maybe (String, String) attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) -attrToNSPair _ = Nothing +attrToNSPair _ = Nothing attrToPair :: Attr -> (String, String) attrToPair (Attr (QName name _ _) val) = (name, val) @@ -268,18 +260,18 @@ emptyName s = QName s Nothing Nothing -- Convert Maybe interface to Either -findAttrE :: MonadError PandocError m => QName -> Element -> m String +findAttrE :: PandocMonad m => QName -> Element -> m String findAttrE q e = mkE "findAttr" $ findAttr q e -findEntryByPathE :: MonadError PandocError m => FilePath -> Archive -> m Entry +findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry findEntryByPathE (normalise -> path) a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a -parseXMLDocE :: MonadError PandocError m => String -> m Element +parseXMLDocE :: PandocMonad m => String -> m Element parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc -findElementE :: MonadError PandocError m => QName -> Element -> m Element +findElementE :: PandocMonad m => QName -> Element -> m Element findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x -mkE :: MonadError PandocError m => String -> Maybe a -> m a -mkE s = maybe (throwError . ParseFailure $ s) return +mkE :: PandocMonad m => String -> Maybe a -> m a +mkE s = maybe (throwError . PandocParseError $ s) return |