summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/EPUB.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/EPUB.hs')
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs140
1 files changed, 66 insertions, 74 deletions
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
index 4c31bf1ae..3b13bbe13 100644
--- a/src/Text/Pandoc/Readers/EPUB.hs
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -1,55 +1,54 @@
-{-# LANGUAGE
- ViewPatterns
- , StandaloneDeriving
- , TupleSections
- , FlexibleContexts #-}
+{-# LANGUAGE FlexibleContexts #-}
+
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE ViewPatterns #-}
module Text.Pandoc.Readers.EPUB
(readEPUB)
where
-import Text.XML.Light
+import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry,
+ toArchiveOrFail)
+import Control.DeepSeq (NFData, deepseq)
+import Control.Monad (guard, liftM)
+import Control.Monad.Except (throwError)
+import qualified Data.ByteString.Lazy as BL (ByteString)
+import Data.List (isInfixOf, isPrefixOf)
+import qualified Data.Map as M (Map, elems, fromList, lookup)
+import Data.Maybe (fromMaybe, mapMaybe)
+import Data.Monoid ((<>))
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
+import Network.URI (unEscapeString)
+import System.FilePath (dropFileName, dropFileName, normalise, splitFileName,
+ takeFileName, (</>))
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Class (PandocMonad, insertMedia)
import Text.Pandoc.Definition hiding (Attr)
-import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Error
-import Text.Pandoc.Walk (walk, query)
-import Text.Pandoc.Options ( ReaderOptions(..), readerTrace)
-import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField)
-import Network.URI (unEscapeString)
-import Text.Pandoc.MediaBag (MediaBag, insertMedia)
-import Control.Monad.Except (MonadError, throwError, runExcept, Except)
+import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension)
import Text.Pandoc.MIME (MimeType)
-import qualified Text.Pandoc.Builder as B
-import Codec.Archive.Zip ( Archive (..), toArchiveOrFail, fromEntry
- , findEntryByPath, Entry)
-import qualified Data.ByteString.Lazy as BL (ByteString)
-import System.FilePath ( takeFileName, (</>), dropFileName, normalise
- , dropFileName
- , splitFileName )
+import Text.Pandoc.Options (ReaderOptions (..))
+import Text.Pandoc.Readers.HTML (readHtml)
+import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI)
import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
-import Control.Monad (guard, liftM, when)
-import Data.List (isPrefixOf, isInfixOf)
-import Data.Maybe (mapMaybe, fromMaybe)
-import qualified Data.Map as M (Map, lookup, fromList, elems)
-import Data.Monoid ((<>))
-import Control.DeepSeq (deepseq, NFData)
-
-import Debug.Trace (trace)
+import Text.Pandoc.Walk (query, walk)
+import Text.XML.Light
type Items = M.Map String (FilePath, MimeType)
-readEPUB :: ReaderOptions -> BL.ByteString -> Either PandocError (Pandoc, MediaBag)
+readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc
readEPUB opts bytes = case toArchiveOrFail bytes of
- Right archive -> runEPUB $ archiveToEPUB opts $ archive
- Left _ -> Left $ ParseFailure "Couldn't extract ePub file"
+ Right archive -> archiveToEPUB opts archive
+ Left _ -> throwError $ PandocParseError "Couldn't extract ePub file"
-runEPUB :: Except PandocError a -> Either PandocError a
-runEPUB = runExcept
+-- runEPUB :: Except PandocError a -> Either PandocError a
+-- runEPUB = runExcept
-- Note that internal reference are aggresively normalised so that all ids
-- are of the form "filename#id"
--
-archiveToEPUB :: (MonadError PandocError m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag)
+archiveToEPUB :: (PandocMonad m) => ReaderOptions -> Archive -> m Pandoc
archiveToEPUB os archive = do
-- root is path to folder with manifest file in
(root, content) <- getManifest archive
@@ -62,40 +61,36 @@ archiveToEPUB os archive = do
Pandoc _ bs <-
foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine))
`liftM` parseSpineElem root b) mempty spine
- let ast = coverDoc <> (Pandoc meta bs)
- let mediaBag = fetchImages (M.elems items) root archive ast
- return $ (ast, mediaBag)
+ let ast = coverDoc <> Pandoc meta bs
+ fetchImages (M.elems items) root archive ast
+ return ast
where
- os' = os {readerParseRaw = True}
- parseSpineElem :: MonadError PandocError m => FilePath -> (FilePath, MimeType) -> m Pandoc
+ os' = os {readerExtensions = enableExtension Ext_raw_html (readerExtensions os)}
+ parseSpineElem :: PandocMonad m => FilePath -> (FilePath, MimeType) -> m Pandoc
parseSpineElem (normalise -> r) (normalise -> path, mime) = do
- when (readerTrace os) (traceM path)
doc <- mimeToReader mime r path
let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
return $ docSpan <> doc
- mimeToReader :: MonadError PandocError m => MimeType -> FilePath -> FilePath -> m Pandoc
+ mimeToReader :: PandocMonad m => MimeType -> FilePath -> FilePath -> m Pandoc
mimeToReader "application/xhtml+xml" (unEscapeString -> root)
(unEscapeString -> path) = do
fname <- findEntryByPathE (root </> path) archive
- html <- either throwError return .
- readHtml os' .
- UTF8.toStringLazy $
- fromEntry fname
+ html <- readHtml os' . TL.toStrict . TL.decodeUtf8 $ fromEntry fname
return $ fixInternalReferences path html
mimeToReader s _ (unEscapeString -> path)
| s `elem` imageMimes = return $ imageToPandoc path
- | otherwise = return $ mempty
+ | otherwise = return mempty
-- paths should be absolute when this function is called
-- renameImages should do this
-fetchImages :: [(FilePath, MimeType)]
+fetchImages :: PandocMonad m
+ => [(FilePath, MimeType)]
-> FilePath -- ^ Root
-> Archive
-> Pandoc
- -> MediaBag
+ -> m ()
fetchImages mimes root arc (query iq -> links) =
- foldr (uncurry3 insertMedia) mempty
- (mapMaybe getEntry links)
+ mapM_ (uncurry3 insertMedia) (mapMaybe getEntry links)
where
getEntry link =
let abslink = normalise (root </> link) in
@@ -104,7 +99,7 @@ fetchImages mimes root arc (query iq -> links) =
iq :: Inline -> [FilePath]
iq (Image _ _ (url, _)) = [url]
-iq _ = []
+iq _ = []
-- Remove relative paths
renameImages :: FilePath -> Inline -> Inline
@@ -121,13 +116,13 @@ imageMimes = ["image/gif", "image/jpeg", "image/png"]
type CoverImage = FilePath
-parseManifest :: (MonadError PandocError m) => Element -> m (Maybe CoverImage, Items)
+parseManifest :: (PandocMonad m) => Element -> m (Maybe CoverImage, Items)
parseManifest content = do
manifest <- findElementE (dfName "manifest") content
let items = findChildren (dfName "item") manifest
r <- mapM parseItem items
let cover = findAttr (emptyName "href") =<< filterChild findCover manifest
- return (cover, (M.fromList r))
+ return (cover, M.fromList r)
where
findCover e = maybe False (isInfixOf "cover-image")
(findAttr (emptyName "properties") e)
@@ -137,18 +132,18 @@ parseManifest content = do
mime <- findAttrE (emptyName "media-type") e
return (uid, (href, mime))
-parseSpine :: MonadError PandocError m => Items -> Element -> m [(FilePath, MimeType)]
+parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)]
parseSpine is e = do
spine <- findElementE (dfName "spine") e
let itemRefs = findChildren (dfName "itemref") spine
- mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs
+ mapM (mkE "parseSpine" . flip M.lookup is) $ mapMaybe parseItemRef itemRefs
where
parseItemRef ref = do
let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref)
guard linear
findAttr (emptyName "idref") ref
-parseMeta :: MonadError PandocError m => Element -> m Meta
+parseMeta :: PandocMonad m => Element -> m Meta
parseMeta content = do
meta <- findElementE (dfName "metadata") content
let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True
@@ -164,29 +159,29 @@ parseMetaItem e@(stripNamespace . elName -> field) meta =
renameMeta :: String -> String
renameMeta "creator" = "author"
-renameMeta s = s
+renameMeta s = s
-getManifest :: MonadError PandocError m => Archive -> m (String, Element)
+getManifest :: PandocMonad m => Archive -> m (String, Element)
getManifest archive = do
metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
- as <- liftM ((map attrToPair) . elAttribs)
+ as <- fmap (map attrToPair . elAttribs)
(findElementE (QName "rootfile" (Just ns) Nothing) docElem)
manifestFile <- mkE "Root not found" (lookup "full-path" as)
let rootdir = dropFileName manifestFile
--mime <- lookup "media-type" as
manifest <- findEntryByPathE manifestFile archive
- liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
+ fmap ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
-- Fixup
fixInternalReferences :: FilePath -> Pandoc -> Pandoc
fixInternalReferences pathToFile =
- (walk $ renameImages root)
- . (walk $ fixBlockIRs filename)
- . (walk $ fixInlineIRs filename)
+ walk (renameImages root)
+ . walk (fixBlockIRs filename)
+ . walk (fixInlineIRs filename)
where
(root, escapeURI -> filename) = splitFileName pathToFile
@@ -221,7 +216,7 @@ fixAttrs :: FilePath -> B.Attr -> B.Attr
fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs)
addHash :: String -> String -> String
-addHash _ "" = ""
+addHash _ "" = ""
addHash s ident = takeFileName s ++ "#" ++ ident
removeEPUBAttrs :: [(String, String)] -> [(String, String)]
@@ -242,9 +237,6 @@ foldM' f z (x:xs) = do
uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
uncurry3 f (a, b, c) = f a b c
-traceM :: Monad m => String -> m ()
-traceM = flip trace (return ())
-
-- Utility
stripNamespace :: QName -> String
@@ -252,7 +244,7 @@ stripNamespace (QName v _ _) = v
attrToNSPair :: Attr -> Maybe (String, String)
attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val)
-attrToNSPair _ = Nothing
+attrToNSPair _ = Nothing
attrToPair :: Attr -> (String, String)
attrToPair (Attr (QName name _ _) val) = (name, val)
@@ -268,18 +260,18 @@ emptyName s = QName s Nothing Nothing
-- Convert Maybe interface to Either
-findAttrE :: MonadError PandocError m => QName -> Element -> m String
+findAttrE :: PandocMonad m => QName -> Element -> m String
findAttrE q e = mkE "findAttr" $ findAttr q e
-findEntryByPathE :: MonadError PandocError m => FilePath -> Archive -> m Entry
+findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry
findEntryByPathE (normalise -> path) a =
mkE ("No entry on path: " ++ path) $ findEntryByPath path a
-parseXMLDocE :: MonadError PandocError m => String -> m Element
+parseXMLDocE :: PandocMonad m => String -> m Element
parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
-findElementE :: MonadError PandocError m => QName -> Element -> m Element
+findElementE :: PandocMonad m => QName -> Element -> m Element
findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
-mkE :: MonadError PandocError m => String -> Maybe a -> m a
-mkE s = maybe (throwError . ParseFailure $ s) return
+mkE :: PandocMonad m => String -> Maybe a -> m a
+mkE s = maybe (throwError . PandocParseError $ s) return