summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/EPUB.hs
diff options
context:
space:
mode:
authorMatthew Pickering <matthewtpickering@gmail.com>2014-08-05 12:51:18 +0100
committerMatthew Pickering <matthewtpickering@gmail.com>2014-08-07 22:56:30 +0100
commit19d2ff68b19f52b78ffac166206d2facd7c7a015 (patch)
treef68796d74ad995576d49240c1785db01238d2cfc /src/Text/Pandoc/Readers/EPUB.hs
parent482f7f8e157b713b5dcf81303844721d827b16de (diff)
EPUB Reader: Improved how images are extracted
Diffstat (limited to 'src/Text/Pandoc/Readers/EPUB.hs')
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs68
1 files changed, 45 insertions, 23 deletions
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
index ca65a8f0f..968b815c0 100644
--- a/src/Text/Pandoc/Readers/EPUB.hs
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -22,7 +22,9 @@ import qualified Text.Pandoc.Builder as B
import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry
, findEntryByPath, Entry)
import qualified Data.ByteString.Lazy as BL (ByteString)
-import System.FilePath (takeFileName, (</>), dropFileName, normalise)
+import System.FilePath ( takeFileName, (</>), dropFileName, normalise
+ , joinPath, dropFileName, splitDirectories
+ , splitFileName )
import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
import Control.Applicative ((<$>))
import Control.Monad (guard, liftM, when)
@@ -48,13 +50,12 @@ runEPUB = either error id . runExcept
-- Note that internal reference are aggresively normalised so that all ids
-- are of the form "filename#id"
--
--- For now all paths are stripped from images
archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag)
archiveToEPUB os archive = do
(root, content) <- getManifest archive
meta <- parseMeta content
(cover, items) <- parseManifest content
- let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName <$> cover)
+ let coverDoc = fromMaybe mempty (imageToPandoc <$> cover)
spine <- parseSpine items content
let escapedSpine = map (escapeURI . takeFileName . fst) spine
Pandoc _ bs <-
@@ -68,16 +69,19 @@ archiveToEPUB os archive = do
os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]}
os'' = os' {readerParseRaw = True}
parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc
- parseSpineElem r (path, mime) = do
+ parseSpineElem (normalise -> r) (normalise -> path, mime) = do
when (readerTrace os) (traceM path)
- doc <- mimeToReader mime (normalise (r </> path))
+ doc <- mimeToReader mime r path
let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
- return $ docSpan <> fixInternalReferences (takeFileName path) doc
- mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc
- mimeToReader "application/xhtml+xml" path = do
- fname <- findEntryByPathE path archive
- return $ readHtml os'' . UTF8.toStringLazy $ fromEntry fname
- mimeToReader s path
+ return $ docSpan <> doc
+ mimeToReader :: MonadError String m => MIME -> FilePath -> FilePath -> m Pandoc
+ mimeToReader "application/xhtml+xml" r path = do
+ fname <- findEntryByPathE (r </> path) archive
+ return $ fixInternalReferences (r </> path) .
+ readHtml os'' .
+ UTF8.toStringLazy $
+ fromEntry fname
+ mimeToReader s _ path
| s `elem` imageMimes = return $ imageToPandoc path
| otherwise = return $ mempty
@@ -86,18 +90,34 @@ fetchImages :: [(FilePath, MIME)]
-> Archive
-> Pandoc
-> MediaBag
-fetchImages mimes root a (query iq -> links) =
+fetchImages mimes root arc (query iq -> links) =
foldr (uncurry3 insertMedia) mempty
(mapMaybe getEntry links)
where
- getEntry l = let mediaPos = normalise (root </> l) in
- (l , lookup mediaPos mimes, ) . fromEntry
- <$> findEntryByPath mediaPos a
+ getEntry (normalise -> l) =
+ let mediaPos = normalise (root </> l) in
+ (l , lookup mediaPos mimes, ) . fromEntry
+ <$> findEntryByPath mediaPos arc
iq :: Inline -> [FilePath]
iq (Image _ (url, _)) = [url]
iq _ = []
+-- Remove relative paths
+renameImages :: FilePath -> Inline -> Inline
+renameImages root (Image a (url, b)) = Image a (collapse (root </> url), b)
+renameImages _ x = x
+
+collapse :: FilePath -> FilePath
+collapse = joinPath . reverse . foldl go [] . splitDirectories
+ where
+ go rs "." = rs
+ go r@(p:rs) ".." = case p of
+ ".." -> ("..":r)
+ "/" -> ("..":r)
+ _ -> rs
+ go _ "/" = ["/"]
+ go rs x = x:rs
imageToPandoc :: FilePath -> Pandoc
imageToPandoc s = B.doc . B.para $ B.image s "" mempty
@@ -168,11 +188,14 @@ getManifest archive = do
-- Fixup
-fixInternalReferences :: String -> Pandoc -> Pandoc
-fixInternalReferences s =
- (walk normalisePath) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s')
+fixInternalReferences :: FilePath -> Pandoc -> Pandoc
+fixInternalReferences pathToFile =
+ (walk $ renameImages root)
+ . (walk normalisePath)
+ . (walk $ fixBlockIRs filename)
+ . (walk $ fixInlineIRs filename)
where
- s' = escapeURI s
+ (root, escapeURI -> filename) = splitFileName pathToFile
fixInlineIRs :: String -> Inline -> Inline
fixInlineIRs s (Span as v) =
@@ -227,12 +250,12 @@ foldM' f z (x:xs) = do
z' <- f z x
z' `deepseq` foldM' f z' xs
-traceM :: Monad m => String -> m ()
-traceM = flip trace (return ())
-
uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
uncurry3 f (a, b, c) = f a b c
+traceM :: Monad m => String -> m ()
+traceM = flip trace (return ())
+
-- Utility
stripNamespace :: QName -> String
@@ -270,4 +293,3 @@ findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
mkE :: MonadError String m => String -> Maybe a -> m a
mkE s = maybe (throwError s) return
-