From 00662faefbca0b9889d3d79dbb2985350356d18a Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 31 Jul 2014 11:04:40 -0700 Subject: Made MediaBag a newtype, and added mime type information to media. Shared now exports functions for interacting with a MediaBag: - `emptyMediaBag` - `lookuMedia` - `insertMedia` - `mediaDirectory` - `extractMediaBag` --- src/Text/Pandoc/Options.hs | 5 +-- src/Text/Pandoc/Readers/Docx.hs | 4 +- src/Text/Pandoc/SelfContained.hs | 26 +++++++------ src/Text/Pandoc/Shared.hs | 80 ++++++++++++++++++++++++++++++++-------- src/Text/Pandoc/Writers/EPUB.hs | 2 +- 5 files changed, 85 insertions(+), 32 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index 2de1a9e35..bf6b3d910 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -49,8 +49,7 @@ import Data.Set (Set) import qualified Data.Set as Set import Data.Default import Text.Pandoc.Highlighting (Style, pygments) -import Text.Pandoc.Shared (MediaBag) -import qualified Data.Map as M +import Text.Pandoc.Shared (MediaBag, emptyMediaBag) -- | Individually selectable syntax extensions. data Extension = @@ -359,7 +358,7 @@ instance Default WriterOptions where , writerTOCDepth = 3 , writerReferenceODT = Nothing , writerReferenceDocx = Nothing - , writerMediaBag = M.empty + , writerMediaBag = emptyMediaBag } -- | Returns True if the given extension is enabled. diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index f5fb6565a..2fb4da2d9 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -108,7 +108,7 @@ data DState = DState { docxAnchorMap :: M.Map String String defaultDState :: DState defaultDState = DState { docxAnchorMap = M.empty - , docxMediaBag = M.empty + , docxMediaBag = emptyMediaBag , docxInHeaderBlock = False , docxInTexSubscript = False} @@ -369,7 +369,7 @@ parPartToInlines (BookMark _ anchor) = return [Span (newAnchor, ["anchor"], []) []] parPartToInlines (Drawing fp bs) = do mediaBag <- gets docxMediaBag - modify $ \s -> s { docxMediaBag = M.insert fp bs mediaBag} + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } return [Image [] (fp, "")] parPartToInlines (InternalHyperLink anchor runs) = do ils <- concatMapM runToInlines runs diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index 777da3551..77f8b6530 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -41,11 +41,11 @@ import System.FilePath (takeExtension, dropExtension, takeDirectory, ()) import Data.Char (toLower, isAscii, isAlphaNum) import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err, MediaBag) +import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err, + MediaBag, lookupMedia) import Text.Pandoc.UTF8 (toString, fromString) import Text.Pandoc.MIME (getMimeType) import System.Directory (doesFileExist) -import qualified Data.Map as M isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c @@ -110,16 +110,20 @@ getItem media userdata f = -- this is needed for things like cmunrm.eot?#iefix, -- which is used to get old versions of IE to work with web fonts. let f' = takeWhile (\c -> c /= '?' && c /= '#') f - let mime = case takeExtension f' of - ".gz" -> getMimeType $ dropExtension f' - x -> getMimeType x + let mbMime = case takeExtension f' of + ".gz" -> getMimeType $ dropExtension f' + x -> getMimeType x exists <- doesFileExist f' - cont <- if exists - then B.readFile f' - else case M.lookup f media of - Just bs -> return $ BS.concat $ L.toChunks bs - Nothing -> readDataFile userdata f' - return (cont, mime) + if exists + then do + cont <- B.readFile f' + return (cont, mbMime) + else case lookupMedia f media of + Just (mime,bs) -> return (BS.concat $ L.toChunks bs, + Just mime) + Nothing -> do + cont <- readDataFile userdata f' + return (cont, mbMime) where handleErr e = err 61 $ "Failed to retrieve " ++ f ++ "\n" ++ show e getRaw :: MediaBag -> Maybe FilePath -> String -> String diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index ee48f5bc1..deab1abc0 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -1,5 +1,5 @@ {-# LANGUAGE DeriveDataTypeable, CPP, MultiParamTypeClasses, - FlexibleContexts, ScopedTypeVariables #-} + FlexibleContexts, ScopedTypeVariables, GeneralizedNewtypeDeriving #-} {- Copyright (C) 2006-2014 John MacFarlane @@ -50,7 +50,11 @@ module Text.Pandoc.Shared ( tabFilter, -- * Media Handling MediaBag, + emptyMediaBag, + lookupMedia, insertMedia, + mediaDirectory, + extractMediaBag, -- * Date/time normalizeDate, -- * Pandoc block and inline list processing @@ -102,16 +106,18 @@ import Data.Char ( toLower, isLower, isUpper, isAlpha, isLetter, isDigit, isSpace ) import Data.List ( find, isPrefixOf, intercalate ) import qualified Data.Map as M +import Data.Maybe ( fromMaybe ) import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, unEscapeString, parseURIReference ) import qualified Data.Set as Set import System.Directory import Text.Pandoc.MIME (getMimeType) -import System.FilePath ( (), takeExtension, dropExtension ) +import System.FilePath ( (), takeExtension, dropExtension, takeDirectory, + splitPath, joinPath ) import Data.Generics (Typeable, Data) import qualified Control.Monad.State as S import qualified Control.Exception as E -import Control.Monad (msum, unless) +import Control.Monad (msum, unless, MonadPlus(..), when) import Text.Pandoc.Pretty (charWidth) import System.Locale (defaultTimeLocale) import Data.Time @@ -292,15 +298,63 @@ tabFilter tabStop = --- Media handling --- --- | A map of media paths to their binary representations. -type MediaBag = M.Map String BL.ByteString +-- | A container for a collection of binary resources, with names and +-- mime types. +newtype MediaBag = MediaBag (M.Map String (String, BL.ByteString)) + deriving (Monoid) --- | Insert a media item into a `MediaBag` -insertMedia :: FilePath - -> BL.ByteString +instance Show MediaBag where + show bag = "MediaBag " ++ show (mediaDirectory bag) + +emptyMediaBag :: MediaBag +emptyMediaBag = MediaBag M.empty + +-- | Insert a media item into a 'MediaBag', replacing any existing +-- value with the same name. +insertMedia :: FilePath -- ^ relative path and canonical name of resource + -> Maybe String -- ^ mime type (Nothing = determine from extension) + -> BL.ByteString -- ^ contents of resource + -> MediaBag -> MediaBag +insertMedia fp mbMime contents (MediaBag mediamap) = + MediaBag (M.insert fp (mime, contents) mediamap) + where mime = fromMaybe "application/octet-stream" (mbMime `mplus` fallback) + fallback = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + _ -> getMimeType fp + +-- | Lookup a media item in a 'MediaBag', returning mime type and contents. +lookupMedia :: FilePath -> MediaBag -insertMedia = M.insert + -> Maybe (String, BL.ByteString) +lookupMedia fp (MediaBag mediamap) = M.lookup fp mediamap + +-- | Get a list of the file paths stored in a 'MediaBag', with +-- their corresponding mime types and the lengths in bytes of the contents. +mediaDirectory :: MediaBag -> [(String, String, Int)] +mediaDirectory (MediaBag mediamap) = + M.foldWithKey (\fp (mime,contents) -> + ((fp, mime, fromIntegral $ BL.length contents):)) [] mediamap + +-- | Extract contents of MediaBag to a given directory. Print informational +-- messages if 'verbose' is true. +extractMediaBag :: Bool + -> FilePath + -> MediaBag + -> IO () +extractMediaBag verbose dir (MediaBag mediamap) = do + sequence_ $ M.foldWithKey + (\fp (_ ,contents) -> + ((writeMedia verbose dir (fp, contents)):)) [] mediamap + +writeMedia :: Bool -> FilePath -> (FilePath, BL.ByteString) -> IO () +writeMedia verbose dir (subpath, bs) = do + -- we join and split to convert a/b/c to a\b\c on Windows; + -- in zip containers all paths use / + let fullpath = dir joinPath (splitPath subpath) + createDirectoryIfMissing True $ takeDirectory fullpath + when verbose $ warn $ "extracting " ++ fullpath + BL.writeFile fullpath bs -- -- Date/time @@ -803,13 +857,9 @@ fetchItem sourceURL s fetchItem' :: MediaBag -> Maybe String -> String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) fetchItem' media sourceURL s = do - case M.lookup s media of + case lookupMedia s media of Nothing -> fetchItem sourceURL s - Just bs -> do - let mime = case takeExtension s of - ".gz" -> getMimeType $ dropExtension s - x -> getMimeType x - return $ Right (BS.concat $ toChunks bs, mime) + Just (mime, bs) -> return $ Right (BS.concat $ toChunks bs, Just mime) -- | Read from a URL and return raw data and maybe mime type. openURL :: String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 1f222b8b8..62dd70e73 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -793,7 +793,7 @@ transformInline opts mediaRef (Image lab (src,tit)) = do return $ Image lab (newsrc, tit) transformInline opts _ (x@(Math _ _)) | WebTeX _ <- writerHTMLMathMethod opts = do - raw <- makeSelfContained M.empty Nothing $ writeHtmlInline opts x + raw <- makeSelfContained emptyMediaBag Nothing $ writeHtmlInline opts x return $ RawInline (Format "html") raw transformInline opts mediaRef (RawInline fmt raw) | fmt == Format "html" = do -- cgit v1.2.3