From b5210bc175ca7915934082158d4197ab89efa9a2 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 30 Jul 2014 11:44:25 -0700 Subject: Added --extract-media option. This has been documented to affect the epub and docx readers, so we should either add the epub reader before the next release or change the documentation. --- README | 8 +++++++- pandoc.hs | 63 +++++++++++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/README b/README index 2dfd5bb3d..c5223543c 100644 --- a/README +++ b/README @@ -318,7 +318,13 @@ Reader options classes, respectively. The author and time of change is included. *all* is useful for scripting: only accepting changes from a certain reviewer, say, or before a certain date. This - option only affects the Docx reader. + option only affects the docx reader. + +`--extract-media=`*DIR* +: Extract images and other media contained in a docx or epub container + to the path *DIR*, creating it if necessary, and adjust the images + references in the document so they point to the extracted files. + This option only affects the docx and epub readers. General writer options ---------------------- diff --git a/pandoc.hs b/pandoc.hs index 629c16c86..a1dedae3d 100644 --- a/pandoc.hs +++ b/pandoc.hs @@ -33,6 +33,7 @@ module Main where import Text.Pandoc import Text.Pandoc.Builder (setMeta) import Text.Pandoc.PDF (makePDF) +import Text.Pandoc.Walk (walk) import Text.Pandoc.Readers.LaTeX (handleIncludes) import Text.Pandoc.Shared ( tabFilter, readDataFileUTF8, readDataFile, safeRead, headerShift, normalize, err, warn, @@ -49,7 +50,8 @@ import System.Console.GetOpt import Data.Char ( toLower ) import Data.List ( intercalate, isPrefixOf, isSuffixOf, sort ) import System.Directory ( getAppUserDataDirectory, findExecutable, - doesFileExist, Permissions(..), getPermissions ) + doesFileExist, Permissions(..), getPermissions, + createDirectoryIfMissing ) import System.IO ( stdout, stderr ) import System.IO.Error ( isDoesNotExistError ) import qualified Control.Exception as E @@ -182,6 +184,7 @@ data Opt = Opt , optAscii :: Bool -- ^ Use ascii characters only in html , optTeXLigatures :: Bool -- ^ Use TeX ligatures for quotes/dashes , optDefaultImageExtension :: String -- ^ Default image extension + , optExtractMedia :: Maybe FilePath -- ^ Path to extract embedded media , optTrace :: Bool -- ^ Print debug information , optTrackChanges :: TrackChanges -- ^ Accept or reject MS Word track-changes. } @@ -239,6 +242,7 @@ defaultOpts = Opt , optAscii = False , optTeXLigatures = True , optDefaultImageExtension = "" + , optExtractMedia = Nothing , optTrace = False , optTrackChanges = AcceptChanges } @@ -343,6 +347,26 @@ options = "NUMBER") "" -- "Tab stop (default 4)" + , Option "" ["track-changes"] + (ReqArg + (\arg opt -> do + action <- case arg of + "accept" -> return AcceptChanges + "reject" -> return RejectChanges + "all" -> return AllChanges + _ -> err 6 + ("Unknown option for track-changes: " ++ arg) + return opt { optTrackChanges = action }) + "accept|reject|all") + "" -- "Accepting or reject MS Word track-changes."" + + , Option "" ["extract-media"] + (ReqArg + (\arg opt -> do + return opt { optExtractMedia = Just arg }) + "PATH") + "" -- "Directory to which to extract embedded media" + , Option "s" ["standalone"] (NoArg (\opt -> return opt { optStandalone = True })) @@ -787,19 +811,6 @@ options = (\opt -> return opt { optTrace = True })) "" -- "Turn on diagnostic tracing in readers." - , Option "" ["track-changes"] - (ReqArg - (\arg opt -> do - action <- case arg of - "accept" -> return AcceptChanges - "reject" -> return RejectChanges - "all" -> return AllChanges - _ -> err 6 - ("Unknown option for track-changes: " ++ arg) - return opt { optTrackChanges = action }) - "accept|reject|all") - "" -- "Accepting or reject MS Word track-changes."" - , Option "" ["dump-args"] (NoArg (\opt -> return opt { optDumpArgs = True })) @@ -998,6 +1009,7 @@ main = do , optAscii = ascii , optTeXLigatures = texLigatures , optDefaultImageExtension = defaultImageExtension + , optExtractMedia = mbExtractMedia , optTrace = trace , optTrackChanges = trackChanges } = opts @@ -1196,13 +1208,32 @@ main = do then handleIncludes else return + let writeMedia :: FilePath -> (FilePath, B.ByteString) -> IO () + writeMedia dir (subpath, bs) = do + -- we join and split to convert a/b/c to a\b\c on Windows; + -- in zip containers all paths use / + let fullpath = dir joinPath (splitPath subpath) + createDirectoryIfMissing True $ takeDirectory fullpath + warn $ "extracting " ++ fullpath + B.writeFile fullpath bs + + let adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline + adjustImagePath dir paths (Image lab (src, tit)) + | src `elem` paths = Image lab (dir ++ "/" ++ src, tit) + adjustImagePath _ _ x = x + doc <- case reader of StringReader r-> readSources sources >>= handleIncludes' . convertTabs . intercalate "\n" >>= r readerOpts - ByteStringReader r -> readFiles sources >>= r readerOpts >>= - (return . fst) + ByteStringReader r -> do + (d, media) <- readFiles sources >>= r readerOpts + case mbExtractMedia of + Just dir | not (M.null media) -> do + mapM_ (writeMedia dir) $ M.toList media + return $ walk (adjustImagePath dir (M.keys media)) d + _ -> return d let doc0 = M.foldWithKey setMeta doc metadata -- cgit v1.2.3