summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/SelfContained.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/SelfContained.hs')
-rw-r--r--src/Text/Pandoc/SelfContained.hs162
1 files changed, 162 insertions, 0 deletions
diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs
new file mode 100644
index 000000000..9c609b8fe
--- /dev/null
+++ b/src/Text/Pandoc/SelfContained.hs
@@ -0,0 +1,162 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-
+Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.SelfContained
+ Copyright : Copyright (C) 2011 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Functions for converting an HTML file into one that can be viewed
+offline, by incorporating linked images, CSS, and scripts into
+the HTML using data URIs.
+-}
+module Text.Pandoc.SelfContained ( makeSelfContained ) where
+import Text.HTML.TagSoup
+import Network.URI (isAbsoluteURI, parseURI, escapeURIString)
+import Network.HTTP
+import Data.ByteString.Base64
+import qualified Data.ByteString.Char8 as B
+import Data.ByteString (ByteString)
+import Data.ByteString.UTF8 (toString, fromString)
+import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>))
+import Data.Char (toLower, isAscii, isAlphaNum)
+import Codec.Compression.GZip as Gzip
+import qualified Data.ByteString.Lazy as L
+import Text.Pandoc.Shared (findDataFile)
+import Text.Pandoc.MIME (getMimeType)
+import System.Directory (doesFileExist)
+
+getItem :: Maybe FilePath -> String -> IO (ByteString, Maybe String)
+getItem userdata f =
+ if isAbsoluteURI f
+ then openURL f
+ else do
+ let mime = case takeExtension f of
+ ".gz" -> getMimeType $ dropExtension f
+ x -> getMimeType x
+ exists <- doesFileExist f
+ if exists
+ then do
+ cont <- B.readFile f
+ return (cont, mime)
+ else do
+ res <- findDataFile userdata f
+ exists' <- doesFileExist res
+ if exists'
+ then do
+ cont <- B.readFile res
+ return (cont, mime)
+ else error $ "Could not find `" ++ f ++ "'"
+
+-- TODO - have this return mime type too - then it can work for google
+-- chart API, e.g.
+openURL :: String -> IO (ByteString, Maybe String)
+openURL u = getBodyAndMimeType =<< simpleHTTP (getReq u)
+ where getReq v = case parseURI v of
+ Nothing -> error $ "Could not parse URI: " ++ v
+ Just u' -> mkRequest GET u'
+ getBodyAndMimeType (Left err) = fail (show err)
+ getBodyAndMimeType (Right r) = return (rspBody r, findHeader HdrContentType r)
+
+isOk :: Char -> Bool
+isOk c = isAscii c && isAlphaNum c
+
+convertTag :: Maybe FilePath -> Tag String -> IO (Tag String)
+convertTag userdata t@(TagOpen "img" as) =
+ case fromAttrib "src" t of
+ [] -> return t
+ src -> do
+ (raw, mime) <- getRaw userdata (fromAttrib "type" t) src
+ let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
+ return $ TagOpen "img" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
+convertTag userdata t@(TagOpen "video" as) =
+ case fromAttrib "src" t of
+ [] -> return t
+ src -> do
+ (raw, mime) <- getRaw userdata (fromAttrib "type" t) src
+ let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
+ return $ TagOpen "video" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
+convertTag userdata t@(TagOpen "script" as) =
+ case fromAttrib "src" t of
+ [] -> return t
+ src -> do
+ (raw, mime) <- getRaw userdata (fromAttrib "type" t) src
+ let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
+ return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
+convertTag userdata t@(TagOpen "link" as) =
+ case fromAttrib "href" t of
+ [] -> return t
+ src -> do
+ (raw, mime) <- getRaw userdata (fromAttrib "type" t) src
+ let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
+ return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
+convertTag _ t = return t
+
+cssURLs :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString
+cssURLs userdata d orig =
+ case B.breakSubstring "url(" orig of
+ (x,y) | B.null y -> return orig
+ | otherwise -> do
+ let (u,v) = B.breakSubstring ")" $ B.drop 4 y
+ let url = toString
+ $ case B.take 1 u of
+ "\"" -> B.takeWhile (/='"') $ B.drop 1 u
+ _ -> u
+ (raw, mime) <- getRaw userdata "" (d </> url)
+ rest <- cssURLs userdata d v
+ let enc = "data:" `B.append` fromString mime `B.append`
+ ";base64," `B.append` (encode raw)
+ return $ x `B.append` "url(" `B.append` enc `B.append` rest
+
+getRaw :: Maybe FilePath -> String -> String -> IO (ByteString, String)
+getRaw userdata mimetype src = do
+ let ext = map toLower $ takeExtension src
+ (raw, respMime) <- getItem userdata src
+ let raw' = if ext == ".gz"
+ then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
+ $ [raw]
+ else raw
+ let mime = case (mimetype, respMime) of
+ ("",Nothing) -> error
+ $ "Could not determine mime type for `" ++ src ++ "'"
+ (x, Nothing) -> x
+ (_, Just x ) -> x
+ result <- if mime == "text/css"
+ then cssURLs userdata (takeDirectory src) raw'
+ else return raw'
+ return (result, mime)
+
+-- | Convert HTML into self-contained HTML, incorporating images,
+-- scripts, and CSS using data: URIs. Items specified using absolute
+-- URLs will be downloaded; those specified using relative URLs will
+-- be sought first relative to the working directory, then relative
+-- to the user data directory (if the first parameter is 'Just'
+-- a directory), and finally relative to pandoc's default data
+-- directory.
+makeSelfContained :: Maybe FilePath -> String -> IO String
+makeSelfContained userdata inp = do
+ let tags = parseTags inp
+ out' <- mapM (convertTag userdata) tags
+ return $ renderTagsOptions renderOptions{ optMinimize = (\t -> t == "br"
+ || t == "img" || t == "meta" || t == "link" ) } out'
+