summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/SelfContained.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/SelfContained.hs')
-rw-r--r--src/Text/Pandoc/SelfContained.hs270
1 files changed, 185 insertions, 85 deletions
diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs
index d08d636df..a1c5c919e 100644
--- a/src/Text/Pandoc/SelfContained.hs
+++ b/src/Text/Pandoc/SelfContained.hs
@@ -1,6 +1,6 @@
{-# LANGUAGE OverloadedStrings #-}
{-
-Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2011-2018 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.SelfContained
- Copyright : Copyright (C) 2011-2016 John MacFarlane
+ Copyright : Copyright (C) 2011-2018 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -30,32 +30,36 @@ Functions for converting an HTML file into one that can be viewed
offline, by incorporating linked images, CSS, and scripts into
the HTML using data URIs.
-}
-module Text.Pandoc.SelfContained ( makeSelfContained ) where
-import Text.HTML.TagSoup
-import Network.URI (isURI, escapeURIString, URI(..), parseURI)
+module Text.Pandoc.SelfContained ( makeDataURI, makeSelfContained ) where
+import Codec.Compression.GZip as Gzip
+import Control.Applicative ((<|>))
+import Control.Monad.Except (throwError)
+import Control.Monad.Trans (lift)
+import Data.ByteString (ByteString)
import Data.ByteString.Base64
import qualified Data.ByteString.Char8 as B
-import Data.ByteString (ByteString)
-import System.FilePath (takeExtension, takeDirectory, (</>))
-import Data.Char (toLower, isAscii, isAlphaNum)
-import Codec.Compression.GZip as Gzip
import qualified Data.ByteString.Lazy as L
-import Text.Pandoc.Shared (renderTags', err, fetchItem', warn, trim)
-import Text.Pandoc.MediaBag (MediaBag)
+import Data.Char (isAlphaNum, isAscii, toLower)
+import Data.List (isPrefixOf)
+import Data.Monoid ((<>))
+import Network.URI (escapeURIString)
+import System.FilePath (takeDirectory, takeExtension, (</>))
+import Text.HTML.TagSoup
+import Text.Pandoc.Class (PandocMonad (..), fetchItem, getInputFiles, report,
+ setInputFiles)
+import Text.Pandoc.Error
+import Text.Pandoc.Logging
import Text.Pandoc.MIME (MimeType)
+import Text.Pandoc.Shared (isURI, renderTags', trim)
import Text.Pandoc.UTF8 (toString)
-import Text.Pandoc.Options (WriterOptions(..))
-import Data.List (isPrefixOf)
-import Control.Applicative ((<|>))
-import Text.Parsec (runParserT, ParsecT)
+import Text.Parsec (ParsecT, runParserT)
import qualified Text.Parsec as P
-import Control.Monad.Trans (lift)
isOk :: Char -> Bool
isOk c = isAscii c && isAlphaNum c
-makeDataURI :: String -> ByteString -> String
-makeDataURI mime raw =
+makeDataURI :: (MimeType, ByteString) -> String
+makeDataURI (mime, raw) =
if textual
then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw)
else "data:" ++ mime' ++ ";base64," ++ toString (encode raw)
@@ -64,64 +68,143 @@ makeDataURI mime raw =
then mime ++ ";charset=utf-8"
else mime -- mime type already has charset
-convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String)
-convertTag media sourceURL t@(TagOpen tagname as)
+convertTags :: PandocMonad m => [Tag String] -> m [Tag String]
+convertTags [] = return []
+convertTags (t@TagOpen{}:ts)
+ | fromAttrib "data-external" t == "1" = (t:) <$> convertTags ts
+convertTags (t@(TagOpen tagname as):ts)
| tagname `elem`
- ["img", "embed", "video", "input", "audio", "source", "track"] = do
+ ["img", "embed", "video", "input", "audio", "source", "track",
+ "section"] = do
as' <- mapM processAttribute as
- return $ TagOpen tagname as'
+ rest <- convertTags ts
+ return $ TagOpen tagname as' : rest
where processAttribute (x,y) =
- if x == "src" || x == "href" || x == "poster"
+ if x `elem` ["src", "data-src", "href", "poster", "data-background-image"]
then do
- enc <- getDataURI media sourceURL (fromAttrib "type" t) y
+ enc <- getDataURI (fromAttrib "type" t) y
return (x, enc)
else return (x,y)
-convertTag media sourceURL t@(TagOpen "script" as) =
+convertTags (t@(TagOpen "script" as):TagClose "script":ts) =
case fromAttrib "src" t of
- [] -> return t
+ [] -> (t:) <$> convertTags ts
src -> do
- enc <- getDataURI media sourceURL (fromAttrib "type" t) src
- return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
-convertTag media sourceURL t@(TagOpen "link" as) =
+ let typeAttr = fromAttrib "type" t
+ res <- getData typeAttr src
+ rest <- convertTags ts
+ case res of
+ Left dataUri -> return $ TagOpen "script"
+ (("src",dataUri) : [(x,y) | (x,y) <- as, x /= "src"]) :
+ TagClose "script" : rest
+ Right (mime, bs)
+ | ("text/javascript" `isPrefixOf` mime ||
+ "application/javascript" `isPrefixOf` mime ||
+ "application/x-javascript" `isPrefixOf` mime) &&
+ not ("</script" `B.isInfixOf` bs) ->
+ return $
+ TagOpen "script" [("type", typeAttr)|not (null typeAttr)]
+ : TagText (toString bs)
+ : TagClose "script"
+ : rest
+ | otherwise ->
+ return $ TagOpen "script"
+ (("src",makeDataURI (mime, bs)) :
+ [(x,y) | (x,y) <- as, x /= "src"]) :
+ TagClose "script" : rest
+convertTags (t@(TagOpen "link" as):ts) =
case fromAttrib "href" t of
- [] -> return t
+ [] -> (t:) <$> convertTags ts
src -> do
- enc <- getDataURI media sourceURL (fromAttrib "type" t) src
- return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
-convertTag _ _ t = return t
-
-cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString
- -> IO ByteString
-cssURLs media sourceURL d orig = do
- res <- runParserT (parseCSSUrls media sourceURL d) () "css" orig
+ res <- getData (fromAttrib "type" t) src
+ case res of
+ Left dataUri -> do
+ rest <- convertTags ts
+ return $ TagOpen "link"
+ (("href",dataUri) : [(x,y) | (x,y) <- as, x /= "href"]) :
+ rest
+ Right (mime, bs)
+ | "text/css" `isPrefixOf` mime
+ && null (fromAttrib "media" t)
+ && not ("</" `B.isInfixOf` bs) -> do
+ rest <- convertTags $
+ dropWhile (==TagClose "link") ts
+ return $
+ TagOpen "style" [("type", mime)]
+ : TagText (toString bs)
+ : TagClose "style"
+ : rest
+ | otherwise -> do
+ rest <- convertTags ts
+ return $ TagOpen "link"
+ (("href",makeDataURI (mime, bs)) :
+ [(x,y) | (x,y) <- as, x /= "href"]) : rest
+convertTags (t:ts) = (t:) <$> convertTags ts
+
+cssURLs :: PandocMonad m
+ => FilePath -> ByteString -> m ByteString
+cssURLs d orig = do
+ res <- runParserT (parseCSSUrls d) () "css" orig
case res of
- Left e -> warn ("Could not parse CSS: " ++ show e) >> return orig
+ Left e -> do
+ report $ CouldNotParseCSS (show e)
+ return orig
Right bs -> return bs
-parseCSSUrls :: MediaBag -> Maybe String -> FilePath
- -> ParsecT ByteString () IO ByteString
-parseCSSUrls media sourceURL d = B.concat <$> P.many
- (pCSSWhite <|> pCSSComment <|> pCSSUrl media sourceURL d <|> pCSSOther)
+parseCSSUrls :: PandocMonad m
+ => FilePath -> ParsecT ByteString () m ByteString
+parseCSSUrls d = B.concat <$> P.many
+ (pCSSWhite <|> pCSSComment <|> pCSSImport d <|> pCSSUrl d <|> pCSSOther)
+
+pCSSImport :: PandocMonad m
+ => FilePath -> ParsecT ByteString () m ByteString
+pCSSImport d = P.try $ do
+ P.string "@import"
+ P.spaces
+ res <- (pQuoted <|> pUrl) >>= handleCSSUrl d
+ P.spaces
+ P.char ';'
+ P.spaces
+ case res of
+ Left b -> return $ B.pack "@import " <> b
+ Right (_, b) -> return b
-- Note: some whitespace in CSS is significant, so we can't collapse it!
-pCSSWhite :: ParsecT ByteString () IO ByteString
+pCSSWhite :: PandocMonad m => ParsecT ByteString () m ByteString
pCSSWhite = B.singleton <$> P.space <* P.spaces
-pCSSComment :: ParsecT ByteString () IO ByteString
+pCSSComment :: PandocMonad m => ParsecT ByteString () m ByteString
pCSSComment = P.try $ do
P.string "/*"
P.manyTill P.anyChar (P.try (P.string "*/"))
return B.empty
-pCSSOther :: ParsecT ByteString () IO ByteString
-pCSSOther = do
+pCSSOther :: PandocMonad m => ParsecT ByteString () m ByteString
+pCSSOther =
(B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|>
- (B.singleton <$> P.char 'u') <|>
- (B.singleton <$> P.char '/')
+ (B.singleton <$> P.char 'u') <|>
+ (B.singleton <$> P.char '/')
+
+pCSSUrl :: PandocMonad m
+ => FilePath -> ParsecT ByteString () m ByteString
+pCSSUrl d = P.try $ do
+ res <- pUrl >>= handleCSSUrl d
+ case res of
+ Left b -> return b
+ Right (mt,b) -> do
+ let enc = makeDataURI (mt, b)
+ return (B.pack $ "url(" ++ enc ++ ")")
+
+pQuoted :: PandocMonad m
+ => ParsecT ByteString () m (String, ByteString)
+pQuoted = P.try $ do
+ quote <- P.oneOf "\"'"
+ url <- P.manyTill P.anyChar (P.char quote)
+ let fallback = B.pack ([quote] ++ trim url ++ [quote])
+ return (url, fallback)
-pCSSUrl :: MediaBag -> Maybe String -> FilePath
- -> ParsecT ByteString () IO ByteString
-pCSSUrl media sourceURL d = P.try $ do
+pUrl :: PandocMonad m
+ => ParsecT ByteString () m (String, ByteString)
+pUrl = P.try $ do
P.string "url("
P.spaces
quote <- P.option Nothing (Just <$> P.oneOf "\"'")
@@ -130,49 +213,66 @@ pCSSUrl media sourceURL d = P.try $ do
P.char ')'
let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++
maybe "" (:[]) quote ++ ")")
- case trim url of
- '#':_ -> return fallback
- 'd':'a':'t':'a':':':_ -> return fallback
+ return (url, fallback)
+
+handleCSSUrl :: PandocMonad m
+ => FilePath -> (String, ByteString)
+ -> ParsecT ByteString () m
+ (Either ByteString (MimeType, ByteString))
+handleCSSUrl d (url, fallback) =
+ case escapeURIString (/='|') (trim url) of
+ '#':_ -> return $ Left fallback
+ 'd':'a':'t':'a':':':_ -> return $ Left fallback
u -> do let url' = if isURI u then u else d </> u
- enc <- lift $ getDataURI media sourceURL "" url'
- return (B.pack $ "url(" ++ enc ++ ")")
+ res <- lift $ getData "" url'
+ case res of
+ Left uri -> return $ Left (B.pack $ "url(" ++ uri ++ ")")
+ Right (mt, raw) -> do
+ -- note that the downloaded CSS may
+ -- itself contain url(...).
+ b <- if "text/css" `isPrefixOf` mt
+ then cssURLs d raw
+ else return raw
+ return $ Right (mt, b)
+getDataURI :: PandocMonad m => MimeType -> String -> m String
+getDataURI mimetype src = do
+ res <- getData mimetype src
+ case res of
+ Left uri -> return uri
+ Right x -> return $ makeDataURI x
-getDataURI :: MediaBag -> Maybe String -> MimeType -> String
- -> IO String
-getDataURI _ _ _ src@('d':'a':'t':'a':':':_) = return src -- already data: uri
-getDataURI media sourceURL mimetype src = do
+getData :: PandocMonad m
+ => MimeType -> String
+ -> m (Either String (MimeType, ByteString))
+getData _ src@('d':'a':'t':'a':':':_) = return $ Left src-- already data: uri
+getData mimetype src = do
let ext = map toLower $ takeExtension src
- fetchResult <- fetchItem' media sourceURL src
- (raw, respMime) <- case fetchResult of
- Left msg -> err 67 $ "Could not fetch " ++ src ++
- "\n" ++ show msg
- Right x -> return x
+ (raw, respMime) <- fetchItem src
let raw' = if ext == ".gz"
- then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
- $ [raw]
+ then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks [raw]
else raw
- let mime = case (mimetype, respMime) of
- ("",Nothing) -> error
+ mime <- case (mimetype, respMime) of
+ ("",Nothing) -> throwError $ PandocSomeError
$ "Could not determine mime type for `" ++ src ++ "'"
- (x, Nothing) -> x
- (_, Just x ) -> x
- let cssSourceURL = case parseURI src of
- Just u
- | uriScheme u `elem` ["http:","https:"] ->
- Just $ show u{ uriPath = "",
- uriQuery = "",
- uriFragment = "" }
- _ -> Nothing
- result <- if mime == "text/css"
- then cssURLs media cssSourceURL (takeDirectory src) raw'
+ (x, Nothing) -> return x
+ (_, Just x ) -> return x
+ result <- if "text/css" `isPrefixOf` mime
+ then do
+ oldInputs <- getInputFiles
+ setInputFiles [src]
+ res <- cssURLs (takeDirectory src) raw'
+ setInputFiles oldInputs
+ return res
else return raw'
- return $ makeDataURI mime result
+ return $ Right (mime, result)
+
+
-- | Convert HTML into self-contained HTML, incorporating images,
-- scripts, and CSS using data: URIs.
-makeSelfContained :: WriterOptions -> String -> IO String
-makeSelfContained opts inp = do
+makeSelfContained :: PandocMonad m => String -> m String
+makeSelfContained inp = do
let tags = parseTags inp
- out' <- mapM (convertTag (writerMediaBag opts) (writerSourceURL opts)) tags
+ out' <- convertTags tags
return $ renderTags' out'