diff options
Diffstat (limited to 'src/Text/Pandoc/SelfContained.hs')
-rw-r--r-- | src/Text/Pandoc/SelfContained.hs | 270 |
1 files changed, 185 insertions, 85 deletions
diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index d08d636df..a1c5c919e 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2011-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.SelfContained - Copyright : Copyright (C) 2011-2016 John MacFarlane + Copyright : Copyright (C) 2011-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -30,32 +30,36 @@ Functions for converting an HTML file into one that can be viewed offline, by incorporating linked images, CSS, and scripts into the HTML using data URIs. -} -module Text.Pandoc.SelfContained ( makeSelfContained ) where -import Text.HTML.TagSoup -import Network.URI (isURI, escapeURIString, URI(..), parseURI) +module Text.Pandoc.SelfContained ( makeDataURI, makeSelfContained ) where +import Codec.Compression.GZip as Gzip +import Control.Applicative ((<|>)) +import Control.Monad.Except (throwError) +import Control.Monad.Trans (lift) +import Data.ByteString (ByteString) import Data.ByteString.Base64 import qualified Data.ByteString.Char8 as B -import Data.ByteString (ByteString) -import System.FilePath (takeExtension, takeDirectory, (</>)) -import Data.Char (toLower, isAscii, isAlphaNum) -import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (renderTags', err, fetchItem', warn, trim) -import Text.Pandoc.MediaBag (MediaBag) +import Data.Char (isAlphaNum, isAscii, toLower) +import Data.List (isPrefixOf) +import Data.Monoid ((<>)) +import Network.URI (escapeURIString) +import System.FilePath (takeDirectory, takeExtension, (</>)) +import Text.HTML.TagSoup +import Text.Pandoc.Class (PandocMonad (..), fetchItem, getInputFiles, report, + setInputFiles) +import Text.Pandoc.Error +import Text.Pandoc.Logging import Text.Pandoc.MIME (MimeType) +import Text.Pandoc.Shared (isURI, renderTags', trim) import Text.Pandoc.UTF8 (toString) -import Text.Pandoc.Options (WriterOptions(..)) -import Data.List (isPrefixOf) -import Control.Applicative ((<|>)) -import Text.Parsec (runParserT, ParsecT) +import Text.Parsec (ParsecT, runParserT) import qualified Text.Parsec as P -import Control.Monad.Trans (lift) isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c -makeDataURI :: String -> ByteString -> String -makeDataURI mime raw = +makeDataURI :: (MimeType, ByteString) -> String +makeDataURI (mime, raw) = if textual then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw) else "data:" ++ mime' ++ ";base64," ++ toString (encode raw) @@ -64,64 +68,143 @@ makeDataURI mime raw = then mime ++ ";charset=utf-8" else mime -- mime type already has charset -convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String) -convertTag media sourceURL t@(TagOpen tagname as) +convertTags :: PandocMonad m => [Tag String] -> m [Tag String] +convertTags [] = return [] +convertTags (t@TagOpen{}:ts) + | fromAttrib "data-external" t == "1" = (t:) <$> convertTags ts +convertTags (t@(TagOpen tagname as):ts) | tagname `elem` - ["img", "embed", "video", "input", "audio", "source", "track"] = do + ["img", "embed", "video", "input", "audio", "source", "track", + "section"] = do as' <- mapM processAttribute as - return $ TagOpen tagname as' + rest <- convertTags ts + return $ TagOpen tagname as' : rest where processAttribute (x,y) = - if x == "src" || x == "href" || x == "poster" + if x `elem` ["src", "data-src", "href", "poster", "data-background-image"] then do - enc <- getDataURI media sourceURL (fromAttrib "type" t) y + enc <- getDataURI (fromAttrib "type" t) y return (x, enc) else return (x,y) -convertTag media sourceURL t@(TagOpen "script" as) = +convertTags (t@(TagOpen "script" as):TagClose "script":ts) = case fromAttrib "src" t of - [] -> return t + [] -> (t:) <$> convertTags ts src -> do - enc <- getDataURI media sourceURL (fromAttrib "type" t) src - return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) -convertTag media sourceURL t@(TagOpen "link" as) = + let typeAttr = fromAttrib "type" t + res <- getData typeAttr src + rest <- convertTags ts + case res of + Left dataUri -> return $ TagOpen "script" + (("src",dataUri) : [(x,y) | (x,y) <- as, x /= "src"]) : + TagClose "script" : rest + Right (mime, bs) + | ("text/javascript" `isPrefixOf` mime || + "application/javascript" `isPrefixOf` mime || + "application/x-javascript" `isPrefixOf` mime) && + not ("</script" `B.isInfixOf` bs) -> + return $ + TagOpen "script" [("type", typeAttr)|not (null typeAttr)] + : TagText (toString bs) + : TagClose "script" + : rest + | otherwise -> + return $ TagOpen "script" + (("src",makeDataURI (mime, bs)) : + [(x,y) | (x,y) <- as, x /= "src"]) : + TagClose "script" : rest +convertTags (t@(TagOpen "link" as):ts) = case fromAttrib "href" t of - [] -> return t + [] -> (t:) <$> convertTags ts src -> do - enc <- getDataURI media sourceURL (fromAttrib "type" t) src - return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) -convertTag _ _ t = return t - -cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString - -> IO ByteString -cssURLs media sourceURL d orig = do - res <- runParserT (parseCSSUrls media sourceURL d) () "css" orig + res <- getData (fromAttrib "type" t) src + case res of + Left dataUri -> do + rest <- convertTags ts + return $ TagOpen "link" + (("href",dataUri) : [(x,y) | (x,y) <- as, x /= "href"]) : + rest + Right (mime, bs) + | "text/css" `isPrefixOf` mime + && null (fromAttrib "media" t) + && not ("</" `B.isInfixOf` bs) -> do + rest <- convertTags $ + dropWhile (==TagClose "link") ts + return $ + TagOpen "style" [("type", mime)] + : TagText (toString bs) + : TagClose "style" + : rest + | otherwise -> do + rest <- convertTags ts + return $ TagOpen "link" + (("href",makeDataURI (mime, bs)) : + [(x,y) | (x,y) <- as, x /= "href"]) : rest +convertTags (t:ts) = (t:) <$> convertTags ts + +cssURLs :: PandocMonad m + => FilePath -> ByteString -> m ByteString +cssURLs d orig = do + res <- runParserT (parseCSSUrls d) () "css" orig case res of - Left e -> warn ("Could not parse CSS: " ++ show e) >> return orig + Left e -> do + report $ CouldNotParseCSS (show e) + return orig Right bs -> return bs -parseCSSUrls :: MediaBag -> Maybe String -> FilePath - -> ParsecT ByteString () IO ByteString -parseCSSUrls media sourceURL d = B.concat <$> P.many - (pCSSWhite <|> pCSSComment <|> pCSSUrl media sourceURL d <|> pCSSOther) +parseCSSUrls :: PandocMonad m + => FilePath -> ParsecT ByteString () m ByteString +parseCSSUrls d = B.concat <$> P.many + (pCSSWhite <|> pCSSComment <|> pCSSImport d <|> pCSSUrl d <|> pCSSOther) + +pCSSImport :: PandocMonad m + => FilePath -> ParsecT ByteString () m ByteString +pCSSImport d = P.try $ do + P.string "@import" + P.spaces + res <- (pQuoted <|> pUrl) >>= handleCSSUrl d + P.spaces + P.char ';' + P.spaces + case res of + Left b -> return $ B.pack "@import " <> b + Right (_, b) -> return b -- Note: some whitespace in CSS is significant, so we can't collapse it! -pCSSWhite :: ParsecT ByteString () IO ByteString +pCSSWhite :: PandocMonad m => ParsecT ByteString () m ByteString pCSSWhite = B.singleton <$> P.space <* P.spaces -pCSSComment :: ParsecT ByteString () IO ByteString +pCSSComment :: PandocMonad m => ParsecT ByteString () m ByteString pCSSComment = P.try $ do P.string "/*" P.manyTill P.anyChar (P.try (P.string "*/")) return B.empty -pCSSOther :: ParsecT ByteString () IO ByteString -pCSSOther = do +pCSSOther :: PandocMonad m => ParsecT ByteString () m ByteString +pCSSOther = (B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|> - (B.singleton <$> P.char 'u') <|> - (B.singleton <$> P.char '/') + (B.singleton <$> P.char 'u') <|> + (B.singleton <$> P.char '/') + +pCSSUrl :: PandocMonad m + => FilePath -> ParsecT ByteString () m ByteString +pCSSUrl d = P.try $ do + res <- pUrl >>= handleCSSUrl d + case res of + Left b -> return b + Right (mt,b) -> do + let enc = makeDataURI (mt, b) + return (B.pack $ "url(" ++ enc ++ ")") + +pQuoted :: PandocMonad m + => ParsecT ByteString () m (String, ByteString) +pQuoted = P.try $ do + quote <- P.oneOf "\"'" + url <- P.manyTill P.anyChar (P.char quote) + let fallback = B.pack ([quote] ++ trim url ++ [quote]) + return (url, fallback) -pCSSUrl :: MediaBag -> Maybe String -> FilePath - -> ParsecT ByteString () IO ByteString -pCSSUrl media sourceURL d = P.try $ do +pUrl :: PandocMonad m + => ParsecT ByteString () m (String, ByteString) +pUrl = P.try $ do P.string "url(" P.spaces quote <- P.option Nothing (Just <$> P.oneOf "\"'") @@ -130,49 +213,66 @@ pCSSUrl media sourceURL d = P.try $ do P.char ')' let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++ maybe "" (:[]) quote ++ ")") - case trim url of - '#':_ -> return fallback - 'd':'a':'t':'a':':':_ -> return fallback + return (url, fallback) + +handleCSSUrl :: PandocMonad m + => FilePath -> (String, ByteString) + -> ParsecT ByteString () m + (Either ByteString (MimeType, ByteString)) +handleCSSUrl d (url, fallback) = + case escapeURIString (/='|') (trim url) of + '#':_ -> return $ Left fallback + 'd':'a':'t':'a':':':_ -> return $ Left fallback u -> do let url' = if isURI u then u else d </> u - enc <- lift $ getDataURI media sourceURL "" url' - return (B.pack $ "url(" ++ enc ++ ")") + res <- lift $ getData "" url' + case res of + Left uri -> return $ Left (B.pack $ "url(" ++ uri ++ ")") + Right (mt, raw) -> do + -- note that the downloaded CSS may + -- itself contain url(...). + b <- if "text/css" `isPrefixOf` mt + then cssURLs d raw + else return raw + return $ Right (mt, b) +getDataURI :: PandocMonad m => MimeType -> String -> m String +getDataURI mimetype src = do + res <- getData mimetype src + case res of + Left uri -> return uri + Right x -> return $ makeDataURI x -getDataURI :: MediaBag -> Maybe String -> MimeType -> String - -> IO String -getDataURI _ _ _ src@('d':'a':'t':'a':':':_) = return src -- already data: uri -getDataURI media sourceURL mimetype src = do +getData :: PandocMonad m + => MimeType -> String + -> m (Either String (MimeType, ByteString)) +getData _ src@('d':'a':'t':'a':':':_) = return $ Left src-- already data: uri +getData mimetype src = do let ext = map toLower $ takeExtension src - fetchResult <- fetchItem' media sourceURL src - (raw, respMime) <- case fetchResult of - Left msg -> err 67 $ "Could not fetch " ++ src ++ - "\n" ++ show msg - Right x -> return x + (raw, respMime) <- fetchItem src let raw' = if ext == ".gz" - then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks - $ [raw] + then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks [raw] else raw - let mime = case (mimetype, respMime) of - ("",Nothing) -> error + mime <- case (mimetype, respMime) of + ("",Nothing) -> throwError $ PandocSomeError $ "Could not determine mime type for `" ++ src ++ "'" - (x, Nothing) -> x - (_, Just x ) -> x - let cssSourceURL = case parseURI src of - Just u - | uriScheme u `elem` ["http:","https:"] -> - Just $ show u{ uriPath = "", - uriQuery = "", - uriFragment = "" } - _ -> Nothing - result <- if mime == "text/css" - then cssURLs media cssSourceURL (takeDirectory src) raw' + (x, Nothing) -> return x + (_, Just x ) -> return x + result <- if "text/css" `isPrefixOf` mime + then do + oldInputs <- getInputFiles + setInputFiles [src] + res <- cssURLs (takeDirectory src) raw' + setInputFiles oldInputs + return res else return raw' - return $ makeDataURI mime result + return $ Right (mime, result) + + -- | Convert HTML into self-contained HTML, incorporating images, -- scripts, and CSS using data: URIs. -makeSelfContained :: WriterOptions -> String -> IO String -makeSelfContained opts inp = do +makeSelfContained :: PandocMonad m => String -> m String +makeSelfContained inp = do let tags = parseTags inp - out' <- mapM (convertTag (writerMediaBag opts) (writerSourceURL opts)) tags + out' <- convertTags tags return $ renderTags' out' |