diff options
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 865 |
1 files changed, 264 insertions, 601 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index bd2da945e..52e1447db 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -1,8 +1,12 @@ -{-# LANGUAGE DeriveDataTypeable, CPP, MultiParamTypeClasses, - FlexibleContexts, ScopedTypeVariables, PatternGuards, - ViewPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE DeriveDataTypeable #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE MultiParamTypeClasses #-} + +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} {- -Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Shared - Copyright : Copyright (C) 2006-2016 John MacFarlane + Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -49,21 +53,18 @@ module Text.Pandoc.Shared ( toRomanNumeral, escapeURI, tabFilter, + crFilter, -- * Date/time normalizeDate, -- * Pandoc block and inline list processing orderedListMarkers, - normalizeSpaces, extractSpaces, - normalize, - normalizeInlines, - normalizeBlocks, removeFormatting, + deNote, stringify, capitalize, compactify, - compactify', - compactify'DL, + compactifyDL, linesToPara, Element (..), hierarchicalize, @@ -71,29 +72,26 @@ module Text.Pandoc.Shared ( inlineListToIdentifier, isHeaderBlock, headerShift, + stripEmptyParagraphs, isTightList, addMetaField, makeMeta, + eastAsianLineBreakFilter, + underlineSpan, -- * TagSoup HTML handling renderTags', -- * File handling inDirectory, - getDefaultReferenceDocx, - getDefaultReferenceODT, - readDataFile, - readDataFileUTF8, - fetchItem, - fetchItem', - openURL, collapseFilePath, filteredFilesFromArchive, + -- * URI handling + schemes, + isURI, -- * Error handling - err, - warn, mapLeft, - hush, -- * for squashing blocks blocksToInlines, + blocksToInlines', -- * Safe read safeRead, -- * Temp directory @@ -102,74 +100,37 @@ module Text.Pandoc.Shared ( pandocVersion ) where -import Text.Pandoc.Definition -import Text.Pandoc.Walk -import Text.Pandoc.MediaBag (MediaBag, lookupMedia) -import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..)) -import qualified Text.Pandoc.Builder as B -import qualified Text.Pandoc.UTF8 as UTF8 -import System.Environment (getProgName) -import System.Exit (exitWith, ExitCode(..)) -import Data.Char ( toLower, isLower, isUpper, isAlpha, - isLetter, isDigit, isSpace ) -import Data.List ( find, stripPrefix, intercalate ) -import Data.Maybe (mapMaybe) -import Data.Version ( showVersion ) +import Codec.Archive.Zip +import qualified Control.Exception as E +import Control.Monad (MonadPlus (..), msum, unless) +import qualified Control.Monad.State.Strict as S +import qualified Data.ByteString.Lazy as BL +import Data.Char (isAlpha, isDigit, isLetter, isLower, isSpace, isUpper, + toLower) +import Data.Data (Data, Typeable) +import Data.List (find, intercalate, intersperse, stripPrefix) import qualified Data.Map as M -import Network.URI ( escapeURIString, nonStrictRelativeTo, - unEscapeString, parseURIReference, isAllowedInURI, - parseURI, URI(..) ) +import Data.Maybe (mapMaybe) +import Data.Monoid ((<>)) +import Data.Sequence (ViewL (..), ViewR (..), viewl, viewr) import qualified Data.Set as Set +import qualified Data.Text as T +import Data.Version (showVersion) +import Network.URI (URI (uriScheme), escapeURIString, parseURI) +import Paths_pandoc (version) import System.Directory -import System.FilePath (splitDirectories, isPathSeparator) +import System.FilePath (isPathSeparator, splitDirectories) import qualified System.FilePath.Posix as Posix -import Text.Pandoc.MIME (MimeType, getMimeType) -import System.FilePath ( (</>), takeExtension, dropExtension) -import Data.Generics (Typeable, Data) -import qualified Control.Monad.State as S -import Control.Monad.Trans (MonadIO (..)) -import qualified Control.Exception as E -import Control.Monad (msum, unless, MonadPlus(..)) -import Text.Pandoc.Pretty (charWidth) -import Text.Pandoc.Compat.Time -import Data.Time.Clock.POSIX -import System.IO (stderr) import System.IO.Temp -import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), - renderOptions) -import Data.Monoid ((<>)) -import qualified Data.ByteString as BS -import qualified Data.ByteString.Char8 as B8 -import Data.ByteString.Base64 (decodeLenient) -import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) -import qualified Data.Text as T (toUpper, pack, unpack) -import Data.ByteString.Lazy (toChunks, fromChunks) -import qualified Data.ByteString.Lazy as BL -import Paths_pandoc (version) - -import Codec.Archive.Zip - -#ifdef EMBED_DATA_FILES -import Text.Pandoc.Data (dataFiles) -#else -import Paths_pandoc (getDataFileName) -#endif -#ifdef HTTP_CLIENT -import Network.HTTP.Client (httpLbs, responseBody, responseHeaders, - Request(port,host)) -import Network.HTTP.Client (parseRequest) -import Network.HTTP.Client (newManager) -import Network.HTTP.Client.Internal (addProxy) -import Network.HTTP.Client.TLS (tlsManagerSettings) -import System.Environment (getEnv) -import Network.HTTP.Types.Header ( hContentType) -import Network (withSocketsDo) -#else -import Network.URI (parseURI) -import Network.HTTP (findHeader, rspBody, - RequestMethod(..), HeaderName(..), mkRequest) -import Network.Browser (browse, setAllowRedirects, setOutHandler, request) -#endif +import Text.HTML.TagSoup (RenderOptions (..), Tag (..), renderOptions, + renderTagsOptions) +import Text.Pandoc.Builder (Blocks, Inlines, ToMetaValue (..)) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Compat.Time +import Text.Pandoc.Definition +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.Pretty (charWidth) +import Text.Pandoc.Walk -- | Version number of pandoc library. pandocVersion :: String @@ -185,11 +146,11 @@ splitBy _ [] = [] splitBy isSep lst = let (first, rest) = break isSep lst rest' = dropWhile isSep rest - in first:(splitBy isSep rest') + in first:splitBy isSep rest' splitByIndices :: [Int] -> [a] -> [[a]] splitByIndices [] lst = [lst] -splitByIndices (x:xs) lst = first:(splitByIndices (map (\y -> y - x) xs) rest) +splitByIndices (x:xs) lst = first:splitByIndices (map (\y -> y - x) xs) rest where (first, rest) = splitAt x lst -- | Split string into chunks divided at specified indices. @@ -197,7 +158,7 @@ splitStringByIndices :: [Int] -> [Char] -> [[Char]] splitStringByIndices [] lst = [lst] splitStringByIndices (x:xs) lst = let (first, rest) = splitAt' x lst in - first : (splitStringByIndices (map (\y -> y - x) xs) rest) + first : splitStringByIndices (map (\y -> y - x) xs) rest splitAt' :: Int -> [Char] -> ([Char],[Char]) splitAt' _ [] = ([],[]) @@ -236,9 +197,9 @@ backslashEscapes = map (\ch -> (ch, ['\\',ch])) escapeStringUsing :: [(Char, String)] -> String -> String escapeStringUsing _ [] = "" escapeStringUsing escapeTable (x:xs) = - case (lookup x escapeTable) of - Just str -> str ++ rest - Nothing -> x:rest + case lookup x escapeTable of + Just str -> str ++ rest + Nothing -> x:rest where rest = escapeStringUsing escapeTable xs -- | Strip trailing newlines from string. @@ -260,35 +221,33 @@ trimr = reverse . triml . reverse -- | Strip leading and trailing characters from string stripFirstAndLast :: String -> String stripFirstAndLast str = - drop 1 $ take ((length str) - 1) str + drop 1 $ take (length str - 1) str -- | Change CamelCase word to hyphenated lowercase (e.g., camel-case). camelCaseToHyphenated :: String -> String camelCaseToHyphenated [] = "" camelCaseToHyphenated (a:b:rest) | isLower a && isUpper b = - a:'-':(toLower b):(camelCaseToHyphenated rest) -camelCaseToHyphenated (a:rest) = (toLower a):(camelCaseToHyphenated rest) + a:'-':toLower b:camelCaseToHyphenated rest +camelCaseToHyphenated (a:rest) = toLower a:camelCaseToHyphenated rest -- | Convert number < 4000 to uppercase roman numeral. toRomanNumeral :: Int -> String -toRomanNumeral x = - if x >= 4000 || x < 0 - then "?" - else case x of - _ | x >= 1000 -> "M" ++ toRomanNumeral (x - 1000) - _ | x >= 900 -> "CM" ++ toRomanNumeral (x - 900) - _ | x >= 500 -> "D" ++ toRomanNumeral (x - 500) - _ | x >= 400 -> "CD" ++ toRomanNumeral (x - 400) - _ | x >= 100 -> "C" ++ toRomanNumeral (x - 100) - _ | x >= 90 -> "XC" ++ toRomanNumeral (x - 90) - _ | x >= 50 -> "L" ++ toRomanNumeral (x - 50) - _ | x >= 40 -> "XL" ++ toRomanNumeral (x - 40) - _ | x >= 10 -> "X" ++ toRomanNumeral (x - 10) - _ | x == 9 -> "IX" - _ | x >= 5 -> "V" ++ toRomanNumeral (x - 5) - _ | x == 4 -> "IV" - _ | x >= 1 -> "I" ++ toRomanNumeral (x - 1) - _ -> "" +toRomanNumeral x + | x >= 4000 || x < 0 = "?" + | x >= 1000 = "M" ++ toRomanNumeral (x - 1000) + | x >= 900 = "CM" ++ toRomanNumeral (x - 900) + | x >= 500 = "D" ++ toRomanNumeral (x - 500) + | x >= 400 = "CD" ++ toRomanNumeral (x - 400) + | x >= 100 = "C" ++ toRomanNumeral (x - 100) + | x >= 90 = "XC" ++ toRomanNumeral (x - 90) + | x >= 50 = "L" ++ toRomanNumeral (x - 50) + | x >= 40 = "XL" ++ toRomanNumeral (x - 40) + | x >= 10 = "X" ++ toRomanNumeral (x - 10) + | x == 9 = "IX" + | x >= 5 = "V" ++ toRomanNumeral (x - 5) + | x == 4 = "IV" + | x >= 1 = "I" ++ toRomanNumeral (x - 1) + | otherwise = "" -- | Escape whitespace and some punctuation characters in URI. escapeURI :: String -> String @@ -296,26 +255,23 @@ escapeURI = escapeURIString (not . needsEscaping) where needsEscaping c = isSpace c || c `elem` ['<','>','|','"','{','}','[',']','^', '`'] - --- | Convert tabs to spaces and filter out DOS line endings. --- Tabs will be preserved if tab stop is set to 0. +-- | Convert tabs to spaces. Tabs will be preserved if tab stop is set to 0. tabFilter :: Int -- ^ Tab stop - -> String -- ^ Input - -> String -tabFilter tabStop = - let go _ [] = "" - go _ ('\n':xs) = '\n' : go tabStop xs - go _ ('\r':'\n':xs) = '\n' : go tabStop xs - go _ ('\r':xs) = '\n' : go tabStop xs - go spsToNextStop ('\t':xs) = - if tabStop == 0 - then '\t' : go tabStop xs - else replicate spsToNextStop ' ' ++ go tabStop xs - go 1 (x:xs) = - x : go tabStop xs - go spsToNextStop (x:xs) = - x : go (spsToNextStop - 1) xs - in go tabStop + -> T.Text -- ^ Input + -> T.Text +tabFilter 0 = id +tabFilter tabStop = T.unlines . map go . T.lines + where go s = + let (s1, s2) = T.break (== '\t') s + in if T.null s2 + then s1 + else s1 <> T.replicate + (tabStop - (T.length s1 `mod` tabStop)) (T.pack " ") + <> go (T.drop 1 s2) + +-- | Strip out DOS line endings. +crFilter :: T.Text -> T.Text +crFilter = T.filter (/= '\r') -- -- Date/time @@ -329,7 +285,7 @@ normalizeDate s = fmap (formatTime defaultTimeLocale "%F") (msum $ map (\fs -> parsetimeWith fs s >>= rejectBadYear) formats :: Maybe Day) where rejectBadYear day = case toGregorian day of (y, _, _) | y >= 1601 && y <= 9999 -> Just day - _ -> Nothing + _ -> Nothing parsetimeWith = #if MIN_VERSION_time(1,5,0) parseTimeM True defaultTimeLocale @@ -337,7 +293,7 @@ normalizeDate s = fmap (formatTime defaultTimeLocale "%F") parseTime defaultTimeLocale #endif formats = ["%x","%m/%d/%Y", "%D","%F", "%d %b %Y", - "%d %B %Y", "%b. %d, %Y", "%B %d, %Y", + "%e %B %Y", "%b. %e, %Y", "%B %e, %Y", "%Y%m%d", "%Y%m", "%Y"] -- @@ -366,23 +322,6 @@ orderedListMarkers (start, numstyle, numdelim) = TwoParens -> "(" ++ str ++ ")" in map inDelim nums --- | Normalize a list of inline elements: remove leading and trailing --- @Space@ elements, collapse double @Space@s into singles, and --- remove empty Str elements. -normalizeSpaces :: [Inline] -> [Inline] -normalizeSpaces = cleanup . dropWhile isSpaceOrEmpty - where cleanup [] = [] - cleanup (Space:rest) = case dropWhile isSpaceOrEmpty rest of - [] -> [] - (x:xs) -> Space : x : cleanup xs - cleanup ((Str ""):rest) = cleanup rest - cleanup (x:rest) = x : cleanup rest - -isSpaceOrEmpty :: Inline -> Bool -isSpaceOrEmpty Space = True -isSpaceOrEmpty (Str "") = True -isSpaceOrEmpty _ = False - -- | Extract the leading and trailing spaces from inside an inline element -- and place them outside the element. SoftBreaks count as Spaces for -- these purposes. @@ -399,183 +338,43 @@ extractSpaces f is = _ -> mempty in (left <> f (B.trimInlines . B.Many $ contents) <> right) --- | Normalize @Pandoc@ document, consolidating doubled 'Space's, --- combining adjacent 'Str's and 'Emph's, remove 'Null's and --- empty elements, etc. -normalize :: Pandoc -> Pandoc -normalize (Pandoc (Meta meta) blocks) = - Pandoc (Meta $ M.map go meta) (normalizeBlocks blocks) - where go (MetaInlines xs) = MetaInlines $ normalizeInlines xs - go (MetaBlocks xs) = MetaBlocks $ normalizeBlocks xs - go (MetaList ms) = MetaList $ map go ms - go (MetaMap m) = MetaMap $ M.map go m - go x = x - -normalizeBlocks :: [Block] -> [Block] -normalizeBlocks (Null : xs) = normalizeBlocks xs -normalizeBlocks (Div attr bs : xs) = - Div attr (normalizeBlocks bs) : normalizeBlocks xs -normalizeBlocks (BlockQuote bs : xs) = - case normalizeBlocks bs of - [] -> normalizeBlocks xs - bs' -> BlockQuote bs' : normalizeBlocks xs -normalizeBlocks (BulletList [] : xs) = normalizeBlocks xs -normalizeBlocks (BulletList items : xs) = - BulletList (map normalizeBlocks items) : normalizeBlocks xs -normalizeBlocks (OrderedList _ [] : xs) = normalizeBlocks xs -normalizeBlocks (OrderedList attr items : xs) = - OrderedList attr (map normalizeBlocks items) : normalizeBlocks xs -normalizeBlocks (DefinitionList [] : xs) = normalizeBlocks xs -normalizeBlocks (DefinitionList items : xs) = - DefinitionList (map go items) : normalizeBlocks xs - where go (ils, bs) = (normalizeInlines ils, map normalizeBlocks bs) -normalizeBlocks (RawBlock _ "" : xs) = normalizeBlocks xs -normalizeBlocks (RawBlock f x : xs) = - case normalizeBlocks xs of - (RawBlock f' x' : rest) | f' == f -> - RawBlock f (x ++ ('\n':x')) : rest - rest -> RawBlock f x : rest -normalizeBlocks (Para ils : xs) = - case normalizeInlines ils of - [] -> normalizeBlocks xs - ils' -> Para ils' : normalizeBlocks xs -normalizeBlocks (Plain ils : xs) = - case normalizeInlines ils of - [] -> normalizeBlocks xs - ils' -> Plain ils' : normalizeBlocks xs -normalizeBlocks (Header lev attr ils : xs) = - Header lev attr (normalizeInlines ils) : normalizeBlocks xs -normalizeBlocks (Table capt aligns widths hdrs rows : xs) = - Table (normalizeInlines capt) aligns widths - (map normalizeBlocks hdrs) (map (map normalizeBlocks) rows) - : normalizeBlocks xs -normalizeBlocks (x:xs) = x : normalizeBlocks xs -normalizeBlocks [] = [] - -normalizeInlines :: [Inline] -> [Inline] -normalizeInlines (Str x : ys) = - case concat (x : map fromStr strs) of - "" -> rest - n -> Str n : rest - where - (strs, rest) = span isStr $ normalizeInlines ys - isStr (Str _) = True - isStr _ = False - fromStr (Str z) = z - fromStr _ = error "normalizeInlines - fromStr - not a Str" -normalizeInlines (Space : SoftBreak : ys) = - SoftBreak : normalizeInlines ys -normalizeInlines (Space : ys) = - if null rest - then [] - else Space : rest - where isSp Space = True - isSp _ = False - rest = dropWhile isSp $ normalizeInlines ys -normalizeInlines (Emph xs : zs) = - case normalizeInlines zs of - (Emph ys : rest) -> normalizeInlines $ - Emph (normalizeInlines $ xs ++ ys) : rest - rest -> case normalizeInlines xs of - [] -> rest - xs' -> Emph xs' : rest -normalizeInlines (Strong xs : zs) = - case normalizeInlines zs of - (Strong ys : rest) -> normalizeInlines $ - Strong (normalizeInlines $ xs ++ ys) : rest - rest -> case normalizeInlines xs of - [] -> rest - xs' -> Strong xs' : rest -normalizeInlines (Subscript xs : zs) = - case normalizeInlines zs of - (Subscript ys : rest) -> normalizeInlines $ - Subscript (normalizeInlines $ xs ++ ys) : rest - rest -> case normalizeInlines xs of - [] -> rest - xs' -> Subscript xs' : rest -normalizeInlines (Superscript xs : zs) = - case normalizeInlines zs of - (Superscript ys : rest) -> normalizeInlines $ - Superscript (normalizeInlines $ xs ++ ys) : rest - rest -> case normalizeInlines xs of - [] -> rest - xs' -> Superscript xs' : rest -normalizeInlines (SmallCaps xs : zs) = - case normalizeInlines zs of - (SmallCaps ys : rest) -> normalizeInlines $ - SmallCaps (normalizeInlines $ xs ++ ys) : rest - rest -> case normalizeInlines xs of - [] -> rest - xs' -> SmallCaps xs' : rest -normalizeInlines (Strikeout xs : zs) = - case normalizeInlines zs of - (Strikeout ys : rest) -> normalizeInlines $ - Strikeout (normalizeInlines $ xs ++ ys) : rest - rest -> case normalizeInlines xs of - [] -> rest - xs' -> Strikeout xs' : rest -normalizeInlines (RawInline _ [] : ys) = normalizeInlines ys -normalizeInlines (RawInline f xs : zs) = - case normalizeInlines zs of - (RawInline f' ys : rest) | f == f' -> normalizeInlines $ - RawInline f (xs ++ ys) : rest - rest -> RawInline f xs : rest -normalizeInlines (Code _ "" : ys) = normalizeInlines ys -normalizeInlines (Code attr xs : zs) = - case normalizeInlines zs of - (Code attr' ys : rest) | attr == attr' -> normalizeInlines $ - Code attr (xs ++ ys) : rest - rest -> Code attr xs : rest --- allow empty spans, they may carry identifiers etc. --- normalizeInlines (Span _ [] : ys) = normalizeInlines ys -normalizeInlines (Span attr xs : zs) = - case normalizeInlines zs of - (Span attr' ys : rest) | attr == attr' -> normalizeInlines $ - Span attr (normalizeInlines $ xs ++ ys) : rest - rest -> Span attr (normalizeInlines xs) : rest -normalizeInlines (Note bs : ys) = Note (normalizeBlocks bs) : - normalizeInlines ys -normalizeInlines (Quoted qt ils : ys) = - Quoted qt (normalizeInlines ils) : normalizeInlines ys -normalizeInlines (Link attr ils t : ys) = - Link attr (normalizeInlines ils) t : normalizeInlines ys -normalizeInlines (Image attr ils t : ys) = - Image attr (normalizeInlines ils) t : normalizeInlines ys -normalizeInlines (Cite cs ils : ys) = - Cite cs (normalizeInlines ils) : normalizeInlines ys -normalizeInlines (x : xs) = x : normalizeInlines xs -normalizeInlines [] = [] - -- | Extract inlines, removing formatting. removeFormatting :: Walkable Inline a => a -> [Inline] -removeFormatting = query go . walk deNote +removeFormatting = query go . walk (deNote . deQuote) where go :: Inline -> [Inline] - go (Str xs) = [Str xs] - go Space = [Space] - go SoftBreak = [SoftBreak] - go (Code _ x) = [Str x] - go (Math _ x) = [Str x] - go LineBreak = [Space] - go _ = [] - deNote (Note _) = Str "" - deNote x = x + go (Str xs) = [Str xs] + go Space = [Space] + go SoftBreak = [SoftBreak] + go (Code _ x) = [Str x] + go (Math _ x) = [Str x] + go LineBreak = [Space] + go _ = [] + +deNote :: Inline -> Inline +deNote (Note _) = Str "" +deNote x = x + +deQuote :: Inline -> Inline +deQuote (Quoted SingleQuote xs) = + Span ("",[],[]) (Str "\8216" : xs ++ [Str "\8217"]) +deQuote (Quoted DoubleQuote xs) = + Span ("",[],[]) (Str "\8220" : xs ++ [Str "\8221"]) +deQuote x = x -- | Convert pandoc structure to a string with formatting removed. -- Footnotes are skipped (since we don't want their contents in link -- labels). stringify :: Walkable Inline a => a -> String -stringify = query go . walk deNote +stringify = query go . walk (deNote . deQuote) where go :: Inline -> [Char] - go Space = " " - go SoftBreak = " " - go (Str x) = x - go (Code _ x) = x - go (Math _ x) = x + go Space = " " + go SoftBreak = " " + go (Str x) = x + go (Code _ x) = x + go (Math _ x) = x go (RawInline (Format "html") ('<':'b':'r':_)) = " " -- see #2105 - go LineBreak = " " - go _ = "" - deNote (Note _) = Str "" - deNote x = x + go LineBreak = " " + go _ = "" -- | Bring all regular text in a pandoc structure to uppercase. -- @@ -589,28 +388,12 @@ capitalize = walk go go x = x -- | Change final list item from @Para@ to @Plain@ if the list contains --- no other @Para@ blocks. -compactify :: [[Block]] -- ^ List of list items (each a list of blocks) - -> [[Block]] -compactify [] = [] -compactify items = - case (init items, last items) of - (_,[]) -> items - (others, final) -> - case last final of - Para a -> case (filter isPara $ concat items) of - -- if this is only Para, change to Plain - [_] -> others ++ [init final ++ [Plain a]] - _ -> items - _ -> items - --- | Change final list item from @Para@ to @Plain@ if the list contains -- no other @Para@ blocks. Like compactify, but operates on @Blocks@ rather -- than @[Block]@. -compactify' :: [Blocks] -- ^ List of list items (each a list of blocks) +compactify :: [Blocks] -- ^ List of list items (each a list of blocks) -> [Blocks] -compactify' [] = [] -compactify' items = +compactify [] = [] +compactify items = let (others, final) = (init items, last items) in case reverse (B.toList final) of (Para a:xs) -> case [Para x | Para x <- concatMap B.toList items] of @@ -619,9 +402,9 @@ compactify' items = _ -> items _ -> items --- | Like @compactify'@, but acts on items of definition lists. -compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] -compactify'DL items = +-- | Like @compactify@, but acts on items of definition lists. +compactifyDL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] +compactifyDL items = let defs = concatMap snd items in case reverse (concatMap B.toList defs) of (Para x:xs) @@ -663,7 +446,7 @@ instance Walkable Inline Element where ils' <- walkM f ils elts' <- walkM f elts return $ Sec lev nums attr ils' elts' - query f (Blk x) = query f x + query f (Blk x) = query f x query f (Sec _ _ _ ils elts) = query f ils <> query f elts instance Walkable Block Element where @@ -674,7 +457,7 @@ instance Walkable Block Element where ils' <- walkM f ils elts' <- walkM f elts return $ Sec lev nums attr ils' elts' - query f (Blk x) = query f x + query f (Blk x) = query f x query f (Sec _ _ _ ils elts) = query f ils <> query f elts @@ -687,8 +470,8 @@ inlineListToIdentifier = map (nbspToSp . toLower) . filter (\c -> isLetter c || isDigit c || c `elem` "_-. ") . stringify - where nbspToSp '\160' = ' ' - nbspToSp x = x + where nbspToSp '\160' = ' ' + nbspToSp x = x -- | Convert list of Pandoc blocks into (hierarchical) list of Elements hierarchicalize :: [Block] -> [Element] @@ -696,7 +479,7 @@ hierarchicalize blocks = S.evalState (hierarchicalizeWithIds blocks) [] hierarchicalizeWithIds :: [Block] -> S.State [Int] [Element] hierarchicalizeWithIds [] = return [] -hierarchicalizeWithIds ((Header level attr@(_,classes,_) title'):xs) = do +hierarchicalizeWithIds (Header level attr@(_,classes,_) title':xs) = do lastnum <- S.get let lastnum' = take level lastnum let newnum = case length lastnum' of @@ -709,26 +492,26 @@ hierarchicalizeWithIds ((Header level attr@(_,classes,_) title'):xs) = do sectionContents' <- hierarchicalizeWithIds sectionContents rest' <- hierarchicalizeWithIds rest return $ Sec level newnum attr title' sectionContents' : rest' -hierarchicalizeWithIds ((Div ("",["references"],[]) - (Header level (ident,classes,kvs) title' : xs)):ys) = - hierarchicalizeWithIds ((Header level (ident,("references":classes),kvs) - title') : (xs ++ ys)) +hierarchicalizeWithIds (Div ("",["references"],[]) + (Header level (ident,classes,kvs) title' : xs):ys) = + hierarchicalizeWithIds (Header level (ident,"references":classes,kvs) + title' : (xs ++ ys)) hierarchicalizeWithIds (x:rest) = do rest' <- hierarchicalizeWithIds rest - return $ (Blk x) : rest' + return $ Blk x : rest' headerLtEq :: Int -> Block -> Bool -headerLtEq level (Header l _ _) = l <= level -headerLtEq level (Div ("",["references"],[]) (Header l _ _ : _)) = l <= level -headerLtEq _ _ = False +headerLtEq level (Header l _ _) = l <= level +headerLtEq level (Div ("",["references"],[]) (Header l _ _ : _)) = l <= level +headerLtEq _ _ = False -- | Generate a unique identifier from a list of inlines. -- Second argument is a list of already used identifiers. uniqueIdent :: [Inline] -> Set.Set String -> String uniqueIdent title' usedIdents = let baseIdent = case inlineListToIdentifier title' of - "" -> "section" - x -> x + "" -> "section" + x -> x numIdent n = baseIdent ++ "-" ++ show n in if baseIdent `Set.member` usedIdents then case find (\x -> not $ numIdent x `Set.member` usedIdents) ([1..60000] :: [Int]) of @@ -738,8 +521,8 @@ uniqueIdent title' usedIdents -- | True if block is a Header block. isHeaderBlock :: Block -> Bool -isHeaderBlock (Header _ _ _) = True -isHeaderBlock _ = False +isHeaderBlock Header{} = True +isHeaderBlock _ = False -- | Shift header levels up or down. headerShift :: Int -> Pandoc -> Pandoc @@ -748,6 +531,14 @@ headerShift n = walk shift shift (Header level attr inner) = Header (level + n) attr inner shift x = x +-- | Remove empty paragraphs. +stripEmptyParagraphs :: Pandoc -> Pandoc +stripEmptyParagraphs = walk go + where go :: [Block] -> [Block] + go = filter (not . isEmptyParagraph) + isEmptyParagraph (Para []) = True + isEmptyParagraph _ = False + -- | Detect if a list is tight. isTightList :: [[Block]] -> Bool isTightList = all firstIsPlain @@ -765,8 +556,8 @@ addMetaField key val (Meta meta) = Meta $ M.insertWith combine key (toMetaValue val) meta where combine newval (MetaList xs) = MetaList (xs ++ tolist newval) combine newval x = MetaList [x, newval] - tolist (MetaList ys) = ys - tolist y = [y] + tolist (MetaList ys) = ys + tolist y = [y] -- | Create 'Meta' from old-style title, authors, date. This is -- provided to ease the transition from the old API. @@ -774,8 +565,24 @@ makeMeta :: [Inline] -> [[Inline]] -> [Inline] -> Meta makeMeta title authors date = addMetaField "title" (B.fromList title) $ addMetaField "author" (map B.fromList authors) - $ addMetaField "date" (B.fromList date) - $ nullMeta + $ addMetaField "date" (B.fromList date) nullMeta + +-- | Remove soft breaks between East Asian characters. +eastAsianLineBreakFilter :: Pandoc -> Pandoc +eastAsianLineBreakFilter = bottomUp go + where go (x:SoftBreak:y:zs) = + case (stringify x, stringify y) of + (xs@(_:_), c:_) + | charWidth (last xs) == 2 && charWidth c == 2 -> x:y:zs + _ -> x:SoftBreak:y:zs + go xs = xs + +-- | Builder for underline. +-- This probably belongs in Builder.hs in pandoc-types. +-- Will be replaced once Underline is an element. +underlineSpan :: Inlines -> Inlines +underlineSpan = B.spanWith ("", ["underline"], []) + -- -- TagSoup HTML handling @@ -787,7 +594,7 @@ renderTags' = renderTagsOptions renderOptions{ optMinimize = matchTags ["hr", "br", "img", "meta", "link"] , optRawTag = matchTags ["script", "style"] } - where matchTags = \tags -> flip elem tags . map toLower + where matchTags tags = flip elem tags . map toLower -- -- File handling @@ -800,226 +607,14 @@ inDirectory path action = E.bracket setCurrentDirectory (const $ setCurrentDirectory path >> action) -getDefaultReferenceDocx :: Maybe FilePath -> IO Archive -getDefaultReferenceDocx datadir = do - let paths = ["[Content_Types].xml", - "_rels/.rels", - "docProps/app.xml", - "docProps/core.xml", - "word/document.xml", - "word/fontTable.xml", - "word/footnotes.xml", - "word/numbering.xml", - "word/settings.xml", - "word/webSettings.xml", - "word/styles.xml", - "word/_rels/document.xml.rels", - "word/_rels/footnotes.xml.rels", - "word/theme/theme1.xml"] - let toLazy = fromChunks . (:[]) - let pathToEntry path = do epochtime <- (floor . utcTimeToPOSIXSeconds) <$> - getCurrentTime - contents <- toLazy <$> readDataFile datadir - ("docx/" ++ path) - return $ toEntry path epochtime contents - mbArchive <- case datadir of - Nothing -> return Nothing - Just d -> do - exists <- doesFileExist (d </> "reference.docx") - if exists - then return (Just (d </> "reference.docx")) - else return Nothing - case mbArchive of - Just arch -> toArchive <$> BL.readFile arch - Nothing -> foldr addEntryToArchive emptyArchive <$> - mapM pathToEntry paths - -getDefaultReferenceODT :: Maybe FilePath -> IO Archive -getDefaultReferenceODT datadir = do - let paths = ["mimetype", - "manifest.rdf", - "styles.xml", - "content.xml", - "meta.xml", - "settings.xml", - "Configurations2/accelerator/current.xml", - "Thumbnails/thumbnail.png", - "META-INF/manifest.xml"] - let pathToEntry path = do epochtime <- floor `fmap` getPOSIXTime - contents <- (fromChunks . (:[])) `fmap` - readDataFile datadir ("odt/" ++ path) - return $ toEntry path epochtime contents - mbArchive <- case datadir of - Nothing -> return Nothing - Just d -> do - exists <- doesFileExist (d </> "reference.odt") - if exists - then return (Just (d </> "reference.odt")) - else return Nothing - case mbArchive of - Just arch -> toArchive <$> BL.readFile arch - Nothing -> foldr addEntryToArchive emptyArchive <$> - mapM pathToEntry paths - - -readDefaultDataFile :: FilePath -> IO BS.ByteString -readDefaultDataFile "reference.docx" = - (BS.concat . toChunks . fromArchive) <$> getDefaultReferenceDocx Nothing -readDefaultDataFile "reference.odt" = - (BS.concat . toChunks . fromArchive) <$> getDefaultReferenceODT Nothing -readDefaultDataFile fname = -#ifdef EMBED_DATA_FILES - case lookup (makeCanonical fname) dataFiles of - Nothing -> err 97 $ "Could not find data file " ++ fname - Just contents -> return contents - where makeCanonical = Posix.joinPath . transformPathParts . splitDirectories - transformPathParts = reverse . foldl go [] - go as "." = as - go (_:as) ".." = as - go as x = x : as -#else - getDataFileName fname' >>= checkExistence >>= BS.readFile - where fname' = if fname == "MANUAL.txt" then fname else "data" </> fname - -checkExistence :: FilePath -> IO FilePath -checkExistence fn = do - exists <- doesFileExist fn - if exists - then return fn - else err 97 ("Could not find data file " ++ fn) -#endif - --- | Read file from specified user data directory or, if not found there, from --- Cabal data directory. -readDataFile :: Maybe FilePath -> FilePath -> IO BS.ByteString -readDataFile Nothing fname = readDefaultDataFile fname -readDataFile (Just userDir) fname = do - exists <- doesFileExist (userDir </> fname) - if exists - then BS.readFile (userDir </> fname) - else readDefaultDataFile fname - --- | Same as 'readDataFile' but returns a String instead of a ByteString. -readDataFileUTF8 :: Maybe FilePath -> FilePath -> IO String -readDataFileUTF8 userDir fname = - UTF8.toString `fmap` readDataFile userDir fname - --- | Specialized version of parseURIReference that disallows --- single-letter schemes. Reason: these are usually windows absolute --- paths. -parseURIReference' :: String -> Maybe URI -parseURIReference' s = - case parseURIReference s of - Just u - | length (uriScheme u) > 2 -> Just u - | null (uriScheme u) -> Just u -- protocol-relative - _ -> Nothing - --- | Fetch an image or other item from the local filesystem or the net. --- Returns raw content and maybe mime type. -fetchItem :: Maybe String -> String - -> IO (Either E.SomeException (BS.ByteString, Maybe MimeType)) -fetchItem sourceURL s = - case (sourceURL >>= parseURIReference' . ensureEscaped, ensureEscaped s) of - (Just u, s') -> -- try fetching from relative path at source - case parseURIReference' s' of - Just u' -> openURL $ show $ u' `nonStrictRelativeTo` u - Nothing -> openURL s' -- will throw error - (Nothing, s'@('/':'/':_)) -> -- protocol-relative URI - case parseURIReference' s' of - Just u' -> openURL $ show $ u' `nonStrictRelativeTo` httpcolon - Nothing -> openURL s' -- will throw error - (Nothing, s') -> - case parseURI s' of -- requires absolute URI - -- We don't want to treat C:/ as a scheme: - Just u' | length (uriScheme u') > 2 -> openURL (show u') - Just u' | uriScheme u' == "file:" -> - E.try $ readLocalFile $ dropWhile (=='/') (uriPath u') - _ -> E.try $ readLocalFile fp -- get from local file system - where readLocalFile f = do - cont <- BS.readFile f - return (cont, mime) - httpcolon = URI{ uriScheme = "http:", - uriAuthority = Nothing, - uriPath = "", - uriQuery = "", - uriFragment = "" } - dropFragmentAndQuery = takeWhile (\c -> c /= '?' && c /= '#') - fp = unEscapeString $ dropFragmentAndQuery s - mime = case takeExtension fp of - ".gz" -> getMimeType $ dropExtension fp - ".svgz" -> getMimeType $ dropExtension fp ++ ".svg" - x -> getMimeType x - ensureEscaped = escapeURIString isAllowedInURI . map convertSlash - convertSlash '\\' = '/' - convertSlash x = x - --- | Like 'fetchItem', but also looks for items in a 'MediaBag'. -fetchItem' :: MediaBag -> Maybe String -> String - -> IO (Either E.SomeException (BS.ByteString, Maybe MimeType)) -fetchItem' media sourceURL s = do - case lookupMedia s media of - Nothing -> fetchItem sourceURL s - Just (mime, bs) -> return $ Right (BS.concat $ toChunks bs, Just mime) - --- | Read from a URL and return raw data and maybe mime type. -openURL :: String -> IO (Either E.SomeException (BS.ByteString, Maybe MimeType)) -openURL u - | Just u'' <- stripPrefix "data:" u = - let mime = takeWhile (/=',') u'' - contents = B8.pack $ unEscapeString $ drop 1 $ dropWhile (/=',') u'' - in return $ Right (decodeLenient contents, Just mime) -#ifdef HTTP_CLIENT - | otherwise = withSocketsDo $ E.try $ do - let parseReq = parseRequest - (proxy :: Either E.SomeException String) <- E.try $ getEnv "http_proxy" - req <- parseReq u - req' <- case proxy of - Left _ -> return req - Right pr -> (parseReq pr >>= \r -> - return $ addProxy (host r) (port r) req) - `mplus` return req - resp <- newManager tlsManagerSettings >>= httpLbs req' - return (BS.concat $ toChunks $ responseBody resp, - UTF8.toString `fmap` lookup hContentType (responseHeaders resp)) -#else - | otherwise = E.try $ getBodyAndMimeType `fmap` browse - (do liftIO $ UTF8.hPutStrLn stderr $ "Fetching " ++ u ++ "..." - setOutHandler $ const (return ()) - setAllowRedirects True - request (getRequest' u')) - where getBodyAndMimeType (_, r) = (rspBody r, findHeader HdrContentType r) - getRequest' uriString = case parseURI uriString of - Nothing -> error ("Not a valid URL: " ++ - uriString) - Just v -> mkRequest GET v - u' = escapeURIString (/= '|') u -- pipes are rejected by Network.URI -#endif - -- -- Error reporting -- -err :: Int -> String -> IO a -err exitCode msg = do - name <- getProgName - UTF8.hPutStrLn stderr $ name ++ ": " ++ msg - exitWith $ ExitFailure exitCode - return undefined - -warn :: MonadIO m => String -> m () -warn msg = liftIO $ do - name <- getProgName - UTF8.hPutStrLn stderr $ "[" ++ name ++ " warning] " ++ msg - mapLeft :: (a -> b) -> Either a c -> Either b c -mapLeft f (Left x) = Left (f x) +mapLeft f (Left x) = Left (f x) mapLeft _ (Right x) = Right x -hush :: Either a b -> Maybe b -hush (Left _) = Nothing -hush (Right x) = Just x - -- | Remove intermediate "." and ".." directories from a path. -- -- > collapseFilePath "./foo" == "foo" @@ -1034,14 +629,14 @@ collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories where go rs "." = rs go r@(p:rs) ".." = case p of - ".." -> ("..":r) - (checkPathSeperator -> Just True) -> ("..":r) - _ -> rs + ".." -> "..":r + (checkPathSeperator -> Just True) -> "..":r + _ -> rs go _ (checkPathSeperator -> Just True) = [[Posix.pathSeparator]] go rs x = x:rs - isSingleton [] = Nothing + isSingleton [] = Nothing isSingleton [x] = Just x - isSingleton _ = Nothing + isSingleton _ = Nothing checkPathSeperator = fmap isPathSeparator . isSingleton -- @@ -1054,41 +649,109 @@ filteredFilesFromArchive zf f = fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString) fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e) + +-- +-- IANA URIs +-- + +-- | Schemes from http://www.iana.org/assignments/uri-schemes.html plus +-- the unofficial schemes doi, javascript, isbn, pmid. +schemes :: Set.Set String +schemes = Set.fromList + -- Official IANA schemes + [ "aaa", "aaas", "about", "acap", "acct", "acr", "adiumxtra", "afp", "afs" + , "aim", "appdata", "apt", "attachment", "aw", "barion", "beshare", "bitcoin" + , "blob", "bolo", "browserext", "callto", "cap", "chrome", "chrome-extension" + , "cid", "coap", "coaps", "com-eventbrite-attendee", "content", "crid", "cvs" + , "data", "dav", "dict", "dis", "dlna-playcontainer", "dlna-playsingle" + , "dns", "dntp", "dtn", "dvb", "ed2k", "example", "facetime", "fax", "feed" + , "feedready", "file", "filesystem", "finger", "fish", "ftp", "geo", "gg" + , "git", "gizmoproject", "go", "gopher", "graph", "gtalk", "h323", "ham" + , "hcp", "http", "https", "hxxp", "hxxps", "hydrazone", "iax", "icap", "icon" + , "im", "imap", "info", "iotdisco", "ipn", "ipp", "ipps", "irc", "irc6" + , "ircs", "iris", "iris.beep", "iris.lwz", "iris.xpc", "iris.xpcs" + , "isostore", "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap" + , "ldaps", "lvlt", "magnet", "mailserver", "mailto", "maps", "market" + , "message", "mid", "mms", "modem", "mongodb", "moz", "ms-access" + , "ms-browser-extension", "ms-drive-to", "ms-enrollment", "ms-excel" + , "ms-gamebarservices", "ms-getoffice", "ms-help", "ms-infopath" + , "ms-media-stream-id", "ms-officeapp", "ms-project", "ms-powerpoint" + , "ms-publisher", "ms-search-repair", "ms-secondary-screen-controller" + , "ms-secondary-screen-setup", "ms-settings", "ms-settings-airplanemode" + , "ms-settings-bluetooth", "ms-settings-camera", "ms-settings-cellular" + , "ms-settings-cloudstorage", "ms-settings-connectabledevices" + , "ms-settings-displays-topology", "ms-settings-emailandaccounts" + , "ms-settings-language", "ms-settings-location", "ms-settings-lock" + , "ms-settings-nfctransactions", "ms-settings-notifications" + , "ms-settings-power", "ms-settings-privacy", "ms-settings-proximity" + , "ms-settings-screenrotation", "ms-settings-wifi", "ms-settings-workplace" + , "ms-spd", "ms-sttoverlay", "ms-transit-to", "ms-virtualtouchpad" + , "ms-visio", "ms-walk-to", "ms-whiteboard", "ms-whiteboard-cmd", "ms-word" + , "msnim", "msrp", "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs" + , "ni", "nih", "nntp", "notes", "ocf", "oid", "onenote", "onenote-cmd" + , "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop" + , "pres", "prospero", "proxy", "pwid", "psyc", "qb", "query", "redis" + , "rediss", "reload", "res", "resource", "rmi", "rsync", "rtmfp", "rtmp" + , "rtsp", "rtsps", "rtspu", "secondlife", "service", "session", "sftp", "sgn" + , "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "smtp", "snews" + , "snmp", "soap.beep", "soap.beeps", "soldat", "spotify", "ssh", "steam" + , "stun", "stuns", "submit", "svn", "tag", "teamspeak", "tel", "teliaeid" + , "telnet", "tftp", "things", "thismessage", "tip", "tn3270", "tool", "turn" + , "turns", "tv", "udp", "unreal", "urn", "ut2004", "v-event", "vemmi" + , "ventrilo", "videotex", "vnc", "view-source", "wais", "webcal", "wpid" + , "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire" + , "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri", "ymsgr", "z39.50", "z39.50r" + , "z39.50s" + -- Inofficial schemes + , "doi", "isbn", "javascript", "pmid" + ] + +-- | Check if the string is a valid URL with a IANA or frequently used but +-- unofficial scheme (see @schemes@). +isURI :: String -> Bool +isURI = maybe False hasKnownScheme . parseURI + where + hasKnownScheme = (`Set.member` schemes) . map toLower . + filter (/= ':') . uriScheme + --- --- Squash blocks into inlines --- -blockToInlines :: Block -> [Inline] -blockToInlines (Plain ils) = ils -blockToInlines (Para ils) = ils -blockToInlines (LineBlock lns) = combineLines lns -blockToInlines (CodeBlock attr str) = [Code attr str] -blockToInlines (RawBlock fmt str) = [RawInline fmt str] -blockToInlines (BlockQuote blks) = blocksToInlines blks +blockToInlines :: Block -> Inlines +blockToInlines (Plain ils) = B.fromList ils +blockToInlines (Para ils) = B.fromList ils +blockToInlines (LineBlock lns) = B.fromList $ combineLines lns +blockToInlines (CodeBlock attr str) = B.codeWith attr str +blockToInlines (RawBlock (Format fmt) str) = B.rawInline fmt str +blockToInlines (BlockQuote blks) = blocksToInlines' blks blockToInlines (OrderedList _ blkslst) = - concatMap blocksToInlines blkslst + mconcat $ map blocksToInlines' blkslst blockToInlines (BulletList blkslst) = - concatMap blocksToInlines blkslst + mconcat $ map blocksToInlines' blkslst blockToInlines (DefinitionList pairslst) = - concatMap f pairslst + mconcat $ map f pairslst where - f (ils, blkslst) = ils ++ - [Str ":", Space] ++ - (concatMap blocksToInlines blkslst) -blockToInlines (Header _ _ ils) = ils -blockToInlines (HorizontalRule) = [] + f (ils, blkslst) = B.fromList ils <> B.str ":" <> B.space <> + mconcat (map blocksToInlines' blkslst) +blockToInlines (Header _ _ ils) = B.fromList ils +blockToInlines HorizontalRule = mempty blockToInlines (Table _ _ _ headers rows) = - intercalate [LineBreak] $ map (concatMap blocksToInlines) tbl - where - tbl = headers : rows -blockToInlines (Div _ blks) = blocksToInlines blks -blockToInlines Null = [] + mconcat $ intersperse B.linebreak $ + map (mconcat . map blocksToInlines') (headers:rows) +blockToInlines (Div _ blks) = blocksToInlines' blks +blockToInlines Null = mempty + +blocksToInlinesWithSep :: Inlines -> [Block] -> Inlines +blocksToInlinesWithSep sep = + mconcat . intersperse sep . map blockToInlines -blocksToInlinesWithSep :: [Inline] -> [Block] -> [Inline] -blocksToInlinesWithSep sep blks = intercalate sep $ map blockToInlines blks +blocksToInlines' :: [Block] -> Inlines +blocksToInlines' = blocksToInlinesWithSep parSep + where parSep = B.space <> B.str "¶" <> B.space blocksToInlines :: [Block] -> [Inline] -blocksToInlines = blocksToInlinesWithSep [Space, Str "¶", Space] +blocksToInlines = B.toList . blocksToInlines' -- |