diff options
author | dr@jones.dk <dr@jones.dk> | 2010-03-29 21:36:23 +0200 |
---|---|---|
committer | dr@jones.dk <dr@jones.dk> | 2010-03-29 21:36:23 +0200 |
commit | c5408a001e497aed5733e00346bcba7e06cb65ba (patch) | |
tree | da10f02052410c5d7c5db1d8987fe88e4a5bb757 /src/Text/Pandoc/Shared.hs | |
parent | 96d4f941026a8eca3ba211facdc8ce66b2ab38bb (diff) |
Imported Upstream version 1.5.1.1
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 119 |
1 files changed, 70 insertions, 49 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index f093ddbee..26aff4250 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -1,6 +1,6 @@ {-# LANGUAGE DeriveDataTypeable #-} {- -Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Shared - Copyright : Copyright (C) 2006-8 John MacFarlane + Copyright : Copyright (C) 2006-2010 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -28,7 +28,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Utility functions and definitions used by the various Pandoc modules. -} -module Text.Pandoc.Shared ( +module Text.Pandoc.Shared ( -- * List processing splitBy, splitByIndices, @@ -43,6 +43,8 @@ module Text.Pandoc.Shared ( stripFirstAndLast, camelCaseToHyphenated, toRomanNumeral, + escapeURI, + unescapeURI, wrapped, wrapIfNeeded, wrappedTeX, @@ -114,10 +116,11 @@ import Text.ParserCombinators.Parsec import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest ) import qualified Text.PrettyPrint.HughesPJ as PP import Text.Pandoc.CharacterReferences ( characterReference ) -import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, - isPunctuation ) +import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, isAscii, + isLetter, isDigit ) import Data.List ( find, isPrefixOf, intercalate ) -import Network.URI ( parseURI, URI (..), isAllowedInURI ) +import Network.URI ( parseURI, URI (..), isAllowedInURI, escapeURIString, unEscapeString ) +import Codec.Binary.UTF8.String ( encodeString, decodeString ) import System.Directory import System.FilePath ( (</>) ) -- Note: ghc >= 6.12 (base >=4.2) supports unicode through iconv @@ -131,6 +134,7 @@ import Data.Generics import qualified Control.Monad.State as S import Control.Monad (join) import Paths_pandoc (getDataFileName) + -- -- List processing -- @@ -228,6 +232,17 @@ toRomanNumeral x = _ | x >= 1 -> "I" ++ toRomanNumeral (x - 1) _ -> "" +-- | Escape unicode characters in a URI. Characters that are +-- already valid in a URI, including % and ?, are left alone. +escapeURI :: String -> String +escapeURI = escapeURIString isAllowedInURI . encodeString + +-- | Unescape unicode and some special characters in a URI, but +-- without introducing spaces. +unescapeURI :: String -> String +unescapeURI = escapeURIString (\c -> isAllowedInURI c || not (isAscii c)) . + decodeString . unEscapeString + -- | Wrap inlines to line length. wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>= @@ -479,25 +494,30 @@ domain = do dom <- many1 $ try (char '.' >> many1 domainChar ) return $ intercalate "." (first:dom) --- | Parses an email address; returns string. -emailAddress :: GenParser Char st [Char] +-- | Parses an email address; returns original and corresponding +-- escaped mailto: URI. +emailAddress :: GenParser Char st (String, String) emailAddress = try $ do firstLetter <- alphaNum restAddr <- many emailChar let addr = firstLetter:restAddr char '@' dom <- domain - return $ addr ++ '@':dom + let full = addr ++ '@':dom + return (full, escapeURI $ "mailto:" ++ full) --- | Parses a URI. -uri :: GenParser Char st String +-- | Parses a URI. Returns pair of original and URI-escaped version. +uri :: GenParser Char st (String, String) uri = try $ do - str <- many1 $ satisfy isAllowedInURI - case parseURI str of - Just uri' -> if uriScheme uri' `elem` [ "http:", "https:", "ftp:", - "file:", "mailto:", - "news:", "telnet:" ] - then return $ show uri' + let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:", + "news:", "telnet:" ] + lookAhead $ oneOfStrings protocols + -- scan non-ascii characters and ascii characters allowed in a URI + str <- many1 $ satisfy (\c -> not (isAscii c) || isAllowedInURI c) + -- now see if they amount to an absolute URI + case parseURI (escapeURI str) of + Just uri' -> if uriScheme uri' `elem` protocols + then return (str, show uri') else fail "not a URI" Nothing -> fail "not a URI" @@ -894,38 +914,37 @@ data Element = Blk Block -- lvl num ident label contents deriving (Eq, Read, Show, Typeable, Data) --- | Convert Pandoc inline list to plain text identifier. +-- | Convert Pandoc inline list to plain text identifier. HTML +-- identifiers must start with a letter, and may contain only +-- letters, digits, and the characters _-:. inlineListToIdentifier :: [Inline] -> String -inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier' - -inlineListToIdentifier' :: [Inline] -> [Char] -inlineListToIdentifier' [] = "" -inlineListToIdentifier' (x:xs) = - xAsText ++ inlineListToIdentifier' xs - where xAsText = case x of - Str s -> filter (\c -> c `elem` "_-." || not (isPunctuation c)) $ - intercalate "-" $ words $ map toLower s - Emph lst -> inlineListToIdentifier' lst - Strikeout lst -> inlineListToIdentifier' lst - Superscript lst -> inlineListToIdentifier' lst - SmallCaps lst -> inlineListToIdentifier' lst - Subscript lst -> inlineListToIdentifier' lst - Strong lst -> inlineListToIdentifier' lst - Quoted _ lst -> inlineListToIdentifier' lst - Cite _ lst -> inlineListToIdentifier' lst - Code s -> s - Space -> "-" - EmDash -> "-" - EnDash -> "-" - Apostrophe -> "" - Ellipses -> "" - LineBreak -> "-" - Math _ _ -> "" - TeX _ -> "" - HtmlInline _ -> "" - Link lst _ -> inlineListToIdentifier' lst - Image lst _ -> inlineListToIdentifier' lst - Note _ -> "" +inlineListToIdentifier = + dropWhile (not . isAlpha) . intercalate "-" . words . map toLower . + filter (\c -> isLetter c || isDigit c || c `elem` "_-:. ") . + concatMap extractText + where extractText x = case x of + Str s -> s + Emph lst -> concatMap extractText lst + Strikeout lst -> concatMap extractText lst + Superscript lst -> concatMap extractText lst + SmallCaps lst -> concatMap extractText lst + Subscript lst -> concatMap extractText lst + Strong lst -> concatMap extractText lst + Quoted _ lst -> concatMap extractText lst + Cite _ lst -> concatMap extractText lst + Code s -> s + Space -> " " + EmDash -> "---" + EnDash -> "--" + Apostrophe -> "" + Ellipses -> "..." + LineBreak -> " " + Math _ s -> s + TeX _ -> "" + HtmlInline _ -> "" + Link lst _ -> concatMap extractText lst + Image lst _ -> concatMap extractText lst + Note _ -> "" -- | Convert list of Pandoc blocks into (hierarchical) list of Elements hierarchicalize :: [Block] -> [Element] @@ -957,7 +976,9 @@ headerLtEq _ _ = False -- Second argument is a list of already used identifiers. uniqueIdent :: [Inline] -> [String] -> String uniqueIdent title' usedIdents = - let baseIdent = inlineListToIdentifier title' + let baseIdent = case inlineListToIdentifier title' of + "" -> "section" + x -> x numIdent n = baseIdent ++ "-" ++ show n in if baseIdent `elem` usedIdents then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of |