summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Shared.hs
diff options
context:
space:
mode:
authordr@jones.dk <dr@jones.dk>2010-03-29 21:36:23 +0200
committerdr@jones.dk <dr@jones.dk>2010-03-29 21:36:23 +0200
commitc5408a001e497aed5733e00346bcba7e06cb65ba (patch)
treeda10f02052410c5d7c5db1d8987fe88e4a5bb757 /src/Text/Pandoc/Shared.hs
parent96d4f941026a8eca3ba211facdc8ce66b2ab38bb (diff)
Imported Upstream version 1.5.1.1
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r--src/Text/Pandoc/Shared.hs119
1 files changed, 70 insertions, 49 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index f093ddbee..26aff4250 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -1,6 +1,6 @@
{-# LANGUAGE DeriveDataTypeable #-}
{-
-Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Shared
- Copyright : Copyright (C) 2006-8 John MacFarlane
+ Copyright : Copyright (C) 2006-2010 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -28,7 +28,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Utility functions and definitions used by the various Pandoc modules.
-}
-module Text.Pandoc.Shared (
+module Text.Pandoc.Shared (
-- * List processing
splitBy,
splitByIndices,
@@ -43,6 +43,8 @@ module Text.Pandoc.Shared (
stripFirstAndLast,
camelCaseToHyphenated,
toRomanNumeral,
+ escapeURI,
+ unescapeURI,
wrapped,
wrapIfNeeded,
wrappedTeX,
@@ -114,10 +116,11 @@ import Text.ParserCombinators.Parsec
import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
import qualified Text.PrettyPrint.HughesPJ as PP
import Text.Pandoc.CharacterReferences ( characterReference )
-import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha,
- isPunctuation )
+import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, isAscii,
+ isLetter, isDigit )
import Data.List ( find, isPrefixOf, intercalate )
-import Network.URI ( parseURI, URI (..), isAllowedInURI )
+import Network.URI ( parseURI, URI (..), isAllowedInURI, escapeURIString, unEscapeString )
+import Codec.Binary.UTF8.String ( encodeString, decodeString )
import System.Directory
import System.FilePath ( (</>) )
-- Note: ghc >= 6.12 (base >=4.2) supports unicode through iconv
@@ -131,6 +134,7 @@ import Data.Generics
import qualified Control.Monad.State as S
import Control.Monad (join)
import Paths_pandoc (getDataFileName)
+
--
-- List processing
--
@@ -228,6 +232,17 @@ toRomanNumeral x =
_ | x >= 1 -> "I" ++ toRomanNumeral (x - 1)
_ -> ""
+-- | Escape unicode characters in a URI. Characters that are
+-- already valid in a URI, including % and ?, are left alone.
+escapeURI :: String -> String
+escapeURI = escapeURIString isAllowedInURI . encodeString
+
+-- | Unescape unicode and some special characters in a URI, but
+-- without introducing spaces.
+unescapeURI :: String -> String
+unescapeURI = escapeURIString (\c -> isAllowedInURI c || not (isAscii c)) .
+ decodeString . unEscapeString
+
-- | Wrap inlines to line length.
wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc
wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>=
@@ -479,25 +494,30 @@ domain = do
dom <- many1 $ try (char '.' >> many1 domainChar )
return $ intercalate "." (first:dom)
--- | Parses an email address; returns string.
-emailAddress :: GenParser Char st [Char]
+-- | Parses an email address; returns original and corresponding
+-- escaped mailto: URI.
+emailAddress :: GenParser Char st (String, String)
emailAddress = try $ do
firstLetter <- alphaNum
restAddr <- many emailChar
let addr = firstLetter:restAddr
char '@'
dom <- domain
- return $ addr ++ '@':dom
+ let full = addr ++ '@':dom
+ return (full, escapeURI $ "mailto:" ++ full)
--- | Parses a URI.
-uri :: GenParser Char st String
+-- | Parses a URI. Returns pair of original and URI-escaped version.
+uri :: GenParser Char st (String, String)
uri = try $ do
- str <- many1 $ satisfy isAllowedInURI
- case parseURI str of
- Just uri' -> if uriScheme uri' `elem` [ "http:", "https:", "ftp:",
- "file:", "mailto:",
- "news:", "telnet:" ]
- then return $ show uri'
+ let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
+ "news:", "telnet:" ]
+ lookAhead $ oneOfStrings protocols
+ -- scan non-ascii characters and ascii characters allowed in a URI
+ str <- many1 $ satisfy (\c -> not (isAscii c) || isAllowedInURI c)
+ -- now see if they amount to an absolute URI
+ case parseURI (escapeURI str) of
+ Just uri' -> if uriScheme uri' `elem` protocols
+ then return (str, show uri')
else fail "not a URI"
Nothing -> fail "not a URI"
@@ -894,38 +914,37 @@ data Element = Blk Block
-- lvl num ident label contents
deriving (Eq, Read, Show, Typeable, Data)
--- | Convert Pandoc inline list to plain text identifier.
+-- | Convert Pandoc inline list to plain text identifier. HTML
+-- identifiers must start with a letter, and may contain only
+-- letters, digits, and the characters _-:.
inlineListToIdentifier :: [Inline] -> String
-inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier'
-
-inlineListToIdentifier' :: [Inline] -> [Char]
-inlineListToIdentifier' [] = ""
-inlineListToIdentifier' (x:xs) =
- xAsText ++ inlineListToIdentifier' xs
- where xAsText = case x of
- Str s -> filter (\c -> c `elem` "_-." || not (isPunctuation c)) $
- intercalate "-" $ words $ map toLower s
- Emph lst -> inlineListToIdentifier' lst
- Strikeout lst -> inlineListToIdentifier' lst
- Superscript lst -> inlineListToIdentifier' lst
- SmallCaps lst -> inlineListToIdentifier' lst
- Subscript lst -> inlineListToIdentifier' lst
- Strong lst -> inlineListToIdentifier' lst
- Quoted _ lst -> inlineListToIdentifier' lst
- Cite _ lst -> inlineListToIdentifier' lst
- Code s -> s
- Space -> "-"
- EmDash -> "-"
- EnDash -> "-"
- Apostrophe -> ""
- Ellipses -> ""
- LineBreak -> "-"
- Math _ _ -> ""
- TeX _ -> ""
- HtmlInline _ -> ""
- Link lst _ -> inlineListToIdentifier' lst
- Image lst _ -> inlineListToIdentifier' lst
- Note _ -> ""
+inlineListToIdentifier =
+ dropWhile (not . isAlpha) . intercalate "-" . words . map toLower .
+ filter (\c -> isLetter c || isDigit c || c `elem` "_-:. ") .
+ concatMap extractText
+ where extractText x = case x of
+ Str s -> s
+ Emph lst -> concatMap extractText lst
+ Strikeout lst -> concatMap extractText lst
+ Superscript lst -> concatMap extractText lst
+ SmallCaps lst -> concatMap extractText lst
+ Subscript lst -> concatMap extractText lst
+ Strong lst -> concatMap extractText lst
+ Quoted _ lst -> concatMap extractText lst
+ Cite _ lst -> concatMap extractText lst
+ Code s -> s
+ Space -> " "
+ EmDash -> "---"
+ EnDash -> "--"
+ Apostrophe -> ""
+ Ellipses -> "..."
+ LineBreak -> " "
+ Math _ s -> s
+ TeX _ -> ""
+ HtmlInline _ -> ""
+ Link lst _ -> concatMap extractText lst
+ Image lst _ -> concatMap extractText lst
+ Note _ -> ""
-- | Convert list of Pandoc blocks into (hierarchical) list of Elements
hierarchicalize :: [Block] -> [Element]
@@ -957,7 +976,9 @@ headerLtEq _ _ = False
-- Second argument is a list of already used identifiers.
uniqueIdent :: [Inline] -> [String] -> String
uniqueIdent title' usedIdents =
- let baseIdent = inlineListToIdentifier title'
+ let baseIdent = case inlineListToIdentifier title' of
+ "" -> "section"
+ x -> x
numIdent n = baseIdent ++ "-" ++ show n
in if baseIdent `elem` usedIdents
then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of