Imported Upstream version 1.5.1.1

author: dr@jones.dk <dr@jones.dk> 2010-03-29 21:36:23 +0200
committer: dr@jones.dk <dr@jones.dk> 2010-03-29 21:36:23 +0200
commit: c5408a001e497aed5733e00346bcba7e06cb65ba (patch)
tree: da10f02052410c5d7c5db1d8987fe88e4a5bb757 /src/Text/Pandoc/Shared.hs
parent: 96d4f941026a8eca3ba211facdc8ce66b2ab38bb (diff)
1 files changed, 70 insertions, 49 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index f093ddbee..26aff4250 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -1,6 +1,6 @@
 {-# LANGUAGE DeriveDataTypeable #-}
 {-
-Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
 {- |
    Module      : Text.Pandoc.Shared
-   Copyright   : Copyright (C) 2006-8 John MacFarlane
+   Copyright   : Copyright (C) 2006-2010 John MacFarlane
    License     : GNU GPL, version 2 or above 
 
    Maintainer  : John MacFarlane <jgm@berkeley.edu>
@@ -28,7 +28,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
 Utility functions and definitions used by the various Pandoc modules.
 -}
-module Text.Pandoc.Shared ( 
+module Text.Pandoc.Shared (
                      -- * List processing
                      splitBy,
                      splitByIndices,
@@ -43,6 +43,8 @@ module Text.Pandoc.Shared (
                      stripFirstAndLast,
                      camelCaseToHyphenated,
                      toRomanNumeral,
+                     escapeURI,
+                     unescapeURI,
                      wrapped,
                      wrapIfNeeded,
                      wrappedTeX,
@@ -114,10 +116,11 @@ import Text.ParserCombinators.Parsec
 import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
 import qualified Text.PrettyPrint.HughesPJ as PP
 import Text.Pandoc.CharacterReferences ( characterReference )
-import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha,
-                   isPunctuation )
+import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, isAscii,
+                   isLetter, isDigit )
 import Data.List ( find, isPrefixOf, intercalate )
-import Network.URI ( parseURI, URI (..), isAllowedInURI )
+import Network.URI ( parseURI, URI (..), isAllowedInURI, escapeURIString, unEscapeString )
+import Codec.Binary.UTF8.String ( encodeString, decodeString )
 import System.Directory
 import System.FilePath ( (</>) )
 -- Note: ghc >= 6.12 (base >=4.2) supports unicode through iconv
@@ -131,6 +134,7 @@ import Data.Generics
 import qualified Control.Monad.State as S
 import Control.Monad (join)
 import Paths_pandoc (getDataFileName)
+
 --
 -- List processing
 --
@@ -228,6 +232,17 @@ toRomanNumeral x =
               _ | x >= 1    -> "I" ++ toRomanNumeral (x - 1)
               _             -> ""
 
+-- | Escape unicode characters in a URI.  Characters that are
+-- already valid in a URI, including % and ?, are left alone.
+escapeURI :: String -> String
+escapeURI = escapeURIString isAllowedInURI . encodeString
+
+-- | Unescape unicode and some special characters in a URI, but
+-- without introducing spaces.
+unescapeURI :: String -> String
+unescapeURI = escapeURIString (\c -> isAllowedInURI c || not (isAscii c)) .
+               decodeString . unEscapeString
+
 -- | Wrap inlines to line length.
 wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc
 wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>= 
@@ -479,25 +494,30 @@ domain = do
   dom <- many1 $ try (char '.' >> many1 domainChar )
   return $ intercalate "." (first:dom)
 
--- | Parses an email address; returns string.
-emailAddress :: GenParser Char st [Char]
+-- | Parses an email address; returns original and corresponding
+-- escaped mailto: URI.
+emailAddress :: GenParser Char st (String, String)
 emailAddress = try $ do
   firstLetter <- alphaNum
   restAddr <- many emailChar
   let addr = firstLetter:restAddr
   char '@'
   dom <- domain
-  return $ addr ++ '@':dom
+  let full = addr ++ '@':dom
+  return (full, escapeURI $ "mailto:" ++ full)
 
--- | Parses a URI.
-uri :: GenParser Char st String
+-- | Parses a URI. Returns pair of original and URI-escaped version.
+uri :: GenParser Char st (String, String)
 uri = try $ do
-  str <- many1 $ satisfy isAllowedInURI
-  case parseURI str of
-       Just uri' -> if uriScheme uri' `elem` [ "http:", "https:", "ftp:",
-                                               "file:", "mailto:",
-                                               "news:", "telnet:" ]
-                       then return $ show uri'
+  let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
+                    "news:", "telnet:" ]
+  lookAhead $ oneOfStrings protocols
+  -- scan non-ascii characters and ascii characters allowed in a URI
+  str <- many1 $ satisfy (\c -> not (isAscii c) || isAllowedInURI c)
+  -- now see if they amount to an absolute URI
+  case parseURI (escapeURI str) of
+       Just uri' -> if uriScheme uri' `elem` protocols
+                       then return (str, show uri')
                        else fail "not a URI"
        Nothing   -> fail "not a URI"
 
@@ -894,38 +914,37 @@ data Element = Blk Block
              --    lvl  num ident  label    contents
              deriving (Eq, Read, Show, Typeable, Data)
 
--- | Convert Pandoc inline list to plain text identifier.
+-- | Convert Pandoc inline list to plain text identifier.  HTML
+-- identifiers must start with a letter, and may contain only
+-- letters, digits, and the characters _-:.
 inlineListToIdentifier :: [Inline] -> String
-inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier'
-
-inlineListToIdentifier' :: [Inline] -> [Char]
-inlineListToIdentifier' [] = ""
-inlineListToIdentifier' (x:xs) =
-  xAsText ++ inlineListToIdentifier' xs
-  where xAsText = case x of
-          Str s          -> filter (\c -> c `elem` "_-." || not (isPunctuation c)) $
-                            intercalate "-" $ words $ map toLower s
-          Emph lst       -> inlineListToIdentifier' lst
-          Strikeout lst  -> inlineListToIdentifier' lst
-          Superscript lst -> inlineListToIdentifier' lst
-          SmallCaps   lst -> inlineListToIdentifier' lst
-          Subscript lst  -> inlineListToIdentifier' lst
-          Strong lst     -> inlineListToIdentifier' lst
-          Quoted _ lst   -> inlineListToIdentifier' lst
-          Cite   _ lst   -> inlineListToIdentifier' lst
-          Code s         -> s
-          Space          -> "-"
-          EmDash         -> "-"
-          EnDash         -> "-"
-          Apostrophe     -> ""
-          Ellipses       -> ""
-          LineBreak      -> "-"
-          Math _ _       -> ""
-          TeX _          -> ""
-          HtmlInline _   -> ""
-          Link lst _     -> inlineListToIdentifier' lst
-          Image lst _    -> inlineListToIdentifier' lst
-          Note _         -> ""
+inlineListToIdentifier =
+  dropWhile (not . isAlpha) . intercalate "-" . words . map toLower .
+  filter (\c -> isLetter c || isDigit c || c `elem` "_-:. ") .
+  concatMap extractText
+    where extractText x = case x of
+              Str s           -> s
+              Emph lst        -> concatMap extractText lst
+              Strikeout lst   -> concatMap extractText lst
+              Superscript lst -> concatMap extractText lst
+              SmallCaps   lst -> concatMap extractText lst
+              Subscript lst   -> concatMap extractText lst
+              Strong lst      -> concatMap extractText lst
+              Quoted _ lst    -> concatMap extractText lst
+              Cite   _ lst    -> concatMap extractText lst
+              Code s          -> s
+              Space           -> " "
+              EmDash          -> "---"
+              EnDash          -> "--"
+              Apostrophe      -> ""
+              Ellipses        -> "..."
+              LineBreak       -> " "
+              Math _ s        -> s
+              TeX _           -> ""
+              HtmlInline _    -> ""
+              Link lst _      -> concatMap extractText lst
+              Image lst _     -> concatMap extractText lst
+              Note _          -> ""
 
 -- | Convert list of Pandoc blocks into (hierarchical) list of Elements
 hierarchicalize :: [Block] -> [Element]
@@ -957,7 +976,9 @@ headerLtEq _ _ = False
 -- Second argument is a list of already used identifiers.
 uniqueIdent :: [Inline] -> [String] -> String
 uniqueIdent title' usedIdents =
-  let baseIdent = inlineListToIdentifier title'
+  let baseIdent = case inlineListToIdentifier title' of
+                        ""   -> "section"
+                        x    -> x
       numIdent n = baseIdent ++ "-" ++ show n
   in  if baseIdent `elem` usedIdents
         then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of
author	dr@jones.dk <dr@jones.dk>	2010-03-29 21:36:23 +0200
committer	dr@jones.dk <dr@jones.dk>	2010-03-29 21:36:23 +0200
commit	c5408a001e497aed5733e00346bcba7e06cb65ba (patch)
tree	da10f02052410c5d7c5db1d8987fe88e4a5bb757 /src/Text/Pandoc/Shared.hs
parent	96d4f941026a8eca3ba211facdc8ce66b2ab38bb (diff)