summary | refs | log | tree | commit | diff
path: root/src/Text
diff options
context:
space:
mode:
author: John MacFarlane <jgm@berkeley.edu> 2010-03-23 13:51:52 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2010-03-23 13:51:52 -0700
commit: 9e658673006ca8c934bb75b224fdc0e7144b4030 (patch)
tree: e477eb971c7dca5d18fac1871fa90566bbac73fa /src/Text
parent: 4fbacd5d5bed09980af17d6fb90f5a8fd074ffa3 (diff)
Better definition of stringToURI.
Now it escapes all characters that aren't allowed in a URI. %, ?, /, and other characters that are allowed in a URI are left alone. Unicode high characters are UTF-8 encoded.
Diffstat (limited to 'src/Text')
-rw-r--r-- src/Text/Pandoc/Shared.hs 20
1 file changed, 6 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index 42e3345c8..0e1ace858 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -115,10 +115,11 @@ import Text.ParserCombinators.Parsec
import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
import qualified Text.PrettyPrint.HughesPJ as PP
import Text.Pandoc.CharacterReferences ( characterReference )
-import Data.Char ( toLower, toUpper, ord, chr, isLower, isUpper, isAlpha,
+import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha,
isPunctuation )
import Data.List ( find, isPrefixOf, intercalate )
-import Network.URI ( parseURI, URI (..), isAllowedInURI )
+import Network.URI ( parseURI, URI (..), isAllowedInURI, escapeURIString )
+import Codec.Binary.UTF8.String ( encodeString )
import System.Directory
import System.FilePath ( (</>) )
-- Note: ghc >= 6.12 (base >=4.2) supports unicode through iconv
@@ -131,10 +132,6 @@ import System.IO.UTF8
import Data.Generics
import qualified Control.Monad.State as S
import Control.Monad (join)
-import Data.ByteString (unpack)
-import Data.Word (Word8)
-import Data.ByteString.UTF8 (fromString)
-import Text.Printf (printf)
import Paths_pandoc (getDataFileName)
--
@@ -234,15 +231,10 @@ toRomanNumeral x =
_ | x >= 1 -> "I" ++ toRomanNumeral (x - 1)
_ -> ""
--- | Escape unicode characters in a URI. This means converting
--- them to UTF-8, then URI-encoding the octets. We leave everything
--- else the same, assuming that the user has already escaped
--- special characters like & and %.
+-- | Escape unicode characters in a URI. Characters that are
+-- already valid in a URI, including % and ?, are left alone.
stringToURI :: String -> String
-stringToURI = concatMap encodeOctet . unpack . fromString
- where encodeOctet :: Word8 -> String
- encodeOctet x | x > 127 = printf "%%%2x" x
- encodeOctet x = [chr (fromIntegral x)]
+stringToURI = escapeURIString isAllowedInURI . encodeString
-- | Wrap inlines to line length.
wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc