Changed uri parser so it doesn't include trailing punctuation.

So, in RST, 'http://google.com.' should be parsed as a link to 'http://google.com' followed by a period. The parser is smart enough to recognize balanced parentheses, as often occur in wikipedia links: 'http://foo.bar/baz_(bam)'. Also added ()s to RST specialChars, so '(http://google.com)' will be parsed as a link in parens. Added test cases. Resolves Issue #291.
author: John MacFarlane <jgm@berkeley.edu> 2011-03-18 11:27:42 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2011-03-18 11:30:20 -0700
commit: 6beba76f61faa41c4351fa4eb54161c060315a23 (patch)
tree: 62cc4da5965e08f8630a25701769dbb8917532a1 /src/Text/Pandoc/Parsing.hs
parent: d1304e835670a3e025e0f1b5ca3d38dca1ee6d62 (diff)
1 files changed, 19 insertions, 3 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 9ce064f91..187343f9c 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -78,7 +78,7 @@ import Text.Pandoc.Generic
 import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
 import Text.ParserCombinators.Parsec
 import Text.Pandoc.CharacterReferences ( characterReference )
-import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit )
+import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation )
 import Data.List ( intercalate, transpose )
 import Network.URI ( parseURI, URI (..), isAllowedInURI )
 import Control.Monad ( join, liftM, guard )
@@ -264,8 +264,24 @@ uri = try $ do
   let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
                     "news:", "telnet:" ]
   lookAhead $ oneOfStrings protocols
-  -- scan non-ascii characters and ascii characters allowed in a URI
-  str <- many1 $ satisfy (\c -> not (isAscii c) || isAllowedInURI c)
+  -- Scan non-ascii characters and ascii characters allowed in a URI.
+  -- We allow punctuation except when followed by a space, since
+  -- we don't want the trailing '.' in 'http://google.com.'
+  let innerPunct = try $ satisfy isPunctuation >>~
+                         notFollowedBy (newline <|> spaceChar)
+  let uriChar = innerPunct <|>
+                satisfy (\c -> not (isPunctuation c) &&
+                            (not (isAscii c) || isAllowedInURI c))
+  -- We want to allow
+  -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
+  -- as a URL, while NOT picking up the closing paren in
+  -- (http://wikipedia.org)
+  -- So we include balanced parens in the URL.
+  let inParens = try $ do char '('
+                          res <- many uriChar
+                          char ')'
+                          return $ '(' : res ++ ")"
+  str <- liftM concat $ many1 $ inParens <|> count 1 (innerPunct <|> uriChar)
   -- now see if they amount to an absolute URI
   case parseURI (escapeURI str) of
        Just uri' -> if uriScheme uri' `elem` protocols
author	John MacFarlane <jgm@berkeley.edu>	2011-03-18 11:27:42 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2011-03-18 11:30:20 -0700
commit	6beba76f61faa41c4351fa4eb54161c060315a23 (patch)
tree	62cc4da5965e08f8630a25701769dbb8917532a1 /src/Text/Pandoc/Parsing.hs
parent	d1304e835670a3e025e0f1b5ca3d38dca1ee6d62 (diff)