summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Parsing.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Parsing.hs')
-rw-r--r--src/Text/Pandoc/Parsing.hs44
1 files changed, 33 insertions, 11 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 9ce064f91..eaf0c0f67 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -33,6 +33,7 @@ module Text.Pandoc.Parsing ( (>>~),
notFollowedBy',
oneOfStrings,
spaceChar,
+ nonspaceChar,
skipSpaces,
blankline,
blanklines,
@@ -78,7 +79,7 @@ import Text.Pandoc.Generic
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
import Text.ParserCombinators.Parsec
import Text.Pandoc.CharacterReferences ( characterReference )
-import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit )
+import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation )
import Data.List ( intercalate, transpose )
import Network.URI ( parseURI, URI (..), isAllowedInURI )
import Control.Monad ( join, liftM, guard )
@@ -122,6 +123,10 @@ oneOfStrings listOfStrings = choice $ map (try . string) listOfStrings
spaceChar :: CharParser st Char
spaceChar = satisfy $ \c -> c == ' ' || c == '\t'
+-- | Parses a nonspace, nonnewline character.
+nonspaceChar :: CharParser st Char
+nonspaceChar = satisfy $ \x -> x /= '\t' && x /= '\n' && x /= ' ' && x /= '\r'
+
-- | Skips zero or more spaces or tabs.
skipSpaces :: GenParser Char st ()
skipSpaces = skipMany spaceChar
@@ -264,8 +269,24 @@ uri = try $ do
let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
"news:", "telnet:" ]
lookAhead $ oneOfStrings protocols
- -- scan non-ascii characters and ascii characters allowed in a URI
- str <- many1 $ satisfy (\c -> not (isAscii c) || isAllowedInURI c)
+ -- Scan non-ascii characters and ascii characters allowed in a URI.
+ -- We allow punctuation except when followed by a space, since
+ -- we don't want the trailing '.' in 'http://google.com.'
+ let innerPunct = try $ satisfy isPunctuation >>~
+ notFollowedBy (newline <|> spaceChar)
+ let uriChar = innerPunct <|>
+ satisfy (\c -> not (isPunctuation c) &&
+ (not (isAscii c) || isAllowedInURI c))
+ -- We want to allow
+ -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
+ -- as a URL, while NOT picking up the closing paren in
+ -- (http://wikipedia.org)
+ -- So we include balanced parens in the URL.
+ let inParens = try $ do char '('
+ res <- many uriChar
+ char ')'
+ return $ '(' : res ++ ")"
+ str <- liftM concat $ many1 $ inParens <|> count 1 (innerPunct <|> uriChar)
-- now see if they amount to an absolute URI
case parseURI (escapeURI str) of
Just uri' -> if uriScheme uri' `elem` protocols
@@ -742,8 +763,9 @@ charOrRef cs =
singleQuoteStart :: GenParser Char ParserState ()
singleQuoteStart = do
failIfInQuoteContext InSingleQuote
- try $ do charOrRef "'\8216"
- notFollowedBy (oneOf ")!],.;:-? \t\n")
+ try $ do charOrRef "'\8216\145"
+ notFollowedBy (oneOf ")!],;:-? \t\n")
+ notFollowedBy (char '.') <|> lookAhead (string "..." >> return ())
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
satisfy (not . isAlphaNum)))
-- possess/contraction
@@ -751,23 +773,23 @@ singleQuoteStart = do
singleQuoteEnd :: GenParser Char st ()
singleQuoteEnd = try $ do
- charOrRef "'\8217"
+ charOrRef "'\8217\146"
notFollowedBy alphaNum
doubleQuoteStart :: GenParser Char ParserState ()
doubleQuoteStart = do
failIfInQuoteContext InDoubleQuote
- try $ do charOrRef "\"\8220"
+ try $ do charOrRef "\"\8220\147"
notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n'))
doubleQuoteEnd :: GenParser Char st ()
doubleQuoteEnd = do
- charOrRef "\"\8221"
+ charOrRef "\"\8221\148"
return ()
ellipses :: GenParser Char st Inline
ellipses = do
- try (charOrRef "…") <|> try (string "..." >> return '…')
+ try (charOrRef "…\133") <|> try (string "..." >> return '…')
return Ellipses
dash :: GenParser Char st Inline
@@ -775,13 +797,13 @@ dash = enDash <|> emDash
enDash :: GenParser Char st Inline
enDash = do
- try (charOrRef "–") <|>
+ try (charOrRef "–\150") <|>
try (char '-' >> lookAhead (satisfy isDigit) >> return '–')
return EnDash
emDash :: GenParser Char st Inline
emDash = do
- try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—')
+ try (charOrRef "—\151") <|> (try $ string "--" >> optional (char '-') >> return '—')
return EmDash
--