diff options
Diffstat (limited to 'src/Text/Pandoc/Parsing.hs')
-rw-r--r-- | src/Text/Pandoc/Parsing.hs | 44 |
1 files changed, 33 insertions, 11 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 9ce064f91..eaf0c0f67 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -33,6 +33,7 @@ module Text.Pandoc.Parsing ( (>>~), notFollowedBy', oneOfStrings, spaceChar, + nonspaceChar, skipSpaces, blankline, blanklines, @@ -78,7 +79,7 @@ import Text.Pandoc.Generic import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) import Text.ParserCombinators.Parsec import Text.Pandoc.CharacterReferences ( characterReference ) -import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit ) +import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation ) import Data.List ( intercalate, transpose ) import Network.URI ( parseURI, URI (..), isAllowedInURI ) import Control.Monad ( join, liftM, guard ) @@ -122,6 +123,10 @@ oneOfStrings listOfStrings = choice $ map (try . string) listOfStrings spaceChar :: CharParser st Char spaceChar = satisfy $ \c -> c == ' ' || c == '\t' +-- | Parses a nonspace, nonnewline character. +nonspaceChar :: CharParser st Char +nonspaceChar = satisfy $ \x -> x /= '\t' && x /= '\n' && x /= ' ' && x /= '\r' + -- | Skips zero or more spaces or tabs. skipSpaces :: GenParser Char st () skipSpaces = skipMany spaceChar @@ -264,8 +269,24 @@ uri = try $ do let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:", "news:", "telnet:" ] lookAhead $ oneOfStrings protocols - -- scan non-ascii characters and ascii characters allowed in a URI - str <- many1 $ satisfy (\c -> not (isAscii c) || isAllowedInURI c) + -- Scan non-ascii characters and ascii characters allowed in a URI. + -- We allow punctuation except when followed by a space, since + -- we don't want the trailing '.' in 'http://google.com.' + let innerPunct = try $ satisfy isPunctuation >>~ + notFollowedBy (newline <|> spaceChar) + let uriChar = innerPunct <|> + satisfy (\c -> not (isPunctuation c) && + (not (isAscii c) || isAllowedInURI c)) + -- We want to allow + -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) + -- as a URL, while NOT picking up the closing paren in + -- (http://wikipedia.org) + -- So we include balanced parens in the URL. + let inParens = try $ do char '(' + res <- many uriChar + char ')' + return $ '(' : res ++ ")" + str <- liftM concat $ many1 $ inParens <|> count 1 (innerPunct <|> uriChar) -- now see if they amount to an absolute URI case parseURI (escapeURI str) of Just uri' -> if uriScheme uri' `elem` protocols @@ -742,8 +763,9 @@ charOrRef cs = singleQuoteStart :: GenParser Char ParserState () singleQuoteStart = do failIfInQuoteContext InSingleQuote - try $ do charOrRef "'\8216" - notFollowedBy (oneOf ")!],.;:-? \t\n") + try $ do charOrRef "'\8216\145" + notFollowedBy (oneOf ")!],;:-? \t\n") + notFollowedBy (char '.') <|> lookAhead (string "..." >> return ()) notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> satisfy (not . isAlphaNum))) -- possess/contraction @@ -751,23 +773,23 @@ singleQuoteStart = do singleQuoteEnd :: GenParser Char st () singleQuoteEnd = try $ do - charOrRef "'\8217" + charOrRef "'\8217\146" notFollowedBy alphaNum doubleQuoteStart :: GenParser Char ParserState () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote - try $ do charOrRef "\"\8220" + try $ do charOrRef "\"\8220\147" notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n')) doubleQuoteEnd :: GenParser Char st () doubleQuoteEnd = do - charOrRef "\"\8221" + charOrRef "\"\8221\148" return () ellipses :: GenParser Char st Inline ellipses = do - try (charOrRef "…") <|> try (string "..." >> return '…') + try (charOrRef "…\133") <|> try (string "..." >> return '…') return Ellipses dash :: GenParser Char st Inline @@ -775,13 +797,13 @@ dash = enDash <|> emDash enDash :: GenParser Char st Inline enDash = do - try (charOrRef "–") <|> + try (charOrRef "–\150") <|> try (char '-' >> lookAhead (satisfy isDigit) >> return '–') return EnDash emDash :: GenParser Char st Inline emDash = do - try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—') + try (charOrRef "—\151") <|> (try $ string "--" >> optional (char '-') >> return '—') return EmDash -- |