summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2015-05-05 12:41:35 -0700
committer John MacFarlane <jgm@berkeley.edu> 2015-05-05 12:42:50 -0700
commit d19a347fd5e93802e80e5bd8e13f29de74a53f3c (patch)
tree b45b219990a1b4c33ba45f3f9d845ad45af14a7a
parent db4d69a8cb20d37265b4017c69baec4270af8841 (diff)
UTF8: Better handling of bare CRs in input files.
Previously we just stripped them out; now we convert other line ending styles to LF line endings. Closes #2132.
-rw-r--r-- src/Text/Pandoc/UTF8.hs 10
1 files changed, 8 insertions, 2 deletions
diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs
index c1bf84995..de3314a0d 100644
--- a/src/Text/Pandoc/UTF8.hs
+++ b/src/Text/Pandoc/UTF8.hs
@@ -93,10 +93,16 @@ dropBOM :: String -> String
dropBOM ('\xFEFF':xs) = xs
dropBOM xs = xs
+filterCRs :: String -> String
+filterCRs ('\r':'\n':xs) = '\n': filterCRs xs
+filterCRs ('\r':xs) = '\n' : filterCRs xs
+filterCRs (x:xs) = x : filterCRs xs
+filterCRs [] = []
+
-- | Convert UTF8-encoded ByteString to String, also
-- converting CRLF and bare CR line endings to LF.
toString :: B.ByteString -> String
-toString = filter (/='\r') . dropBOM . T.unpack . T.decodeUtf8
+toString = filterCRs . dropBOM . T.unpack . T.decodeUtf8
fromString :: String -> B.ByteString
fromString = T.encodeUtf8 . T.pack
@@ -104,7 +110,7 @@ fromString = T.encodeUtf8 . T.pack
-- | Convert UTF8-encoded lazy ByteString to String, also
-- converting CRLF and bare CR line endings to LF.
toStringLazy :: BL.ByteString -> String
-toStringLazy = filter (/='\r') . dropBOM . TL.unpack . TL.decodeUtf8
+toStringLazy = filterCRs . dropBOM . TL.unpack . TL.decodeUtf8
fromStringLazy :: String -> BL.ByteString
fromStringLazy = TL.encodeUtf8 . TL.pack