summary refs log tree commit diff
path: root/src/Text/Pandoc/XML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/XML.hs')
-rw-r--r-- src/Text/Pandoc/XML.hs 25
1 files changed, 24 insertions, 1 deletions
diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs
index e21525018..7a1c8bdd8 100644
--- a/src/Text/Pandoc/XML.hs
+++ b/src/Text/Pandoc/XML.hs
@@ -33,9 +33,13 @@ module Text.Pandoc.XML ( stripTags,
inTags,
selfClosingTag,
inTagsSimple,
- inTagsIndented ) where
+ inTagsIndented,
+ toEntities,
+ fromEntities ) where
import Text.Pandoc.Pretty
+import Data.Char (ord, isAscii)
+import Text.HTML.TagSoup.Entity (lookupEntity)
-- | Remove everything between <...>
stripTags :: String -> String
@@ -89,3 +93,22 @@ inTagsSimple tagType = inTags False tagType []
-- | Put the supplied contents in indented block btw start and end tags.
inTagsIndented :: String -> Doc -> Doc
inTagsIndented tagType = inTags True tagType []
+
+-- | Replace each non-ASCII character with a decimal character reference (@&#N;@).
+toEntities :: String -> String
+toEntities = concatMap escapeChar
+  where escapeChar ch
+          | isAscii ch = [ch]
+          | otherwise  = "&#" ++ show (ord ch) ++ ";"
+
+-- | Unescape the XML entities that 'lookupEntity' recognises.  An @&@ that
+-- does not start such an entity is kept, and scanning resumes right after it.
+fromEntities :: String -> String
+fromEntities ('&':xs) = case lookupEntity ent of
+    Just c  -> c : fromEntities rest
+    Nothing -> '&' : fromEntities xs  -- resuming at 'rest' would drop the text up to ';'
+  where (ent, rest) = case break (== ';') xs of
+                        (zs, ';':ys) -> (zs, ys)
+                        _            -> ("", xs)
+fromEntities (x:xs) = x : fromEntities xs
+fromEntities []     = []