diff options
Diffstat (limited to 'src/Text/Pandoc/XML.hs')
-rw-r--r-- | src/Text/Pandoc/XML.hs | 25 |
1 files changed, 24 insertions, 1 deletions
diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs
index e21525018..7a1c8bdd8 100644
--- a/src/Text/Pandoc/XML.hs
+++ b/src/Text/Pandoc/XML.hs
@@ -33,9 +33,13 @@ module Text.Pandoc.XML ( stripTags,
                          inTags,
                          selfClosingTag,
                          inTagsSimple,
-                         inTagsIndented ) where
+                         inTagsIndented,
+                         toEntities,
+                         fromEntities ) where
 
 import Text.Pandoc.Pretty
+import Data.Char (ord, isAscii)
+import Text.HTML.TagSoup.Entity (lookupEntity)
 
 -- | Remove everything between <...>
 stripTags :: String -> String
@@ -89,3 +93,22 @@ inTagsSimple tagType = inTags False tagType []
 -- | Put the supplied contents in indented block btw start and end tags.
 inTagsIndented :: String -> Doc -> Doc
 inTagsIndented tagType = inTags True tagType []
+
+-- | Escape every non-ASCII character as a decimal numeric reference (@&#NNN;@); ASCII passes through.
+toEntities :: String -> String
+toEntities [] = ""
+toEntities (c:cs)
+  | isAscii c = c : toEntities cs
+  | otherwise = "&#" ++ show (ord c) ++ ";" ++ toEntities cs
+
+-- | Replace each @&ent;@ that 'lookupEntity' recognises by its character; all other text is kept verbatim.
+fromEntities :: String -> String
+fromEntities ('&':xs) =
+  case lookupEntity ent of
+        Just c  -> c : fromEntities rest
+        Nothing -> '&' : fromEntities xs  -- not an entity: resume right after '&' so no text is dropped
+    where (ent, rest) = case break (==';') xs of
+                             (zs,';':ys) -> (zs,ys)   -- candidate name and the text after its ';'
+                             _           -> ("",xs)   -- no ';' at all: nothing to look up
+fromEntities (x:xs) = x : fromEntities xs
+fromEntities []     = []