summary | refs | log | tree | commit | diff
path: root/src/Text/Pandoc/HtmlEntities.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/HtmlEntities.hs')
-rw-r--r-- src/Text/Pandoc/HtmlEntities.hs 306
1 files changed, 306 insertions, 0 deletions
diff --git a/src/Text/Pandoc/HtmlEntities.hs b/src/Text/Pandoc/HtmlEntities.hs
new file mode 100644
index 000000000..bbb438ef5
--- /dev/null
+++ b/src/Text/Pandoc/HtmlEntities.hs
@@ -0,0 +1,306 @@
+-- | Functions for encoding unicode characters as HTML entity
+-- references, and vice versa.
+module Text.Pandoc.HtmlEntities (
+ htmlEntityToChar,
+ charToHtmlEntity,
+ decodeEntities,
+ encodeEntities
+ ) where
+import Char ( chr, ord )
+import Text.Regex ( Regex, mkRegex, matchRegexAll )
+import Maybe ( fromMaybe )
+
+-- Regular expressions for entity references.
+
+-- | Matches a decimal numeric character reference such as @&#169;@ and
+-- captures its digits as the only subexpression.
+decimalCodedEntity :: Regex
+decimalCodedEntity = mkRegex "&#([0-9]+);"
+
+-- | Matches either a decimal numeric reference or a named reference made
+-- of ASCII letters and digits. Hexadecimal references (@&#x...;@) are not
+-- matched by either alternative.
+characterEntity :: Regex
+characterEntity = mkRegex "&#[0-9]+;|&[A-Za-z0-9]+;"
+
+-- | Decode every entity reference in the input that 'htmlEntityToChar'
+-- recognises; references it does not recognise are copied through unchanged.
+decodeEntities :: String -> String
+decodeEntities input = maybe input expand (matchRegexAll characterEntity input)
+  where
+    -- Emit the text before the first reference, then the decoded (or
+    -- untouched) reference, then recurse on whatever follows it.
+    expand (prefix, ref, suffix, _) =
+      prefix ++ maybe ref (: []) (htmlEntityToChar ref) ++ decodeEntities suffix
+
+-- | Replace each character that has an entity reference in the table with
+-- that reference; every other character is kept as it is.
+encodeEntities :: String -> String
+encodeEntities str = concat [ maybe [c] id (charToHtmlEntity c) | c <- str ]
+
+-- | If the string is a valid entity reference, returns @Just@ the character,
+-- otherwise @Nothing@.
+--
+-- Named references are looked up in 'htmlEntityTable'. Decimal references
+-- (@&#NNN;@) must span the whole string and denote a code point no larger
+-- than @maxBound :: Char@; anything else yields @Nothing@ instead of
+-- raising an error inside 'chr'.
+htmlEntityToChar :: String -> Maybe Char
+htmlEntityToChar entity =
+  case lookup entity htmlEntityTable of
+    Just ch -> Just ch
+    Nothing -> decimalReference
+  where
+    decimalReference =
+      case matchRegexAll decimalCodedEntity entity of
+        -- Empty prefix and suffix: the regex is unanchored, so insist that
+        -- the reference is the entire input.
+        Just ("", _, "", [digits]) -> codePoint (read digits)
+        _                          -> Nothing
+    -- Work in Integer so that very long digit strings cannot wrap around
+    -- into a valid-looking Int before the range check.
+    codePoint :: Integer -> Maybe Char
+    codePoint n
+      | n <= toInteger (ord maxBound) = Just (chr (fromInteger n))
+      | otherwise                     = Nothing
+
+-- | If there is an entity reference corresponding to the character, returns
+-- @Just@ the entity reference, otherwise @Nothing@.
+--
+-- When several table entries share a character, the first one wins.
+charToHtmlEntity :: Char -> Maybe String
+charToHtmlEntity char =
+  -- The comprehension is lazy, so the scan stops at the first hit instead
+  -- of walking the whole table just to measure the list of matches.
+  case [entity | (entity, character) <- htmlEntityTable, character == char] of
+    (entity : _) -> Just entity
+    []           -> Nothing
+
+-- | Association list pairing each HTML 4 named character reference
+-- (written in full, with the leading @&@ and trailing @;@) with the
+-- character it denotes. Keys must stay in that form because
+-- 'htmlEntityToChar' looks up the complete reference text.
+htmlEntityTable :: [(String, Char)]
+htmlEntityTable = [
+  ("&quot;", chr 34),
+  ("&amp;", chr 38),
+  ("&lt;", chr 60),
+  ("&gt;", chr 62),
+  ("&nbsp;", chr 160),
+  ("&iexcl;", chr 161),
+  ("&cent;", chr 162),
+  ("&pound;", chr 163),
+  ("&curren;", chr 164),
+  ("&yen;", chr 165),
+  ("&brvbar;", chr 166),
+  ("&sect;", chr 167),
+  ("&uml;", chr 168),
+  ("&copy;", chr 169),
+  ("&ordf;", chr 170),
+  ("&laquo;", chr 171),
+  ("&not;", chr 172),
+  ("&shy;", chr 173),
+  ("&reg;", chr 174),
+  ("&macr;", chr 175),
+  ("&deg;", chr 176),
+  ("&plusmn;", chr 177),
+  ("&sup2;", chr 178),
+  ("&sup3;", chr 179),
+  ("&acute;", chr 180),
+  ("&micro;", chr 181),
+  ("&para;", chr 182),
+  ("&middot;", chr 183),
+  ("&cedil;", chr 184),
+  ("&sup1;", chr 185),
+  ("&ordm;", chr 186),
+  ("&raquo;", chr 187),
+  ("&frac14;", chr 188),
+  ("&frac12;", chr 189),
+  ("&frac34;", chr 190),
+  ("&iquest;", chr 191),
+  ("&Agrave;", chr 192),
+  ("&Aacute;", chr 193),
+  ("&Acirc;", chr 194),
+  ("&Atilde;", chr 195),
+  ("&Auml;", chr 196),
+  ("&Aring;", chr 197),
+  ("&AElig;", chr 198),
+  ("&Ccedil;", chr 199),
+  ("&Egrave;", chr 200),
+  ("&Eacute;", chr 201),
+  ("&Ecirc;", chr 202),
+  ("&Euml;", chr 203),
+  ("&Igrave;", chr 204),
+  ("&Iacute;", chr 205),
+  ("&Icirc;", chr 206),
+  ("&Iuml;", chr 207),
+  ("&ETH;", chr 208),
+  ("&Ntilde;", chr 209),
+  ("&Ograve;", chr 210),
+  ("&Oacute;", chr 211),
+  ("&Ocirc;", chr 212),
+  ("&Otilde;", chr 213),
+  ("&Ouml;", chr 214),
+  ("&times;", chr 215),
+  ("&Oslash;", chr 216),
+  ("&Ugrave;", chr 217),
+  ("&Uacute;", chr 218),
+  ("&Ucirc;", chr 219),
+  ("&Uuml;", chr 220),
+  ("&Yacute;", chr 221),
+  ("&THORN;", chr 222),
+  ("&szlig;", chr 223),
+  ("&agrave;", chr 224),
+  ("&aacute;", chr 225),
+  ("&acirc;", chr 226),
+  ("&atilde;", chr 227),
+  ("&auml;", chr 228),
+  ("&aring;", chr 229),
+  ("&aelig;", chr 230),
+  ("&ccedil;", chr 231),
+  ("&egrave;", chr 232),
+  ("&eacute;", chr 233),
+  ("&ecirc;", chr 234),
+  ("&euml;", chr 235),
+  ("&igrave;", chr 236),
+  ("&iacute;", chr 237),
+  ("&icirc;", chr 238),
+  ("&iuml;", chr 239),
+  ("&eth;", chr 240),
+  ("&ntilde;", chr 241),
+  ("&ograve;", chr 242),
+  ("&oacute;", chr 243),
+  ("&ocirc;", chr 244),
+  ("&otilde;", chr 245),
+  ("&ouml;", chr 246),
+  ("&divide;", chr 247),
+  ("&oslash;", chr 248),
+  ("&ugrave;", chr 249),
+  ("&uacute;", chr 250),
+  ("&ucirc;", chr 251),
+  ("&uuml;", chr 252),
+  ("&yacute;", chr 253),
+  ("&thorn;", chr 254),
+  ("&yuml;", chr 255),
+  ("&OElig;", chr 338),
+  ("&oelig;", chr 339),
+  ("&Scaron;", chr 352),
+  ("&scaron;", chr 353),
+  ("&Yuml;", chr 376),
+  ("&fnof;", chr 402),
+  ("&circ;", chr 710),
+  ("&tilde;", chr 732),
+  ("&Alpha;", chr 913),
+  ("&Beta;", chr 914),
+  ("&Gamma;", chr 915),
+  ("&Delta;", chr 916),
+  ("&Epsilon;", chr 917),
+  ("&Zeta;", chr 918),
+  ("&Eta;", chr 919),
+  ("&Theta;", chr 920),
+  ("&Iota;", chr 921),
+  ("&Kappa;", chr 922),
+  ("&Lambda;", chr 923),
+  ("&Mu;", chr 924),
+  ("&Nu;", chr 925),
+  ("&Xi;", chr 926),
+  ("&Omicron;", chr 927),
+  ("&Pi;", chr 928),
+  ("&Rho;", chr 929),
+  ("&Sigma;", chr 931),
+  ("&Tau;", chr 932),
+  ("&Upsilon;", chr 933),
+  ("&Phi;", chr 934),
+  ("&Chi;", chr 935),
+  ("&Psi;", chr 936),
+  ("&Omega;", chr 937),
+  ("&alpha;", chr 945),
+  ("&beta;", chr 946),
+  ("&gamma;", chr 947),
+  ("&delta;", chr 948),
+  ("&epsilon;", chr 949),
+  ("&zeta;", chr 950),
+  ("&eta;", chr 951),
+  ("&theta;", chr 952),
+  ("&iota;", chr 953),
+  ("&kappa;", chr 954),
+  ("&lambda;", chr 955),
+  ("&mu;", chr 956),
+  ("&nu;", chr 957),
+  ("&xi;", chr 958),
+  ("&omicron;", chr 959),
+  ("&pi;", chr 960),
+  ("&rho;", chr 961),
+  ("&sigmaf;", chr 962),
+  ("&sigma;", chr 963),
+  ("&tau;", chr 964),
+  ("&upsilon;", chr 965),
+  ("&phi;", chr 966),
+  ("&chi;", chr 967),
+  ("&psi;", chr 968),
+  ("&omega;", chr 969),
+  ("&thetasym;", chr 977),
+  ("&upsih;", chr 978),
+  ("&piv;", chr 982),
+  ("&ensp;", chr 8194),
+  ("&emsp;", chr 8195),
+  ("&thinsp;", chr 8201),
+  ("&zwnj;", chr 8204),
+  ("&zwj;", chr 8205),
+  ("&lrm;", chr 8206),
+  ("&rlm;", chr 8207),
+  ("&ndash;", chr 8211),
+  ("&mdash;", chr 8212),
+  ("&lsquo;", chr 8216),
+  ("&rsquo;", chr 8217),
+  ("&sbquo;", chr 8218),
+  ("&ldquo;", chr 8220),
+  ("&rdquo;", chr 8221),
+  ("&bdquo;", chr 8222),
+  ("&dagger;", chr 8224),
+  ("&Dagger;", chr 8225),
+  ("&bull;", chr 8226),
+  ("&hellip;", chr 8230),
+  ("&permil;", chr 8240),
+  ("&prime;", chr 8242),
+  ("&Prime;", chr 8243),
+  ("&lsaquo;", chr 8249),
+  ("&rsaquo;", chr 8250),
+  ("&oline;", chr 8254),
+  ("&frasl;", chr 8260),
+  ("&euro;", chr 8364),
+  ("&image;", chr 8465),
+  ("&weierp;", chr 8472),
+  ("&real;", chr 8476),
+  ("&trade;", chr 8482),
+  ("&alefsym;", chr 8501),
+  ("&larr;", chr 8592),
+  ("&uarr;", chr 8593),
+  ("&rarr;", chr 8594),
+  ("&darr;", chr 8595),
+  ("&harr;", chr 8596),
+  ("&crarr;", chr 8629),
+  ("&lArr;", chr 8656),
+  ("&uArr;", chr 8657),
+  ("&rArr;", chr 8658),
+  ("&dArr;", chr 8659),
+  ("&hArr;", chr 8660),
+  ("&forall;", chr 8704),
+  ("&part;", chr 8706),
+  ("&exist;", chr 8707),
+  ("&empty;", chr 8709),
+  ("&nabla;", chr 8711),
+  ("&isin;", chr 8712),
+  ("&notin;", chr 8713),
+  ("&ni;", chr 8715),
+  ("&prod;", chr 8719),
+  ("&sum;", chr 8721),
+  ("&minus;", chr 8722),
+  ("&lowast;", chr 8727),
+  ("&radic;", chr 8730),
+  ("&prop;", chr 8733),
+  ("&infin;", chr 8734),
+  ("&ang;", chr 8736),
+  ("&and;", chr 8743),
+  ("&or;", chr 8744),
+  ("&cap;", chr 8745),
+  ("&cup;", chr 8746),
+  ("&int;", chr 8747),
+  ("&there4;", chr 8756),
+  ("&sim;", chr 8764),
+  ("&cong;", chr 8773),
+  ("&asymp;", chr 8776),
+  ("&ne;", chr 8800),
+  ("&equiv;", chr 8801),
+  ("&le;", chr 8804),
+  ("&ge;", chr 8805),
+  ("&sub;", chr 8834),
+  ("&sup;", chr 8835),
+  ("&nsub;", chr 8836),
+  ("&sube;", chr 8838),
+  ("&supe;", chr 8839),
+  ("&oplus;", chr 8853),
+  ("&otimes;", chr 8855),
+  ("&perp;", chr 8869),
+  ("&sdot;", chr 8901),
+  ("&lceil;", chr 8968),
+  ("&rceil;", chr 8969),
+  ("&lfloor;", chr 8970),
+  ("&rfloor;", chr 8971),
+  ("&lang;", chr 9001),
+  ("&rang;", chr 9002),
+  ("&loz;", chr 9674),
+  ("&spades;", chr 9824),
+  ("&clubs;", chr 9827),
+  ("&hearts;", chr 9829),
+  ("&diams;", chr 9830)
+  ]