summaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/CharacterReferences.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/CharacterReferences.hs')
-rw-r--r--src/Text/Pandoc/CharacterReferences.hs327
1 files changed, 327 insertions, 0 deletions
diff --git a/src/Text/Pandoc/CharacterReferences.hs b/src/Text/Pandoc/CharacterReferences.hs
new file mode 100644
index 000000000..b0f4f6019
--- /dev/null
+++ b/src/Text/Pandoc/CharacterReferences.hs
@@ -0,0 +1,327 @@
+{-
+Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.CharacterReferences
+ Copyright : Copyright (C) 2006-7 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Functions for parsing character references.
+-}
+module Text.Pandoc.CharacterReferences (
+ characterReference,
+ decodeCharacterReferences,
+ ) where
+import Data.Char ( chr )
+import Text.ParserCombinators.Parsec
+import qualified Data.Map as Map
+
+-- | Parse character entity.
+characterReference :: GenParser Char st Char
+characterReference = try $ do
+ char '&'
+ character <- numRef <|> entity
+ char ';'
+ return character
+
+numRef :: GenParser Char st Char
+numRef = do
+ char '#'
+ num <- hexNum <|> decNum
+ return $ chr $ num
+
+hexNum :: GenParser Char st Int
+hexNum = oneOf "Xx" >> many1 hexDigit >>= return . read . ("0x" ++)
+
+decNum :: GenParser Char st Int
+decNum = many1 digit >>= return . read
+
+entity :: GenParser Char st Char
+entity = do
+ body <- many1 alphaNum
+ return $ Map.findWithDefault '?' body entityTable
+
+-- | Convert entities in a string to characters.
+decodeCharacterReferences :: String -> String
+decodeCharacterReferences str =
+ case parse (many (characterReference <|> anyChar)) str str of
+ Left err -> error $ "\nError: " ++ show err
+ Right result -> result
+
+entityTable :: Map.Map String Char
+entityTable = Map.fromList entityTableList
+
+entityTableList :: [(String, Char)]
+entityTableList = [
+ ("quot", chr 34),
+ ("amp", chr 38),
+ ("lt", chr 60),
+ ("gt", chr 62),
+ ("nbsp", chr 160),
+ ("iexcl", chr 161),
+ ("cent", chr 162),
+ ("pound", chr 163),
+ ("curren", chr 164),
+ ("yen", chr 165),
+ ("brvbar", chr 166),
+ ("sect", chr 167),
+ ("uml", chr 168),
+ ("copy", chr 169),
+ ("ordf", chr 170),
+ ("laquo", chr 171),
+ ("not", chr 172),
+ ("shy", chr 173),
+ ("reg", chr 174),
+ ("macr", chr 175),
+ ("deg", chr 176),
+ ("plusmn", chr 177),
+ ("sup2", chr 178),
+ ("sup3", chr 179),
+ ("acute", chr 180),
+ ("micro", chr 181),
+ ("para", chr 182),
+ ("middot", chr 183),
+ ("cedil", chr 184),
+ ("sup1", chr 185),
+ ("ordm", chr 186),
+ ("raquo", chr 187),
+ ("frac14", chr 188),
+ ("frac12", chr 189),
+ ("frac34", chr 190),
+ ("iquest", chr 191),
+ ("Agrave", chr 192),
+ ("Aacute", chr 193),
+ ("Acirc", chr 194),
+ ("Atilde", chr 195),
+ ("Auml", chr 196),
+ ("Aring", chr 197),
+ ("AElig", chr 198),
+ ("Ccedil", chr 199),
+ ("Egrave", chr 200),
+ ("Eacute", chr 201),
+ ("Ecirc", chr 202),
+ ("Euml", chr 203),
+ ("Igrave", chr 204),
+ ("Iacute", chr 205),
+ ("Icirc", chr 206),
+ ("Iuml", chr 207),
+ ("ETH", chr 208),
+ ("Ntilde", chr 209),
+ ("Ograve", chr 210),
+ ("Oacute", chr 211),
+ ("Ocirc", chr 212),
+ ("Otilde", chr 213),
+ ("Ouml", chr 214),
+ ("times", chr 215),
+ ("Oslash", chr 216),
+ ("Ugrave", chr 217),
+ ("Uacute", chr 218),
+ ("Ucirc", chr 219),
+ ("Uuml", chr 220),
+ ("Yacute", chr 221),
+ ("THORN", chr 222),
+ ("szlig", chr 223),
+ ("agrave", chr 224),
+ ("aacute", chr 225),
+ ("acirc", chr 226),
+ ("atilde", chr 227),
+ ("auml", chr 228),
+ ("aring", chr 229),
+ ("aelig", chr 230),
+ ("ccedil", chr 231),
+ ("egrave", chr 232),
+ ("eacute", chr 233),
+ ("ecirc", chr 234),
+ ("euml", chr 235),
+ ("igrave", chr 236),
+ ("iacute", chr 237),
+ ("icirc", chr 238),
+ ("iuml", chr 239),
+ ("eth", chr 240),
+ ("ntilde", chr 241),
+ ("ograve", chr 242),
+ ("oacute", chr 243),
+ ("ocirc", chr 244),
+ ("otilde", chr 245),
+ ("ouml", chr 246),
+ ("divide", chr 247),
+ ("oslash", chr 248),
+ ("ugrave", chr 249),
+ ("uacute", chr 250),
+ ("ucirc", chr 251),
+ ("uuml", chr 252),
+ ("yacute", chr 253),
+ ("thorn", chr 254),
+ ("yuml", chr 255),
+ ("OElig", chr 338),
+ ("oelig", chr 339),
+ ("Scaron", chr 352),
+ ("scaron", chr 353),
+ ("Yuml", chr 376),
+ ("fnof", chr 402),
+ ("circ", chr 710),
+ ("tilde", chr 732),
+ ("Alpha", chr 913),
+ ("Beta", chr 914),
+ ("Gamma", chr 915),
+ ("Delta", chr 916),
+ ("Epsilon", chr 917),
+ ("Zeta", chr 918),
+ ("Eta", chr 919),
+ ("Theta", chr 920),
+ ("Iota", chr 921),
+ ("Kappa", chr 922),
+ ("Lambda", chr 923),
+ ("Mu", chr 924),
+ ("Nu", chr 925),
+ ("Xi", chr 926),
+ ("Omicron", chr 927),
+ ("Pi", chr 928),
+ ("Rho", chr 929),
+ ("Sigma", chr 931),
+ ("Tau", chr 932),
+ ("Upsilon", chr 933),
+ ("Phi", chr 934),
+ ("Chi", chr 935),
+ ("Psi", chr 936),
+ ("Omega", chr 937),
+ ("alpha", chr 945),
+ ("beta", chr 946),
+ ("gamma", chr 947),
+ ("delta", chr 948),
+ ("epsilon", chr 949),
+ ("zeta", chr 950),
+ ("eta", chr 951),
+ ("theta", chr 952),
+ ("iota", chr 953),
+ ("kappa", chr 954),
+ ("lambda", chr 955),
+ ("mu", chr 956),
+ ("nu", chr 957),
+ ("xi", chr 958),
+ ("omicron", chr 959),
+ ("pi", chr 960),
+ ("rho", chr 961),
+ ("sigmaf", chr 962),
+ ("sigma", chr 963),
+ ("tau", chr 964),
+ ("upsilon", chr 965),
+ ("phi", chr 966),
+ ("chi", chr 967),
+ ("psi", chr 968),
+ ("omega", chr 969),
+ ("thetasym", chr 977),
+ ("upsih", chr 978),
+ ("piv", chr 982),
+ ("ensp", chr 8194),
+ ("emsp", chr 8195),
+ ("thinsp", chr 8201),
+ ("zwnj", chr 8204),
+ ("zwj", chr 8205),
+ ("lrm", chr 8206),
+ ("rlm", chr 8207),
+ ("ndash", chr 8211),
+ ("mdash", chr 8212),
+ ("lsquo", chr 8216),
+ ("rsquo", chr 8217),
+ ("sbquo", chr 8218),
+ ("ldquo", chr 8220),
+ ("rdquo", chr 8221),
+ ("bdquo", chr 8222),
+ ("dagger", chr 8224),
+ ("Dagger", chr 8225),
+ ("bull", chr 8226),
+ ("hellip", chr 8230),
+ ("permil", chr 8240),
+ ("prime", chr 8242),
+ ("Prime", chr 8243),
+ ("lsaquo", chr 8249),
+ ("rsaquo", chr 8250),
+ ("oline", chr 8254),
+ ("frasl", chr 8260),
+ ("euro", chr 8364),
+ ("image", chr 8465),
+ ("weierp", chr 8472),
+ ("real", chr 8476),
+ ("trade", chr 8482),
+ ("alefsym", chr 8501),
+ ("larr", chr 8592),
+ ("uarr", chr 8593),
+ ("rarr", chr 8594),
+ ("darr", chr 8595),
+ ("harr", chr 8596),
+ ("crarr", chr 8629),
+ ("lArr", chr 8656),
+ ("uArr", chr 8657),
+ ("rArr", chr 8658),
+ ("dArr", chr 8659),
+ ("hArr", chr 8660),
+ ("forall", chr 8704),
+ ("part", chr 8706),
+ ("exist", chr 8707),
+ ("empty", chr 8709),
+ ("nabla", chr 8711),
+ ("isin", chr 8712),
+ ("notin", chr 8713),
+ ("ni", chr 8715),
+ ("prod", chr 8719),
+ ("sum", chr 8721),
+ ("minus", chr 8722),
+ ("lowast", chr 8727),
+ ("radic", chr 8730),
+ ("prop", chr 8733),
+ ("infin", chr 8734),
+ ("ang", chr 8736),
+ ("and", chr 8743),
+ ("or", chr 8744),
+ ("cap", chr 8745),
+ ("cup", chr 8746),
+ ("int", chr 8747),
+ ("there4", chr 8756),
+ ("sim", chr 8764),
+ ("cong", chr 8773),
+ ("asymp", chr 8776),
+ ("ne", chr 8800),
+ ("equiv", chr 8801),
+ ("le", chr 8804),
+ ("ge", chr 8805),
+ ("sub", chr 8834),
+ ("sup", chr 8835),
+ ("nsub", chr 8836),
+ ("sube", chr 8838),
+ ("supe", chr 8839),
+ ("oplus", chr 8853),
+ ("otimes", chr 8855),
+ ("perp", chr 8869),
+ ("sdot", chr 8901),
+ ("lceil", chr 8968),
+ ("rceil", chr 8969),
+ ("lfloor", chr 8970),
+ ("rfloor", chr 8971),
+ ("lang", chr 9001),
+ ("rang", chr 9002),
+ ("loz", chr 9674),
+ ("spades", chr 9824),
+ ("clubs", chr 9827),
+ ("hearts", chr 9829),
+ ("diams", chr 9830)
+ ]