diff options
Diffstat (limited to 'icu/translit_Any_Latin1.txt')
-rw-r--r-- | icu/translit_Any_Latin1.txt | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/icu/translit_Any_Latin1.txt b/icu/translit_Any_Latin1.txt
new file mode 100644
index 0000000..d049845
--- /dev/null
+++ b/icu/translit_Any_Latin1.txt
@@ -0,0 +1,89 @@
+// Any_Latin1: transliterator rules that fold text into the Latin-1 range (U+0000-U+00FF).
+// The conversion rules are tried in order, so the first rule matching a code point wins.
+translit_Any_Latin1 {
+    Rule {
+
+        // Normalize to NFKC, then decompose (NFD) every code point outside Latin-1
+        ":: NFKC ();"
+        ":: [^\u0000-\u00FF] NFD ();"
+
+        // Remove non-spacing marks (general category Mn)
+        "[:Mn:] > ;"
+
+        // Map selected non-Latin-1 code points to similar characters in the Latin-1 range
+        " [\u2000-\u200A\u3000] > ' ' ;"
+        " [\u01C3\u2762]> '!' ;"
+        " \u203C > '!!' ;"
+        " [\u203D\u2048] > '?!' ;"
+        " [\u02BA\u030B\u030E\u2033\u3003\u201C-\u201F] > '\"' ;"
+        " [\u066A\u2030\u2031] > '%' ;"
+        " [\u02B9\u02BC\u02C8\u0301\u2032\u2018-\u201B] > '' ;"
+        " [\u066D\u2217\u2731] > '*' ;"
+        " [\u060C\u201A\u3001] > ',' ;"
+        " [\u2010-\u2013\u2212] > '-' ;"
+        " [\u2023\u06D4\u3002\u0589] > '.' ;"
+        " [\u0338\u2044\u2215] > '/' ;"
+        " \u2236 > ':' ;"
+        " \u061B > ';' ;"
+        " [\u2039\u2329\u3008] > '<' ;"
+        " \u2261 > '=' ;"
+        " [\u203A\u232A\u3009] > '>' ;"
+        " [\u037E\u061F] > '?' ;"
+        " \u2049 > '!?' ;"
+        " \u212C > B ;"
+        " [\u2102\u212D] > C ;"
+        " [\u2107\u2130] > E ;"
+        " [\u2131\u2132] > F ;"
+        " [\u210B\u210C\u210D] > H ;"
+        " [\u2110\u2111\u2160] > I ;"
+        " \u212A > K ;"
+        " \u2112 > L ;"
+        " \u2133 > M ;"    // U+2133 is SCRIPT CAPITAL M
+        " \u2115 > N ;"
+        " \u2119 > P ;"
+        " \u211A > Q ;"
+        " [\u211B\u211C\u211D] > R ;"
+        " [\u2124\u2128] > Z ;"
+        " \u2216 > '\\' ;"
+        " [\u02C4\u02C6\u0302\u2303] > '^' ;"
+        " [\u02CD\u0331\u0332\u2017] > '_' ;"
+        " [\u02CB\u0300\u2035] > '`' ;"
+        " [\u212E\u212F] > e ;"
+        " [\u0261\u210A] > g ;"
+        " [\u04BB\u210E] > h ;"
+        " \u0131 > i ;"
+        " \u207F > n ;"
+        " \u2134 > o ;"
+        " \u01B6 > z ;"
+        " [\u01C0\u2223\u2758] > '|' ;"
+        " [\u02DC\u0303\u223C\uFF5E] > '~' ;"
+        " \u202F > \u00A0 ;"
+        " \uFFFD > \u001A;"
+        " \u20A4 > \u00A3 ;"
+        " [\u20A0-\u20AF] > \u00A4 ;"
+        " \u0308 > \u00A8 ;"
+        " \u2117 > \u00A9 ;"
+        " [\u226A\u300A] > \u00AB ;"
+        " \u2310 > \u00AC ;"
+        " \u1806 > \u00AD ;"
+        " [\u02C9\u0304\u0305] > \u00AF ;"
+        " [\u02DA\u030A\u2070\u2218] > \u00B0 ;"
+        " \u2213 > \u00B1 ;"
+        " [\u02B9\u02CA\u0301\u2032] > \u00B4 ;"
+        " [\u204B\u2761] > \u00B6 ;"
+        " [\u2022\u2024\u2027\u2219\u22C5\u30FB] > \u00B7 ;"
+        " \u0327 > \u00B8 ;"
+        " [\u226B\u300B] > \u00BB ;"
+        " \u2014 > '--';"
+        " \u2015 > '---';"
+        " \u2025 > '..';"     // U+2025 TWO DOT LEADER (U+2024 ONE DOT LEADER is covered by the \u00B7 rule above)
+        " \u2026 > '...';"    // U+2026 HORIZONTAL ELLIPSIS
+        " \u2016 > '||';"
+        " \uFEFF > ;"
+
+        // Fallback: replace every code point still outside Latin-1 with a space
+        "[^\u0000-\u00FF] > ' ';"
+
+    }
+}
+