summaryrefslogtreecommitdiff
path: root/icu/translit_Any_Latin1.txt
blob: d049845859ca9838c9f5f210d9432a3daab1ab39 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
// Any_Latin1

// Transliterator rule set that reduces text to the Latin-1 range
// (U+0000-U+00FF): normalize, drop non-spacing marks, map look-alike
// characters to Latin-1 equivalents, then replace whatever is left with a space.
// Rules are listed in priority order; when two rules name the same codepoint
// the one listed first is the one that applies.
translit_Any_Latin1 {
  Rule {

  //NFKC everything; then NFD every codepoint that is still outside Latin-1
  ":: NFKC ();"
  ":: [^\u0000-\u00FF] NFD ();"

  //remove non-spacing marks
  "[:Mn:] > ;"

  //change some non-Latin-1 codepoints to similar characters in Latin-1 range
  " [\u2000-\u200A\u3000] > ' ' ;"
  " [\u01C3\u2762]> '!' ;"
  " \u203C > '!!' ;"
  " [\u203D\u2048] > '?!' ;"
  " [\u02BA\u030B\u030E\u2033\u3003\u201C-\u201F] > '\"' ;"
  " [\u066A\u2030\u2031] > '%' ;"
  // '' is the rule-syntax escape for one literal apostrophe
  " [\u02B9\u02BC\u02C8\u0301\u2032\u2018-\u201B] > '' ;"
  " [\u066D\u2217\u2731] > '*' ;"
  // NOTE(review): \u201A also lies inside the \u2018-\u201B range of the
  // apostrophe rule above, which is listed first.
  " [\u060C\u201A\u3001] > ',' ;"
  " [\u2010-\u2013\u2212] > '-' ;"
  " [\u2023\u06D4\u3002\u0589] > '.' ;"
  " [\u0338\u2044\u2215] > '/' ;"
  " \u2236 > ':' ;"
  " \u061B > ';' ;"
  " [\u2039\u2329\u3008] > '<' ;"
  " \u2261 > '=' ;"
  " [\u203A\u232A\u3009] > '>' ;"
  " [\u037E\u061F] > '?' ;"
  " \u2049 > '!?' ;"
  " \u212C > B ;"
  " [\u2102\u212D] > C ;"
  " [\u2107\u2130] > E ;"
  " [\u2131\u2132] > F ;"
  " [\u210B\u210C\u210D] > H ;"
  " [\u2110\u2111\u2160] > I ;"
  " \u212A > K ;"
  " \u2112 > L ;"
  // U+2133 is SCRIPT CAPITAL M (U+2112 above is SCRIPT CAPITAL L)
  " \u2133 > M ;"
  " \u2115 > N ;"
  " \u2119 > P ;"
  " \u211A > Q ;"
  " [\u211B\u211C\u211D] > R ;"
  " [\u2124\u2128] > Z ;"
  " \u2216 > '\\' ;"
  " [\u02C4\u02C6\u0302\u2303] > '^' ;"
  " [\u02CD\u0331\u0332\u2017] > '_' ;"
  " [\u02CB\u0300\u2035] > '`' ;"
  " [\u212E\u212F] > e ;"
  " [\u0261\u210A] > g ;"
  " [\u04BB\u210E] > h ;"
  " \u0131 > i ;"
  " \u207F > n ;"
  " \u2134 > o ;"
  " \u01B6 > z ;"
  " [\u01C0\u2223\u2758] > '|' ;"
  " [\u02DC\u0303\u223C\uFF5E] > '~' ;"
  " \u202F > \u00A0 ;"
  " \uFFFD > \u001A;"
  // \u20A4 must stay ahead of the \u20A0-\u20AF range rule that follows
  " \u20A4 > \u00A3 ;"
  " [\u20A0-\u20AF] > \u00A4 ;"
  " \u0308 > \u00A8 ;"
  " \u2117 > \u00A9 ;"
  " [\u226A\u300A] > \u00AB ;"
  " \u2310 > \u00AC ;"
  " \u1806 > \u00AD ;"
  " [\u02C9\u0304\u0305] > \u00AF ;"
  " [\u02DA\u030A\u2070\u2218] > \u00B0 ;"
  " \u2213 > \u00B1 ;"
  // NOTE(review): \u02B9, \u0301 and \u2032 are also named by the apostrophe
  // rule above, which is listed first.
  " [\u02B9\u02CA\u0301\u2032] > \u00B4 ;"
  " [\u204B\u2761] > \u00B6 ;"
  " [\u2022\u2024\u2027\u2219\u22C5\u30FB] > \u00B7 ;"
  " \u0327 > \u00B8 ;"
  " [\u226B\u300B] > \u00BB ;"
  " \u2014 > '--';"
  " \u2015 > '---';"
  // U+2025 TWO DOT LEADER and U+2026 HORIZONTAL ELLIPSIS; U+2024 ONE DOT
  // LEADER is already handled by the \u00B7 rule above.
  " \u2025 > '..';"
  " \u2026 > '...';"
  " \u2016 > '||';"
  " \uFEFF > ;"

  //replace every remaining codepoint outside Latin-1 with a space
  "[^\u0000-\u00FF] > ' ';"

  }
}