summaryrefslogtreecommitdiff
path: root/src/modules/filters/utf8transliterator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/modules/filters/utf8transliterator.cpp')
-rw-r--r--src/modules/filters/utf8transliterator.cpp479
1 files changed, 479 insertions, 0 deletions
diff --git a/src/modules/filters/utf8transliterator.cpp b/src/modules/filters/utf8transliterator.cpp
new file mode 100644
index 0000000..7bc068a
--- /dev/null
+++ b/src/modules/filters/utf8transliterator.cpp
@@ -0,0 +1,479 @@
+/******************************************************************************
+*
+* utf8transliterators - SWFilter decendant to transliterate between
+* ICU-supported scripts.
+*/
+
+#ifdef _ICU_
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __GNUC__
+#include <unixstr.h>
+#endif
+
+#include <utf8transliterator.h>
+
+const char UTF8Transliterator::optionstring[NUMTARGETSCRIPTS][16] = {
+ "Off",
+ "Latin",
+ "Basic Latin",
+ "Beta",
+ "BGreek",
+/*
+ "Greek",
+ "Hebrew",
+ "Cyrillic",
+ "Arabic",
+ "Syriac",
+ "Katakana",
+ "Hiragana",
+ "Jamo",
+ "Hangul",
+ "Devanagari",
+ "Tamil",
+ "Bengali",
+ "Gurmukhi",
+ "Gujarati",
+ "Oriya",
+ "Telugu",
+ "Kannada",
+ "Malayalam",
+ "Thai",
+ "Georgian",
+ "Armenian",
+ "Ethiopic",
+ "Gothic",
+ "Ugaritic",
+ "Coptic"
+ */
+};
+
+const char UTF8Transliterator::optName[] = "Transliteration";
+const char UTF8Transliterator::optTip[] = "Transliterates between scripts";
+
+UTF8Transliterator::UTF8Transliterator() {
+ option = 0;
+ unsigned long i;
+ for (i = 0; i < NUMTARGETSCRIPTS; i++) {
+ options.push_back(optionstring[i]);
+ }
+}
+
+void UTF8Transliterator::setOptionValue(const char *ival)
+{
+ unsigned char i = option = NUMTARGETSCRIPTS;
+ while (i && stricmp(ival, optionstring[i])) {
+ i--;
+ option = i;
+ }
+}
+
+const char *UTF8Transliterator::getOptionValue()
+{
+ return (NUMTARGETSCRIPTS > option) ? optionstring[option] : 0;
+}
+
+char UTF8Transliterator::ProcessText(char *text, int maxlen, const SWKey *key, const SWModule *module)
+{
+ if (option) { // if we want transliteration
+ unsigned long i, j;
+ UErrorCode err = U_ZERO_ERROR;
+ UConverter * conv = NULL;
+ conv = ucnv_open("UTF-8", &err);
+
+ bool compat = false;
+ bool noNFC = false;
+
+ if (option == SE_JAMO) {
+ noNFC = true;
+ }
+
+ // Convert UTF-8 string to UTF-16 (UChars)
+ j = strlen(text);
+ int32_t len = (j * 2) + 1;
+ UChar *source = new UChar[len];
+ err = U_ZERO_ERROR;
+ len = ucnv_toUChars(conv, source, len, text, j, &err);
+ source[len] = 0;
+
+ // Figure out which scripts are used in the string
+ unsigned char scripts[NUMSCRIPTS];
+
+ for (i = 0; i < NUMSCRIPTS; i++) {
+ scripts[i] = false;
+ }
+
+ for (i = 0; i < len; i++) {
+ j = ublock_getCode(source[i]);
+ switch (j) {
+ case UBLOCK_BASIC_LATIN: scripts[SE_LATIN] = true; break;
+ case UBLOCK_GREEK: scripts[SE_GREEK] = true; break;
+ case UBLOCK_HEBREW: scripts[SE_HEBREW] = true; break;
+ case UBLOCK_CYRILLIC: scripts[SE_CYRILLIC] = true; break;
+ case UBLOCK_ARABIC: scripts[SE_ARABIC] = true; break;
+ case UBLOCK_SYRIAC: scripts[SE_SYRIAC] = true; break;
+ case UBLOCK_KATAKANA: scripts[SE_KATAKANA] = true; break;
+ case UBLOCK_HIRAGANA: scripts[SE_HIRAGANA] = true; break;
+ case UBLOCK_HANGUL_SYLLABLES: scripts[SE_HANGUL] = true; break;
+ case UBLOCK_HANGUL_JAMO: scripts[SE_JAMO] = true; break;
+ case UBLOCK_DEVANAGARI: scripts[SE_DEVANAGARI] = true; break;
+ case UBLOCK_TAMIL: scripts[SE_TAMIL] = true; break;
+ case UBLOCK_BENGALI: scripts[SE_BENGALI] = true; break;
+ case UBLOCK_GURMUKHI: scripts[SE_GURMUKHI] = true; break;
+ case UBLOCK_GUJARATI: scripts[SE_GUJARATI] = true; break;
+ case UBLOCK_ORIYA: scripts[SE_ORIYA] = true; break;
+ case UBLOCK_TELUGU: scripts[SE_TELUGU] = true; break;
+ case UBLOCK_KANNADA: scripts[SE_KANNADA] = true; break;
+ case UBLOCK_MALAYALAM: scripts[SE_MALAYALAM] = true; break;
+ case UBLOCK_THAI: scripts[SE_THAI] = true; break;
+ case UBLOCK_GEORGIAN: scripts[SE_GEORGIAN] = true; break;
+ case UBLOCK_ARMENIAN: scripts[SE_ARMENIAN] = true; break;
+ case UBLOCK_ETHIOPIC: scripts[SE_ETHIOPIC] = true; break;
+ case UBLOCK_GOTHIC: scripts[SE_GOTHIC] = true; break;
+ // needs Unicode 3.2? or 4.0? support from ICU
+ //case UBLOCK_UGARITIC: scripts[SE_UGARITIC] = true; break;
+ case UBLOCK_CJK_RADICALS_SUPPLEMENT:
+ case UBLOCK_KANGXI_RADICALS:
+ case UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS:
+ case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
+ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
+ scripts[SE_HAN] = true;
+ break;
+ case UBLOCK_CJK_COMPATIBILITY:
+ case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS:
+ case UBLOCK_CJK_COMPATIBILITY_FORMS:
+ scripts[SE_HAN] = true;
+ compat = true;
+ break;
+ case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
+ scripts[SE_HANGUL] = true;
+ compat = true;
+ break;
+
+ default: scripts[SE_LATIN] = true;
+ }
+ }
+ scripts[option] = false; //turn off the reflexive transliteration
+
+ //return if we have no transliteration to do for this text
+ j = 0;
+ for (i = 0; !j && i < NUMSCRIPTS; i++) {
+ if (scripts[i]) j++;
+ }
+ if (!j) {
+ ucnv_close(conv);
+ return 0;
+ }
+
+ UnicodeString id;
+ if (compat) {
+ id = UnicodeString("NFKD");
+ }
+ else {
+ id = UnicodeString("NFD");
+ }
+
+ //Simple X to Latin transliterators
+ if (scripts[SE_GREEK]) {
+ if (option == SE_BETA)
+ id += UnicodeString(";Greek-Beta");
+ else if (option == SE_BGREEK)
+ id += UnicodeString(";Greek-BGreek");
+ else {
+ if (!strnicmp (((SWModule*)module)->Lang(), "cop", 3)) {
+ id += UnicodeString(";Coptic-Latin");
+ }
+ else {
+ id += UnicodeString(";Greek-Latin");
+ }
+ scripts[SE_LATIN] = true;
+ }
+ }
+ if (scripts[SE_HEBREW]) {
+ if (option == SE_BETA)
+ id += UnicodeString(";Hebrew-CCAT");
+ else if (option == SE_SYRIAC)
+ id += UnicodeString(";Hebrew-Syriac");
+ else {
+ id += UnicodeString(";Hebrew-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ }
+ if (scripts[SE_CYRILLIC]) {
+ id += UnicodeString(";Cyrillic-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_ARABIC]) {
+ id += UnicodeString(";Arabic-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_SYRIAC]) {
+ if (option == SE_BETA)
+ id += UnicodeString(";Syriac-CCAT");
+ else if (option == SE_HEBREW)
+ id += UnicodeString(";Syriac-Hebrew");
+ else {
+ id += UnicodeString(";Syriac-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ }
+ if (scripts[SE_THAI]) {
+ id += UnicodeString(";Thai-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_GEORGIAN]) {
+ id += UnicodeString(";Georgian-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_ARMENIAN]) {
+ id += UnicodeString(";Armenian-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_ETHIOPIC]) {
+ id += UnicodeString(";Ethiopic-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_GOTHIC]) {
+ id += UnicodeString(";Gothic-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_UGARITIC]) {
+ id += UnicodeString(";Ugaritic-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_HAN]) {
+ if (!strnicmp (((SWModule*)module)->Lang(), "ja", 2)) {
+ id += UnicodeString(";Kanji-OnRomaji");
+ }
+ else {
+ id += UnicodeString(";Han-Pinyin");
+ }
+ scripts[SE_LATIN] = true;
+ }
+
+ // Inter-Kana and Kana to Latin transliterators
+ if (option == SE_HIRAGANA && scripts[SE_KATAKANA]) {
+ id += UnicodeString(";Katakana-Hiragana");
+ scripts[SE_HIRAGANA] = true;
+ }
+ else if (option == SE_KATAKANA && scripts[SE_HIRAGANA]) {
+ id += UnicodeString(";Hiragana-Katakana");
+ scripts[SE_KATAKANA] = true;
+ }
+ else {
+ if (scripts[SE_KATAKANA]) {
+ id += UnicodeString(";Katakana-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_HIRAGANA]) {
+ id += UnicodeString(";Hiragana-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ }
+
+ // Inter-Korean and Korean to Latin transliterators
+ if (option == SE_HANGUL && scripts[SE_JAMO]) {
+ noNFC = false;
+ scripts[SE_HANGUL] = true;
+ }
+ else if (option == SE_JAMO && scripts[SE_HANGUL]) {
+ noNFC = true;
+ scripts[SE_JAMO] = true;
+ }
+ else {
+ if (scripts[SE_HANGUL]) {
+ id += UnicodeString(";Hangul-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_JAMO]) {
+ id += UnicodeString(";Jamo-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ }
+
+ // Indic-Latin
+ if (option < SE_DEVANAGARI || option > SE_MALAYALAM) {
+ // Indic to Latin
+ if (scripts[SE_TAMIL]) {
+ id += UnicodeString(";Tamil-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_BENGALI]) {
+ id += UnicodeString(";Bengali-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_GURMUKHI]) {
+ id += UnicodeString(";Gurmukhi-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_GUJARATI]) {
+ id += UnicodeString(";Gujarati-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_ORIYA]) {
+ id += UnicodeString(";Oriya-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_TELUGU]) {
+ id += UnicodeString(";Telugu-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_KANNADA]) {
+ id += UnicodeString(";Kannada-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ if (scripts[SE_MALAYALAM]) {
+ id += UnicodeString(";Malayalam-Latin");
+ scripts[SE_LATIN] = true;
+ }
+ }
+ else {
+ if (scripts[SE_LATIN]) {
+ id += UnicodeString(";Latin-InterIndic");
+ }
+ if (scripts[SE_DEVANAGARI]) {
+ id += UnicodeString(";Devanagari-InterIndic");
+ }
+ if (scripts[SE_TAMIL]) {
+ id += UnicodeString(";Tamil-InterIndic");
+ }
+ if (scripts[SE_BENGALI]) {
+ id += UnicodeString(";Bengali-InterIndic");
+ }
+ if (scripts[SE_GURMUKHI]) {
+ id += UnicodeString(";Gurmurkhi-InterIndic");
+ }
+ if (scripts[SE_GUJARATI]) {
+ id += UnicodeString(";Gujarati-InterIndic");
+ }
+ if (scripts[SE_ORIYA]) {
+ id += UnicodeString(";Oriya-InterIndic");
+ }
+ if (scripts[SE_TELUGU]) {
+ id += UnicodeString(";Telugu-InterIndic");
+ }
+ if (scripts[SE_KANNADA]) {
+ id += UnicodeString(";Kannada-InterIndic");
+ }
+ if (scripts[SE_MALAYALAM]) {
+ id += UnicodeString(";Malayalam-InterIndic");
+ }
+
+ switch(option) {
+ case SE_DEVANAGARI:
+ id += UnicodeString(";InterIndic-Devanagari");
+ break;
+ case SE_TAMIL:
+ id += UnicodeString(";InterIndic-Tamil");
+ break;
+ case SE_BENGALI:
+ id += UnicodeString(";InterIndic-Bengali");
+ break;
+ case SE_GURMUKHI:
+ id += UnicodeString(";InterIndic-Gurmukhi");
+ break;
+ case SE_GUJARATI:
+ id += UnicodeString(";InterIndic-Gujarati");
+ break;
+ case SE_ORIYA:
+ id += UnicodeString(";InterIndic-Oriya");
+ break;
+ case SE_TELUGU:
+ id += UnicodeString(";InterIndic-Telugu");
+ break;
+ case SE_KANNADA:
+ id += UnicodeString(";InterIndic-Kannada");
+ break;
+ case SE_MALAYALAM:
+ id += UnicodeString(";InterIndic-Malayalam");
+ break;
+ default:
+ id += UnicodeString(";InterIndic-Latin");
+ scripts[SE_LATIN] = true;
+ break;
+ }
+ }
+
+ if (scripts[SE_LATIN]) {
+ switch (option) {
+ case SE_GREEK:
+ id += UnicodeString(";Latin-Greek");
+ break;
+ case SE_HEBREW:
+ id += UnicodeString(";Latin-Hebrew");
+ break;
+ case SE_CYRILLIC:
+ id += UnicodeString(";Latin-Cyrillic");
+ break;
+ case SE_ARABIC:
+ id += UnicodeString(";Latin-Arabic");
+ break;
+ case SE_SYRIAC:
+ id += UnicodeString(";Latin-Syriac");
+ break;
+ case SE_THAI:
+ id += UnicodeString(";Latin-Thai");
+ break;
+ case SE_GEORGIAN:
+ id += UnicodeString(";Latin-Georgian");
+ break;
+ case SE_ARMENIAN:
+ id += UnicodeString(";Latin-Armenian");
+ break;
+ case SE_ETHIOPIC:
+ id += UnicodeString(";Latin-Ethiopic");
+ break;
+ case SE_GOTHIC:
+ id += UnicodeString(";Latin-Gothic");
+ break;
+ case SE_UGARITIC:
+ id += UnicodeString(";Latin-Ugaritic");
+ break;
+ case SE_COPTIC:
+ id += UnicodeString(";Latin-Coptic");
+ break;
+ case SE_KATAKANA:
+ id += UnicodeString(";Latin-Katakana");
+ break;
+ case SE_HIRAGANA:
+ id += UnicodeString(";Latin-Hiragana");
+ break;
+ case SE_JAMO:
+ id += UnicodeString(";Latin-Jamo");
+ break;
+ case SE_HANGUL:
+ id += UnicodeString(";Latin-Hangul");
+ break;
+ }
+ }
+
+ if (option == SE_BASICLATIN) {
+ id += UnicodeString(";Any-Latin1");
+ }
+
+ if (noNFC) {
+ id += UnicodeString(";NFD");
+ } else {
+ id += UnicodeString(";NFC");
+ }
+
+ UParseError perr;
+
+ err = U_ZERO_ERROR;
+ Transliterator * trans = Transliterator::createInstance(id, UTRANS_FORWARD, perr, err);
+ if (trans) {
+ UnicodeString target = UnicodeString(source);
+ trans->transliterate(target);
+ len = ucnv_fromUChars(conv, text, maxlen, target.getBuffer(), target.length(), &err);
+ if (len < maxlen) *(text + len) = 0;
+ else *(text + maxlen) = 0;
+ delete trans;
+ }
+ ucnv_close(conv);
+ }
+ return 0;
+}
+#endif