diff options
Diffstat (limited to 'src/modules/filters/utf8greekaccents.cpp')
-rw-r--r-- | src/modules/filters/utf8greekaccents.cpp | 449 |
1 files changed, 223 insertions, 226 deletions
diff --git a/src/modules/filters/utf8greekaccents.cpp b/src/modules/filters/utf8greekaccents.cpp index b0e5dc8..05ef59b 100644 --- a/src/modules/filters/utf8greekaccents.cpp +++ b/src/modules/filters/utf8greekaccents.cpp @@ -1,6 +1,6 @@ /****************************************************************************** * - * UTF8GreekAccents - SWFilter decendant to remove UTF-8 Greek accents + * UTF8GreekAccents - SWFilter descendant to remove UTF-8 Greek accents * */ @@ -9,239 +9,235 @@ #include <stdio.h> #include <utf8greekaccents.h> +SWORD_NAMESPACE_START -const char UTF8GreekAccents::on[] = "On"; -const char UTF8GreekAccents::off[] = "Off"; -const char UTF8GreekAccents::optName[] = "Greek Accents"; -const char UTF8GreekAccents::optTip[] = "Toggles Greek Accents"; +const char oName[] = "Greek Accents"; +const char oTip[] = "Toggles Greek Accents"; -UTF8GreekAccents::UTF8GreekAccents() { - option = true; - options.push_back(on); - options.push_back(off); +const SWBuf choices[3] = {"On", "Off", ""}; +const StringList oValues(&choices[0], &choices[2]); + +UTF8GreekAccents::UTF8GreekAccents() : SWOptionFilter(oName, oTip, &oValues) { + setOptionValue("On"); } UTF8GreekAccents::~UTF8GreekAccents(){}; -void UTF8GreekAccents::setOptionValue(const char *ival) -{ - option = (!stricmp(ival, on)); -} - -const char *UTF8GreekAccents::getOptionValue() -{ - return (option) ? on:off; -} -char UTF8GreekAccents::ProcessText(char *text, int maxlen, const SWKey *key, const SWModule *module) -{ - if (!option) { - unsigned char *to, *from; - - to = (unsigned char*)text; - for (from = (unsigned char*)text; *from; from++) { - //first just remove combining characters - if (*from == 0xE2 && *(from + 1) == 0x80 && *(from + 2) == 0x99) - from += 2; - else if (*from == 0xCC && *(from + 1)) { - if (*(from + 1) == 0x80 || *(from + 1) == 0x81 || *(from + 1) == 0x82 || *(from + 1) == 0x88 || *(from + 1) == 0x93 || *(from + 1) == 0x94) - from++; - } - else if (*from == 0xCD && *(from + 1) == 0xBA) - from++; - //now converted pre-composed characters to their alphabetic bases, discarding the accents - - //Greek - //capital alpha - else if ((*from == 0xCE && *(from + 1) == 0x86)) { - *to++ = 0xCE; - *to++ = 0x91; - from++; - } - //capital epsilon - else if ((*from == 0xCE && *(from + 1) == 0x88)) { - *to++ = 0xCE; - *to++ = 0x95; - from++; - } - //capital eta - else if ((*from == 0xCE && *(from + 1) == 0x89)) { - *to++ = 0xCE; - *to++ = 0x97; - from++; - } - //capital iota - else if ((*from == 0xCE && (*(from + 1) == 0x8A || *(from + 1) == 0xAA))) { - *to++ = 0xCE; - *to++ = 0x99; - from++; - } - //capital omicron - else if ((*from == 0xCE && *(from + 1) == 0x8C)) { - *to++ = 0xCE; - *to++ = 0x9F; - from++; - } - //capital upsilon - else if ((*from == 0xCE && (*(from + 1) == 0x8E || *(from + 1) == 0xAB))) { - *to++ = 0xCE; - *to++ = 0xA5; - from++; - } - //capital omega - else if ((*from == 0xCE && *(from + 1) == 0x8F)) { - *to++ = 0xCE; - *to++ = 0xA9; - from++; - } - - //alpha - else if ((*from == 0xCE && *(from + 1) == 0xAC)) { - *to++ = 0xCE; - *to++ = 0xB1; - from++; - } - //epsilon - else if ((*from == 0xCE && *(from + 1) == 0xAD)) { - *to++ = 0xCE; - *to++ = 0xB5; - from++; - } - //eta - else if ((*from == 0xCE && *(from + 1) == 0xAE)) { - *to++ = 0xCE; - *to++ = 0xB7; - from++; - } - //iota - else if ((*from == 0xCE && *(from + 1) == 0xAF) || (*from == 0xCF && *(from + 1) == 0x8A)) { - *to++ = 0xCE; - *to++ = 0xB9; - from++; - } - //omicron - else if ((*from == 0xCF && *(from + 1) == 0x8C)) { - *to++ = 0xCE; - *to++ = 0xBF; - from++; - } - //upsilon - else if ((*from == 0xCE && *(from + 1) == 0x88) || (*from == 0xCF && (*(from + 1) == 0x8B || *(from + 1) == 0x8D))) { - *to++ = 0xCF; - *to++ = 0x85; - from++; - } - //omega - else if ((*from == 0xCF && *(from + 1) == 0x8E)) { - *to++ = 0xCF; - *to++ = 0x89; - from++; - } - - //Extended Greek - //capital alpha - else if (*from == 0xE1 && ((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x88 && *(from + 2) <= 0x8F) || (*(from + 1) == 0xBE && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBC)) { - *to++ = 0xCE; - *to++ = 0x91; - from+=2; - } - //capital epsilon - else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x98 && *(from + 2) <= 0x9D) || (*(from + 1) == 0xBF && (*(from + 2) == 0x88 || *(from + 2) == 0x89)))) { - *to++ = 0xCE; - *to++ = 0x95; - from+=2; - } - //capital eta - else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBE && *(from + 2) >= 0x98 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0x8A && *(from + 2) <= 0x8C))) { - *to++ = 0xCE; - *to++ = 0x97; - from+=2; - } - //capital iota - else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBF) || (*(from + 1) == 0xBF && *(from + 2) >= 0x98 && *(from + 2) <= 0x9B))) { - *to++ = 0xCE; - *to++ = 0x99; - from+=2; - } - //capital omicron - else if (*from == 0xE1 && ((*(from + 1) == 0xBD && *(from + 2) >= 0x88 && *(from + 2) <= 0x8D) || (*(from + 1) == 0xBF && *(from + 2) == 0xB8 || *(from + 2) == 0xB9))) { - *to++ = 0xCE; - *to++ = 0x9F; - from+=2; - } - //capital upsilon - else if (*from == 0xE1 && ((*(from + 1) == 0xBD && *(from + 2) >= 0x99 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAB))) { - *to++ = 0xCE; - *to++ = 0xA5; - from+=2; - } - //capital omega - else if (*from == 0xE1 && (((*(from + 1) == 0xBD || *(from + 1) == 0xBE) && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBF && *(from + 2) >= 0xBA && *(from + 2) <= 0xBC))) { - *to++ = 0xCE; - *to++ = 0xA9; - from+=2; - } - //capital rho - else if (*from == 0xE1 && *(from + 1) == 0xBF && *(from + 2) == 0xAC) { - *to++ = 0xCE; - *to++ = 0xA1; - from+=2; - } - - //alpha - else if (*from == 0xE1 && ((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x80 && *(from + 2) <= 0x87) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB0 || *(from + 2) == 0xB1)) || (*(from + 1) == 0xBE && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7)) { - *to++ = 0xCE; - *to++ = 0xB1; - from+=2; - } - //epsilon - else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x90 && *(from + 2) <= 0x95) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB2 || *(from + 2) == 0xB3)))) { - *to++ = 0xCE; - *to++ = 0xB5; - from+=2; - } - //eta - else if (*from == 0xE1 && ((*(from + 1) == 0xBE && *(from + 2) >= 0x90 && *(from + 2) <= 0x97) || (*(from + 1) == 0xBC && *(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 1) == 0xBF && *(from + 2) >= 0x82 && *(from + 2) <= 0x87) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB4 || *(from + 2) == 0xB5)))) { - *to++ = 0xCE; - *to++ = 0xB7; - from+=2; - } - //iota - else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB6 || *(from + 2) == 0xB7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0x90 && *(from + 2) <= 0x97))) { - *to++ = 0xCE; - *to++ = 0xB9; - from+=2; - } - //omicron - else if (*from == 0xE1 && (*(from + 1) == 0xBD && ((*(from + 2) >= 0x80 && *(from + 2) <= 0x85) || (*(from + 2) == 0xB8 || *(from + 2) == 0xB9)))) { - *to++ = 0xCE; - *to++ = 0xBF; - from+=2; - } - //upsilon - else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0x90 && *(from + 2) <= 0x97) || *(from + 2) == 0xBA || *(from + 2) == 0xBB)) || (*(from + 1) == 0xBF && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA3) || *(from + 2) == 0xA6 || *(from + 2) == 0xA7)))) { - *to++ = 0xCF; - *to++ = 0x85; - from+=2; - } - //omega - else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 2) == 0xBC || *(from + 2) == 0xBD))) || (*(from + 1) == 0xBE && (*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0xB2 && *(from + 2) <= 0xB7))) { - *to++ = 0xCF; - *to++ = 0x89; - from+=2; - } - //rho - else if (*from == 0xE1 && *(from + 1) == 0xBF && (*(from + 2) == 0xA4 && *(from + 2) == 0xA5)) { - *to++ = 0xCF; - *to++ = 0x81; - from+=2; - } - else - *to++ = *from; +char UTF8GreekAccents::processText(SWBuf &text, const SWKey *key, const SWModule *module) { + + if (!option) { //we don't want greek accents + //unsigned char *to, *from; + //to = (unsigned char*)text; + //for (from = (unsigned char*)text; *from; from++) { + + SWBuf orig = text; + const unsigned char* from = (unsigned char*)orig.c_str(); + for (text = ""; *from; from++) { + //first just remove combining characters + if (*from == 0xE2 && *(from + 1) == 0x80 && *(from + 2) == 0x99) { + from += 2; + } + else if (*from == 0xCC && *(from + 1)) { + if (*(from + 1) == 0x80 || *(from + 1) == 0x81 || *(from + 1) == 0x82 || *(from + 1) == 0x88 || *(from + 1) == 0x93 || *(from + 1) == 0x94) { + from++; + } + } + else if (*from == 0xCD && *(from + 1) == 0xBA) { + from++; + } + //now converted pre-composed characters to their alphabetic bases, discarding the accents + + //Greek + //capital alpha + else if ((*from == 0xCE && *(from + 1) == 0x86)) { + text += 0xCE; + text += 0x91; + from++; + } + //capital epsilon + else if ((*from == 0xCE && *(from + 1) == 0x88)) { + text += 0xCE; + text += 0x95; + from++; + } + //capital eta + else if ((*from == 0xCE && *(from + 1) == 0x89)) { + text += 0xCE; + text += 0x97; + from++; + } + //capital iota + else if ((*from == 0xCE && (*(from + 1) == 0x8A || *(from + 1) == 0xAA))) { + text += 0xCE; + text += 0x99; + from++; + } + //capital omicron + else if ((*from == 0xCE && *(from + 1) == 0x8C)) { + text += 0xCE; + text += 0x9F; + from++; + } + //capital upsilon + else if ((*from == 0xCE && (*(from + 1) == 0x8E || *(from + 1) == 0xAB))) { + text += 0xCE; + text += 0xA5; + from++; + } + //capital omega + else if ((*from == 0xCE && *(from + 1) == 0x8F)) { + text += 0xCE; + text += 0xA9; + from++; + } + + //alpha + else if ((*from == 0xCE && *(from + 1) == 0xAC)) { + text += 0xCE; + text += 0xB1; + from++; + } + //epsilon + else if ((*from == 0xCE && *(from + 1) == 0xAD)) { + text += 0xCE; + text += 0xB5; + from++; + } + //eta + else if ((*from == 0xCE && *(from + 1) == 0xAE)) { + text += 0xCE; + text += 0xB7; + from++; + } + //iota + else if ((*from == 0xCE && *(from + 1) == 0xAF) || (*from == 0xCF && *(from + 1) == 0x8A)) { + text += 0xCE; + text += 0xB9; + from++; + } + //omicron + else if ((*from == 0xCF && *(from + 1) == 0x8C)) { + text += 0xCE; + text += 0xBF; + from++; + } + //upsilon + else if ((*from == 0xCE && *(from + 1) == 0x88) || (*from == 0xCF && (*(from + 1) == 0x8B || *(from + 1) == 0x8D))) { + text += 0xCF; + text += 0x85; + from++; + } + //omega + else if ((*from == 0xCF && *(from + 1) == 0x8E)) { + text += 0xCF; + text += 0x89; + from++; + } + + //Extended Greek + //capital alpha + else if (*from == 0xE1 && ((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x88 && *(from + 2) <= 0x8F) || (*(from + 1) == 0xBE && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBC)) { + text += 0xCE; + text += 0x91; + from+=2; + } + //capital epsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x98 && *(from + 2) <= 0x9D) || (*(from + 1) == 0xBF && (*(from + 2) == 0x88 || *(from + 2) == 0x89)))) { + text += 0xCE; + text += 0x95; + from+=2; + } + //capital eta + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBE && *(from + 2) >= 0x98 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0x8A && *(from + 2) <= 0x8C))) { + text += 0xCE; + text += 0x97; + from+=2; + } + //capital iota + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBF) || (*(from + 1) == 0xBF && *(from + 2) >= 0x98 && *(from + 2) <= 0x9B))) { + text += 0xCE; + text += 0x99; + from+=2; + } + //capital omicron + else if (*from == 0xE1 && ((*(from + 1) == 0xBD && *(from + 2) >= 0x88 && *(from + 2) <= 0x8D) || (*(from + 1) == 0xBF && *(from + 2) == 0xB8 || *(from + 2) == 0xB9))) { + text += 0xCE; + text += 0x9F; + from+=2; + } + //capital upsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBD && *(from + 2) >= 0x99 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAB))) { + text += 0xCE; + text += 0xA5; + from+=2; + } + //capital omega + else if (*from == 0xE1 && (((*(from + 1) == 0xBD || *(from + 1) == 0xBE) && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBF && *(from + 2) >= 0xBA && *(from + 2) <= 0xBC))) { + text += 0xCE; + text += 0xA9; + from+=2; + } + //capital rho + else if (*from == 0xE1 && *(from + 1) == 0xBF && *(from + 2) == 0xAC) { + text += 0xCE; + text += 0xA1; + from+=2; + } + + //alpha + else if (*from == 0xE1 && ((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x80 && *(from + 2) <= 0x87) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB0 || *(from + 2) == 0xB1)) || (*(from + 1) == 0xBE && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7)) { + text += 0xCE; + text += 0xB1; + from+=2; + } + //epsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x90 && *(from + 2) <= 0x95) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB2 || *(from + 2) == 0xB3)))) { + text += 0xCE; + text += 0xB5; + from+=2; + } + //eta + else if (*from == 0xE1 && ((*(from + 1) == 0xBE && *(from + 2) >= 0x90 && *(from + 2) <= 0x97) || (*(from + 1) == 0xBC && *(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 1) == 0xBF && *(from + 2) >= 0x82 && *(from + 2) <= 0x87) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB4 || *(from + 2) == 0xB5)))) { + text += 0xCE; + text += 0xB7; + from+=2; + } + //iota + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB6 || *(from + 2) == 0xB7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0x90 && *(from + 2) <= 0x97))) { + text += 0xCE; + text += 0xB9; + from+=2; + } + //omicron + else if (*from == 0xE1 && (*(from + 1) == 0xBD && ((*(from + 2) >= 0x80 && *(from + 2) <= 0x85) || (*(from + 2) == 0xB8 || *(from + 2) == 0xB9)))) { + text += 0xCE; + text += 0xBF; + from+=2; + } + //upsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0x90 && *(from + 2) <= 0x97) || *(from + 2) == 0xBA || *(from + 2) == 0xBB)) || (*(from + 1) == 0xBF && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA3) || *(from + 2) == 0xA6 || *(from + 2) == 0xA7)))) { + text += 0xCF; + text += 0x85; + from+=2; + } + //omega + else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 2) == 0xBC || *(from + 2) == 0xBD))) || (*(from + 1) == 0xBE && (*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0xB2 && *(from + 2) <= 0xB7))) { + text += 0xCF; + text += 0x89; + from+=2; + } + //rho + else if (*from == 0xE1 && *(from + 1) == 0xBF && (*(from + 2) == 0xA4 && *(from + 2) == 0xA5)) { + text += 0xCF; + text += 0x81; + from+=2; + } + else { //no characters we filter + text += *from; + } + } } - *to++ = 0; - *to = 0; - } return 0; } @@ -250,3 +246,4 @@ char UTF8GreekAccents::ProcessText(char *text, int maxlen, const SWKey *key, con +SWORD_NAMESPACE_END |