diff options
Diffstat (limited to 'src/modules/filters/utf8greekaccents.cpp')
-rw-r--r-- | src/modules/filters/utf8greekaccents.cpp | 285 |
1 files changed, 285 insertions, 0 deletions
diff --git a/src/modules/filters/utf8greekaccents.cpp b/src/modules/filters/utf8greekaccents.cpp new file mode 100644 index 0000000..039f9a4 --- /dev/null +++ b/src/modules/filters/utf8greekaccents.cpp @@ -0,0 +1,285 @@ +/****************************************************************************** + * + * utf8greekaccents.cpp - SWFilter descendant to remove UTF-8 Greek + * accents + * + * $Id: utf8greekaccents.cpp 2980 2013-09-14 21:51:47Z scribe $ + * + * Copyright 2001-2013 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include <stdlib.h> +#include <stdio.h> +#include <utf8greekaccents.h> + + +#ifdef _ICU_ +#include <utf8nfkd.h> +sword::UTF8NFKD decompose; +#endif + + +SWORD_NAMESPACE_START + +namespace { + + static const char oName[] = "Greek Accents"; + static const char oTip[] = "Toggles Greek Accents"; + + static const StringList *oValues() { + static const SWBuf choices[3] = {"On", "Off", ""}; + static const StringList oVals(&choices[0], &choices[2]); + return &oVals; + } +} + + +UTF8GreekAccents::UTF8GreekAccents() : SWOptionFilter(oName, oTip, oValues()) { +} + + +UTF8GreekAccents::~UTF8GreekAccents() {}; + + +char UTF8GreekAccents::processText(SWBuf &text, const SWKey *key, const SWModule *module) { + + if (!option) { //we don't want greek accents + //unsigned char *to, *from; + //to = (unsigned char*)text; + //for (from = (unsigned char*)text; *from; from++) { +#ifdef _ICU_ + decompose.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks +#endif + + SWBuf orig = text; + const unsigned char* from = (unsigned char*)orig.c_str(); + for (text = ""; *from; from++) { + //first just remove combining characters + if (*from == 0xE2 && *(from + 1) == 0x80 && *(from + 2) == 0x99) { + from += 2; + } + else if (*from == 0xCC && *(from + 1)) { + if (*(from + 1) == 0x80 || *(from + 1) == 0x81 || *(from + 1) == 0x82 || *(from + 1) == 0x88 || *(from + 1) == 0x93 || *(from + 1) == 0x94) { + from++; + } + } + else if (*from == 0xCD && (*(from + 1) == 0xBA || *(from + 1) == 0x82)) { + from++; + } + //now converted pre-composed characters to their alphabetic bases, discarding the accents + + //Greek + //capital alpha + else if ((*from == 0xCE && *(from + 1) == 0x86)) { + text += 0xCE; + text += 0x91; + from++; + } + //capital epsilon + else if ((*from == 0xCE && *(from + 1) == 0x88)) { + text += 0xCE; + text += 0x95; + from++; + } + //capital eta + else if ((*from == 0xCE && *(from + 1) == 0x89)) { + text += 0xCE; + text += 0x97; + from++; + } + //capital iota + else if ((*from == 0xCE && (*(from + 1) == 0x8A || *(from + 1) == 0xAA))) { + text += 0xCE; + text += 0x99; + from++; + } + //capital omicron + else if ((*from == 0xCE && *(from + 1) == 0x8C)) { + text += 0xCE; + text += 0x9F; + from++; + } + //capital upsilon + else if ((*from == 0xCE && (*(from + 1) == 0x8E || *(from + 1) == 0xAB))) { + text += 0xCE; + text += 0xA5; + from++; + } + //capital omega + else if ((*from == 0xCE && *(from + 1) == 0x8F)) { + text += 0xCE; + text += 0xA9; + from++; + } + + //alpha + else if ((*from == 0xCE && *(from + 1) == 0xAC)) { + text += 0xCE; + text += 0xB1; + from++; + } + //epsilon + else if ((*from == 0xCE && *(from + 1) == 0xAD)) { + text += 0xCE; + text += 0xB5; + from++; + } + //eta + else if ((*from == 0xCE && *(from + 1) == 0xAE)) { + text += 0xCE; + text += 0xB7; + from++; + } + //iota + else if ((*from == 0xCE && *(from + 1) == 0xAF) || (*from == 0xCF && *(from + 1) == 0x8A)) { + text += 0xCE; + text += 0xB9; + from++; + } + //omicron + else if ((*from == 0xCF && *(from + 1) == 0x8C)) { + text += 0xCE; + text += 0xBF; + from++; + } + //upsilon + else if ((*from == 0xCE && *(from + 1) == 0x88) || (*from == 0xCF && (*(from + 1) == 0x8B || *(from + 1) == 0x8D))) { + text += 0xCF; + text += 0x85; + from++; + } + //omega + else if ((*from == 0xCF && *(from + 1) == 0x8E)) { + text += 0xCF; + text += 0x89; + from++; + } + + //Extended Greek + //capital alpha + else if (*from == 0xE1 && (((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x88 && *(from + 2) <= 0x8F) || (*(from + 1) == 0xBE && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBC))) { + text += 0xCE; + text += 0x91; + from+=2; + } + //capital epsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x98 && *(from + 2) <= 0x9D) || (*(from + 1) == 0xBF && (*(from + 2) == 0x88 || *(from + 2) == 0x89)))) { + text += 0xCE; + text += 0x95; + from+=2; + } + //capital eta + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBE && *(from + 2) >= 0x98 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0x8A && *(from + 2) <= 0x8C))) { + text += 0xCE; + text += 0x97; + from+=2; + } + //capital iota + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB8 && *(from + 2) <= 0xBF) || (*(from + 1) == 0xBF && *(from + 2) >= 0x98 && *(from + 2) <= 0x9B))) { + text += 0xCE; + text += 0x99; + from+=2; + } + //capital omicron + else if (*from == 0xE1 && (((*(from + 1) == 0xBD && *(from + 2) >= 0x88 && *(from + 2) <= 0x8D)) || ((*(from + 1) == 0xBF && (*(from + 2) == 0xB8 || *(from + 2) == 0xB9))))) { + text += 0xCE; + text += 0x9F; + from+=2; + } + //capital upsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBD && *(from + 2) >= 0x99 && *(from + 2) <= 0x9F) || (*(from + 1) == 0xBF && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAB))) { + text += 0xCE; + text += 0xA5; + from+=2; + } + //capital omega + else if (*from == 0xE1 && (((*(from + 1) == 0xBD || *(from + 1) == 0xBE) && *(from + 2) >= 0xA8 && *(from + 2) <= 0xAF) || (*(from + 1) == 0xBF && *(from + 2) >= 0xBA && *(from + 2) <= 0xBC))) { + text += 0xCE; + text += 0xA9; + from+=2; + } + //capital rho + else if (*from == 0xE1 && *(from + 1) == 0xBF && *(from + 2) == 0xAC) { + text += 0xCE; + text += 0xA1; + from+=2; + } + + //alpha + else if (*from == 0xE1 && ( + ((*(from + 1) == 0xBC || *(from + 1) == 0xBE) && *(from + 2) >= 0x80 && *(from + 2) <= 0x87) + || (*(from + 1) == 0xBD && (*(from + 2) == 0xB0 || *(from + 2) == 0xB1)) + || (*(from + 1) == 0xBE && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7))) { + text += 0xCE; + text += 0xB1; + from+=2; + } + //epsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0x90 && *(from + 2) <= 0x95) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB2 || *(from + 2) == 0xB3)))) { + text += 0xCE; + text += 0xB5; + from+=2; + } + //eta + else if (*from == 0xE1 && ((*(from + 1) == 0xBE && *(from + 2) >= 0x90 && *(from + 2) <= 0x97) || (*(from + 1) == 0xBC && *(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 1) == 0xBF && *(from + 2) >= 0x82 && *(from + 2) <= 0x87) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB4 || *(from + 2) == 0xB5)))) { + text += 0xCE; + text += 0xB7; + from+=2; + } + //iota + else if (*from == 0xE1 && ((*(from + 1) == 0xBC && *(from + 2) >= 0xB0 && *(from + 2) <= 0xB7) || (*(from + 1) == 0xBD && (*(from + 2) == 0xB6 || *(from + 2) == 0xB7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0x90 && *(from + 2) <= 0x97))) { + text += 0xCE; + text += 0xB9; + from+=2; + } + //omicron + else if (*from == 0xE1 && (*(from + 1) == 0xBD && ((*(from + 2) >= 0x80 && *(from + 2) <= 0x85) || (*(from + 2) == 0xB8 || *(from + 2) == 0xB9)))) { + text += 0xCE; + text += 0xBF; + from+=2; + } + //upsilon + else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0x90 && *(from + 2) <= 0x97) || *(from + 2) == 0xBA || *(from + 2) == 0xBB)) || (*(from + 1) == 0xBF && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA3) || *(from + 2) == 0xA6 || *(from + 2) == 0xA7)))) { + text += 0xCF; + text += 0x85; + from+=2; + } + //omega + else if (*from == 0xE1 && ((*(from + 1) == 0xBD && ((*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7) || (*(from + 2) == 0xBC || *(from + 2) == 0xBD))) || (*(from + 1) == 0xBE && (*(from + 2) >= 0xA0 && *(from + 2) <= 0xA7)) || (*(from + 1) == 0xBF && *(from + 2) >= 0xB2 && *(from + 2) <= 0xB7))) { + text += 0xCF; + text += 0x89; + from+=2; + } + //rho + else if (*from == 0xE1 && *(from + 1) == 0xBF && (*(from + 2) == 0xA4 && *(from + 2) == 0xA5)) { + text += 0xCF; + text += 0x81; + from+=2; + } + else { //no characters we filter + text += *from; + } + } + } + return 0; +} + + + + + + +SWORD_NAMESPACE_END |