diff options
author | Roberto C. Sanchez <roberto@connexer.com> | 2014-05-12 08:21:30 -0400 |
---|---|---|
committer | Roberto C. Sanchez <roberto@connexer.com> | 2014-05-12 08:21:30 -0400 |
commit | 7a00574163029c0c2b649878c95d5acbd083564a (patch) | |
tree | c13cc5736025834df2874ed87ee8598070025ea6 /src/modules/filters/utf8arabicpoints.cpp | |
parent | b745315323de9f27538edac9453205ca70e6186e (diff) |
Imported Upstream version 1.7.2+dfsg
Diffstat (limited to 'src/modules/filters/utf8arabicpoints.cpp')
-rw-r--r-- | src/modules/filters/utf8arabicpoints.cpp | 174 |
1 files changed, 151 insertions, 23 deletions
diff --git a/src/modules/filters/utf8arabicpoints.cpp b/src/modules/filters/utf8arabicpoints.cpp index bd3169b..42bfaa5 100644 --- a/src/modules/filters/utf8arabicpoints.cpp +++ b/src/modules/filters/utf8arabicpoints.cpp @@ -1,10 +1,11 @@ /****************************************************************************** * - * UTF8ArabicPoints - SWFilter descendant to remove UTF-8 Arabic vowel points + * utf8arabicpoints.cpp - SWFilter descendant to remove UTF-8 + * Arabic vowel points * - * $Id: utf8arabicpoints.h 1688 2008-11-30 04:42:26Z refdoc $ + * $Id: utf8arabicpoints.cpp 2980 2013-09-14 21:51:47Z scribe $ * - * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * Copyright 2009-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -25,36 +26,163 @@ #include <stdio.h> #include <utf8arabicpoints.h> + SWORD_NAMESPACE_START -const char oName[] = "Arabic Vowel Points"; -const char oTip[] = "Toggles Arabic Vowel Points"; +namespace { + + static const char oName[] = "Arabic Vowel Points"; + static const char oTip[] = "Toggles Arabic Vowel Points"; + + static const StringList *oValues() { + static const SWBuf choices[3] = {"On", "Off", ""}; + static const StringList oVals(&choices[0], &choices[2]); + return &oVals; + } + + + static char *nextMark(const char* from, int* mark_size) { + // Arabic vowel points currently targeted for elimination: + // Table entries excerpted from + // http://www.utf8-chartable.de/unicode-utf8-table.pl. + // Code UTF-8 Description + // point + // ----- --------- ----------- + // U+064B d9 8b ARABIC FATHATAN + // U+064C d9 8c ARABIC DAMMATAN + // U+064D d9 8d ARABIC KASRATAN + // U+064E d9 8e ARABIC FATHA + // U+064F d9 8f ARABIC DAMMA + // U+0650 d9 90 ARABIC KASRA + // U+0651 d9 91 ARABIC SHADDA + // U+0652 d9 92 ARABIC SUKUN + // U+0653 d9 93 ARABIC MADDAH ABOVE + // U+0654 d9 94 ARABIC HAMZA ABOVE + // U+0655 d9 95 ARABIC HAMZA BELOW + // + // U+FC5E ef b1 9e ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM + // U+FC5F ef b1 9f ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM + // U+FC60 ef b1 a0 ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM + // U+FC61 ef b1 a1 ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM + // U+FC62 ef b1 a2 ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM + // U+FC63 ef b1 a3 ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM + // + // U+FE70 ef b9 b0 ARABIC FATHATAN ISOLATED FORM + // U+FE71 ef b9 b1 ARABIC TATWEEL WITH FATHATAN ABOVE + // U+FE72 ef b9 b2 ARABIC DAMMATAN ISOLATED FORM + // U+FE73 ef b9 b3 ARABIC TAIL FRAGMENT + // U+FE74 ef b9 b4 ARABIC KASRATAN ISOLATED FORM + // U+FE75 ef b9 b5 ??? + // U+FE76 ef b9 b6 ARABIC FATHA ISOLATED FORM + // U+FE77 ef b9 b7 ARABIC FATHA MEDIAL FORM + // U+FE78 ef b9 b8 ARABIC DAMMA ISOLATED FORM + // U+FE79 ef b9 b9 ARABIC DAMMA MEDIAL FORM + // U+FE7A ef b9 ba ARABIC KASRA ISOLATED FORM + // U+FE7B ef b9 bb ARABIC KASRA MEDIAL FORM + // U+FE7C ef b9 bc ARABIC SHADDA ISOLATED FORM + // U+FE7D ef b9 bd ARABIC SHADDA MEDIAL FORM + // U+FE7E ef b9 be ARABIC SUKUN ISOLATED FORM + // U+FE7F ef b9 bf ARABIC SUKUN MEDIAL FORM + + unsigned char* byte = (unsigned char*) from; + for (; *byte; ++byte) { + if (byte[0] == 0xD9) { + if (byte[1] >= 0x8B && byte[1] <= 0x95) { + *mark_size = 2; + break; + } + continue; + } + if (byte[0] == 0xEF) { + if (byte[1] == 0xB1) { + if (byte[2] >= 0x9E && byte[2] <= 0xA3) { + *mark_size = 3; + break; + } + continue; + } + if (byte[1] == 0xB9) { + if (byte[2] >= 0xB0 && byte[2] <= 0xBF) { + *mark_size = 3; + break; + } + continue; + } + } + } + return (char*)byte; + } +} -const SWBuf choices[3] = {"On", "Off", ""}; -const StringList oValues(&choices[0], &choices[2]); -UTF8ArabicPoints::UTF8ArabicPoints() : SWOptionFilter(oName, oTip, &oValues) { - setOptionValue("On"); +UTF8ArabicPoints::UTF8ArabicPoints() : SWOptionFilter(oName, oTip, oValues()) { } + UTF8ArabicPoints::~UTF8ArabicPoints(){}; -char UTF8ArabicPoints::processText(SWBuf &text, const SWKey *key, const SWModule *module) { - if (!option) { - //The UTF-8 range 0xFC 0xE5 to 0xFC 0x63 consist of Arabic vowel marks so block those out. - // Also ranges 0xFE70 til OxFE7F and 0x064b-0x0655 - SWBuf orig = text; - const unsigned char* from = (unsigned char*)orig.c_str(); - for (text = ""; *from; from++) { - if (((*from == 0xFC) && (*(from + 1) >= 0xE5 && *(from + 1) <= 0x63)) || ((*from == 0xFE) && (*(from + 1) >= 0x70 && *(from + 1) <= 0x7F)) || ((*from == 0x06) && (*(from + 1) >= 0x4B && *(from + 1) <= 0x55))){ - from++; - } - else { - text += *from; - } + +char UTF8ArabicPoints::processText(SWBuf &text, const SWKey *, const SWModule *) { + // A non-zero/true option setting means that setOptionValue("On") + // was called which apparently means that Arabic Vowel Marks are ENABLED, + // so the filter's actions are DISABLED. + if (option) + return 0; + + // Eliminate Arabic vowel marks from the text. + // The recognized marks are determined by the "nextMark" function. + + // If nextMark were polymorphic (a virtual function or a function + // pointer), this function could be generically used in any filter that + // only removed (vs. replaced) areas of text based on the arbitrary + // match criteria encapsulated in the specific nextMark + // implementation. + int mark_size = 0; + char* mark_pos = nextMark(text.c_str(), &mark_size); + + // Here and at the end of the loop, + // test BOTH mark_pos AND *mark_pos for safety and to give nextMark + // the option of returning either NULL or a pointer to the null + // terminator when done. + if (!mark_pos || !*mark_pos) + return 0; // no marks found. + + // Purposely granting write access into SWBuf internal buffer via + // "end_of_output" avoids a needless temporary SWBuf copy. + // Everything before the first mark is already in its final position + // and can be safely ignored. So start appending at the current mark. + char* end_of_output = mark_pos; + + // For consistency, input starts at (vs. after) the first mark as well + // -- not a problem since the mark itself gets skipped, anyway. + const char* start_of_input = mark_pos; + do { + // At this point, "mark_pos" and "mark_pos+mark_size" delimit + // the text to drop. + // "start_of_input" is either mark_pos or any text between the + // end of any previous mark and the current mark_pos. + // This text is now ready to be moved into the output. + int ready_size = mark_pos - start_of_input; + if (ready_size > 0) { + // Append the input text before the current mark to the + // output. + // Must use bcopy vs. strncpy because the final + // end_of_output may overtake the original + // start_of_input. + memmove(end_of_output, start_of_input, ready_size); + // Keep appending to end_of_output. + end_of_output += ready_size; } - } + // Ensure the mark never gets copied. + start_of_input = mark_pos + mark_size; + // Find the next mark. + mark_pos = nextMark(start_of_input, &mark_size); + + } while (mark_pos && *mark_pos); // No more marks. + + // Copy any trailing input text AND always the terminating null. + memmove(end_of_output, start_of_input, strlen(start_of_input)+1); return 0; } |