summaryrefslogtreecommitdiff
path: root/src/mgr/stringmgr.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mgr/stringmgr.cpp')
-rw-r--r--src/mgr/stringmgr.cpp280
1 files changed, 280 insertions, 0 deletions
diff --git a/src/mgr/stringmgr.cpp b/src/mgr/stringmgr.cpp
new file mode 100644
index 0000000..c4a994e
--- /dev/null
+++ b/src/mgr/stringmgr.cpp
@@ -0,0 +1,280 @@
+/******************************************************************************
+ * stringmgr.cpp - implementation of class StringMgr
+ *
+ * $Id: stringmgr.cpp 2115 2007-10-16 18:29:00Z scribe $
+ *
+ * Copyright 1998 CrossWire Bible Society (http://www.crosswire.org)
+ * CrossWire Bible Society
+ * P. O. Box 2528
+ * Tempe, AZ 85280-2528
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <stringmgr.h>
+#include <swlog.h>
+#include <localemgr.h>
+#include <utilstr.h>
+
+#ifdef _ICU_
+
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+
+#include <unicode/unistr.h>
+#include <unicode/translit.h>
+
+#include <unicode/locid.h>
+
+#endif
+
+SWORD_NAMESPACE_START
+
+StringMgr *StringMgr::systemStringMgr = 0;
+
+class __staticsystemStringMgr {
+public:
+ __staticsystemStringMgr() { }
+ ~__staticsystemStringMgr() { if (StringMgr::systemStringMgr) delete StringMgr::systemStringMgr; StringMgr::systemStringMgr = 0; }
+} _staticsystemStringMgr;
+
+/**
+ * Determine whether the string contains a valid unicode sequence. The following table give the pattern of a valid UTF-8 character.
+ * Unicode Range 1st 2nd 3rd 4th 5th 6th
+ * U-00000000 - U-0000007F 0nnnnnnn
+ * U-00000080 - U-000007FF 110nnnnn 10nnnnnn
+ * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn
+ * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * U-00200000 - U-03FFFFFF 111110nn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * U-04000000 - U-7FFFFFFF 1111110n 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * Note:
+ * The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
+ * The number of bits of the leading byte before the first 0 is the total number of bytes
+ * The "n" are the bits of the unicode codepoint.
+ *
+ * This routine does not check to see if the code point is in the range. It could.
+ *
+ * @param txt the text to check
+ * @return 1 if all high order characters form a valid unicode sequence
+ * -1 if there are no high order characters
+ * 0 if there are high order characters that do not form a valid unicode sequence
+ * @author DM Smith [dmsmith555 at yahoo dot com]
+ */
+int isValidUTF8(unsigned char *txt) {
+ unsigned int countUTF8 = 0;
+#if 0
+ unsigned char parts = 0;
+
+
+ unsigned char *p = txt;
+ while (*p) {
+ // Is the high order bit set?
+ if (*p & 0x80) {
+ // then count the number of high order bits that are set
+ // this determines the number of following bytes need to have high order bits set
+ unsigned char i = *p;
+ for (parts = 0; i & 0x80; parts++) {
+ i <<= 1;
+ }
+
+
+ // The pattern 10nnnnnn is not a unicode character
+ if (parts == 1) {
+ return 0;
+ }
+ else {
+ while (--parts && ++*p) {
+ // The pattern of each following character must be: 10nnnnnn
+ if (0xc0 & *p != 0x80) {
+ return 0;
+ }
+ }
+
+ // Oops, we've run out of bytes too soon: Cannot be UTF-8
+ if (parts) {
+ return 0;
+ }
+ }
+ countUTF8++;
+ }
+ }
+
+ // At this point it is either UTF-8 or ascii
+#endif
+ return countUTF8 ? 1 : -1;
+}
+
+
+#ifdef _ICU_
+
+//here comes our ICUStringMgr reimplementation
+class ICUStringMgr : public StringMgr {
+public:
+ virtual char *upperUTF8(char *, unsigned int maxlen = 0) const;
+
+protected:
+ virtual bool supportsUnicode() const { return true; };
+};
+
+#endif
+
+
+/** Default constructor
+*/
+StringMgr::StringMgr() {
+}
+
+/** Copy constructor
+*/
+StringMgr::StringMgr(const StringMgr &m) {
+}
+
+/** Destructor
+*/
+StringMgr::~StringMgr() {
+}
+
+/** Sets the global StringMgr handle
+* @param newStringMgr The new global StringMgr. This pointer will be deleted by this StringMgr
+*/
+void StringMgr::setSystemStringMgr(StringMgr *newStringMgr) {
+ if (systemStringMgr)
+ delete systemStringMgr;
+
+ systemStringMgr = newStringMgr;
+
+ // TODO: this is magic. apparently we have to reset the system localemgr upon changing stringmgr.
+ // setting system stringmgr should be set before localemgr and not possible to change.
+ // rework this design.
+ LocaleMgr::getSystemLocaleMgr()->setSystemLocaleMgr(new LocaleMgr());
+}
+
+/** Returns the global StringMgr handle
+* @return The global string handle
+*/
+StringMgr* StringMgr::getSystemStringMgr() {
+ if (!systemStringMgr) {
+#ifdef _ICU_
+ systemStringMgr = new ICUStringMgr();
+// SWLog::getSystemLog()->logInformation("created default ICUStringMgr");
+#else
+ systemStringMgr = new StringMgr();
+// SWLog::getSystemLog()->logInformation("created default StringMgr");
+#endif
+ }
+
+ return systemStringMgr;
+}
+
+
+/**
+ * This is a fallback method. It should never be called.
+ * If UTF8 support is desired, then a UTF8 StringMgr needs
+ * to be used.
+ *
+ * Here we just do our best.
+ *
+ * Converts the param to an upper case UTF8 string
+ * @param t - The text encoded in utf8 which should be turned into an upper case string
+ *
+ */
+char *StringMgr::upperUTF8(char *t, unsigned int maxlen) const {
+ // try to decide if it's worth trying to toupper. Do we have more
+ // characters which are probably lower latin than not?
+ // we still don't use isValidUTF8 optimally. what if we have 1 unicode
+ // character in the string? should we not try to upper any of the string?
+ // dunno. Best solution is to upper all other characters. Don't have
+ // time to write that before release.
+ long performOp = 0;
+ if (!isValidUTF8((unsigned char *)t)) {
+ performOp = 1;
+ }
+ else {
+ for (const char *ch = t; *ch; ch++) {
+ performOp += (*ch > 0) ? 1 : -1;
+ }
+ }
+
+ if (performOp > 0) {
+ return upperLatin1(t);
+ }
+
+ return t;
+}
+
+
+/**
+ * Converts the param to an uppercase latin1 string
+ * @param The text encoded in latin1 which should be turned into an upper case string
+ */
+char *StringMgr::upperLatin1(char *buf, unsigned int maxlen) const {
+ if (!buf)
+ return 0;
+
+ char *ret = buf;
+ bool checkMax = maxlen;
+
+ while (*buf && (!checkMax || maxlen--)) {
+ *buf = SW_toupper(*buf);
+ buf++;
+ }
+
+ return ret;
+}
+
+bool StringMgr::supportsUnicode() const {
+ return false; //default impl has no UTF8 support
+}
+
+
+#ifdef _ICU_
+
+char *ICUStringMgr::upperUTF8(char *buf, unsigned int maxlen) const {
+ char *ret = buf;
+ int max = (maxlen) ? maxlen : strlen(buf);
+
+ UErrorCode err = U_ZERO_ERROR;
+
+ if (!buf || !max) {
+ return ret;
+ }
+
+ UChar *lowerStr = new UChar[max+10];
+ UChar *upperStr = new UChar[max+10];
+
+ u_strFromUTF8(lowerStr, max+9, 0, buf, -1, &err);
+ if (err != U_ZERO_ERROR) {
+// SWLog::getSystemLog()->logError("from: %s", u_errorName(err));
+ delete [] lowerStr;
+ delete [] upperStr;
+ return ret;
+ }
+
+ u_strToUpper(upperStr, max+9, lowerStr, -1, 0, &err);
+ if (err != U_ZERO_ERROR) {
+// SWLog::getSystemLog()->logError("upperCase: %s", u_errorName(err));
+ delete [] lowerStr;
+ delete [] upperStr;
+ return ret;
+ }
+
+ ret = u_strToUTF8(ret, max, 0, upperStr, -1, &err);
+
+ delete [] lowerStr;
+ delete [] upperStr;
+ return ret;
+}
+
+#endif
+
+SWORD_NAMESPACE_END