1 files changed, 280 insertions, 0 deletions
diff --git a/src/mgr/stringmgr.cpp b/src/mgr/stringmgr.cpp
new file mode 100644
index 0000000..c4a994e
--- /dev/null
+++ b/src/mgr/stringmgr.cpp
@@ -0,0 +1,280 @@
+/******************************************************************************
+ *  stringmgr.cpp - implementation of class StringMgr
+ *
+ * $Id: stringmgr.cpp 2115 2007-10-16 18:29:00Z scribe $
+ *
+ * Copyright 1998 CrossWire Bible Society (http://www.crosswire.org)
+ *	CrossWire Bible Society
+ *	P. O. Box 2528
+ *	Tempe, AZ  85280-2528
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <stringmgr.h>
+#include <swlog.h>
+#include <localemgr.h>
+#include <utilstr.h>
+
+#ifdef _ICU_
+
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+
+#include <unicode/unistr.h>
+#include <unicode/translit.h>
+
+#include <unicode/locid.h>
+
+#endif
+
+SWORD_NAMESPACE_START
+
+StringMgr *StringMgr::systemStringMgr = 0;
+
+class __staticsystemStringMgr {
+public:
+	__staticsystemStringMgr() { }
+	~__staticsystemStringMgr() { if (StringMgr::systemStringMgr) delete StringMgr::systemStringMgr; StringMgr::systemStringMgr = 0; }
+} _staticsystemStringMgr;
+
+/**
+ * Determine whether the string contains a valid unicode sequence. The following table give the pattern of a valid UTF-8 character.
+ * Unicode Range            1st       2nd       3rd       4th       5th       6th
+ * U-00000000 - U-0000007F  0nnnnnnn
+ * U-00000080 - U-000007FF  110nnnnn  10nnnnnn
+ * U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
+ * U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-00200000 - U-03FFFFFF  111110nn  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-04000000 - U-7FFFFFFF  1111110n  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * Note:
+ *   The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
+ *   The number of bits of the leading byte before the first 0 is the total number of bytes
+ *   The "n" are the bits of the unicode codepoint.
+ *
+ * This routine does not check to see if the code point is in the range. It could.
+ *
+ * @param txt the text to check
+ * @return  1 if all high order characters form a valid unicode sequence
+ *         -1 if there are no high order characters
+ *          0 if there are high order characters that do not form a valid unicode sequence
+ * @author DM Smith [dmsmith555 at yahoo dot com]
+ */
+int isValidUTF8(unsigned char *txt) {
+	unsigned int  countUTF8 = 0;
+#if 0
+	unsigned char parts     = 0;
+
+
+	unsigned char *p = txt;
+	while (*p) {
+		// Is the high order bit set?
+		if (*p & 0x80) {
+			// then count the number of high order bits that are set
+			// this determines the number of following bytes need to have high order bits set
+			unsigned char i = *p;
+			for (parts = 0; i & 0x80; parts++) {
+				i <<= 1;
+			}
+
+
+			// The pattern 10nnnnnn is not a unicode character
+			if (parts == 1) {
+				return 0;
+			}
+			else {
+				while (--parts && ++*p) {
+					// The pattern of each following character must be: 10nnnnnn
+					if (0xc0 & *p != 0x80) {
+						return  0;
+					}
+				}
+
+				// Oops, we've run out of bytes too soon: Cannot be UTF-8
+				if (parts) {
+					return 0;
+				}
+			}
+			countUTF8++;
+		}
+	}
+
+	// At this point it is either UTF-8 or ascii
+#endif
+	return countUTF8 ? 1 : -1;
+}
+
+
+#ifdef _ICU_
+
+//here comes our ICUStringMgr reimplementation
+class ICUStringMgr : public StringMgr {
+public:
+	virtual char *upperUTF8(char *, unsigned int maxlen = 0) const;
+	
+protected:
+	virtual bool supportsUnicode() const { return true; };
+};
+
+#endif
+
+
+/** Default constructor
+*/		
+StringMgr::StringMgr() {
+}
+
+/** Copy constructor
+*/	
+StringMgr::StringMgr(const StringMgr &m) {
+}
+
+/** Destructor
+*/	
+StringMgr::~StringMgr() {
+}
+
+/** Sets the global StringMgr handle
+* @param newStringMgr The new global StringMgr. This pointer will be deleted by this StringMgr
+*/	
+void StringMgr::setSystemStringMgr(StringMgr *newStringMgr) {
+	if (systemStringMgr) 
+		delete systemStringMgr;
+	
+	systemStringMgr = newStringMgr;
+
+   // TODO: this is magic. apparently we have to reset the system localemgr upon changing stringmgr.
+   // setting system stringmgr should be set before localemgr and not possible to change.
+   // rework this design.
+	LocaleMgr::getSystemLocaleMgr()->setSystemLocaleMgr(new LocaleMgr());
+}
+
+/** Returns the global StringMgr handle
+* @return The global string handle
+*/
+StringMgr* StringMgr::getSystemStringMgr() {
+	if (!systemStringMgr) {
+#ifdef _ICU_
+		systemStringMgr = new ICUStringMgr();
+// 		SWLog::getSystemLog()->logInformation("created default ICUStringMgr");
+#else
+		systemStringMgr = new StringMgr();
+//  		SWLog::getSystemLog()->logInformation("created default StringMgr");
+#endif
+	}
+	
+	return systemStringMgr;
+}
+
+
+/**
+ * This is a fallback method.  It should never be called.
+ * If UTF8 support is desired, then a UTF8 StringMgr needs
+ * to be used.
+ *
+ * Here we just do our best.
+ *
+ * Converts the param to an upper case UTF8 string
+ * @param t - The text encoded in utf8 which should be turned into an upper case string
+ *
+ */	
+char *StringMgr::upperUTF8(char *t, unsigned int maxlen) const {
+	// try to decide if it's worth trying to toupper.  Do we have more
+	// characters which are probably lower latin than not?
+	// we still don't use isValidUTF8 optimally. what if we have 1 unicode
+	// character in the string?  should we not try to upper any of the string?
+	// dunno.  Best solution is to upper all other characters. Don't have
+	// time to write that before release.
+	long performOp = 0;
+	if (!isValidUTF8((unsigned char *)t)) {
+		performOp = 1;
+	}
+	else {
+		for (const char *ch = t; *ch; ch++) {
+			performOp += (*ch > 0) ? 1 : -1;
+		}
+	}
+
+	if (performOp > 0) {
+		return upperLatin1(t);
+	}
+
+	return t;
+}
+
+
+/**
+ * Converts the param to an uppercase latin1 string
+ * @param The text encoded in latin1 which should be turned into an upper case string
+ */	
+char *StringMgr::upperLatin1(char *buf, unsigned int maxlen) const {
+	if (!buf)
+		return 0;
+		
+	char *ret = buf;
+	bool checkMax = maxlen;
+
+	while (*buf && (!checkMax || maxlen--)) {
+		*buf = SW_toupper(*buf);
+		buf++;
+	}
+
+	return ret;
+}
+
+bool StringMgr::supportsUnicode() const {
+	return false; //default impl has no UTF8 support
+}
+
+
+#ifdef _ICU_
+
+char *ICUStringMgr::upperUTF8(char *buf, unsigned int maxlen) const {
+	char *ret = buf;
+	int max = (maxlen) ? maxlen : strlen(buf);
+		
+	UErrorCode err = U_ZERO_ERROR;
+		
+	if (!buf || !max) {
+		return ret;
+	}
+		
+	UChar *lowerStr = new UChar[max+10];
+	UChar *upperStr = new UChar[max+10];
+		
+	u_strFromUTF8(lowerStr, max+9, 0, buf, -1, &err);
+	if (err != U_ZERO_ERROR) {
+//		SWLog::getSystemLog()->logError("from: %s", u_errorName(err));
+		delete [] lowerStr;
+		delete [] upperStr;
+		return ret;
+	}
+
+	u_strToUpper(upperStr, max+9, lowerStr, -1, 0, &err);
+	if (err != U_ZERO_ERROR) {
+//		SWLog::getSystemLog()->logError("upperCase: %s", u_errorName(err));
+		delete [] lowerStr;
+		delete [] upperStr;
+		return ret;
+	}
+
+	ret = u_strToUTF8(ret, max, 0, upperStr, -1, &err);
+		
+	delete [] lowerStr;
+	delete [] upperStr;
+	return ret;
+}
+	
+#endif
+
+SWORD_NAMESPACE_END