summary | refs | log | tree | commit | diff
path: root/src/mgr/stringmgr.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mgr/stringmgr.cpp')
-rw-r--r-- src/mgr/stringmgr.cpp 125
1 files changed, 66 insertions, 59 deletions
diff --git a/src/mgr/stringmgr.cpp b/src/mgr/stringmgr.cpp
index c4a994e..0390905 100644
--- a/src/mgr/stringmgr.cpp
+++ b/src/mgr/stringmgr.cpp
@@ -1,9 +1,10 @@
/******************************************************************************
- * stringmgr.cpp - implementation of class StringMgr
*
- * $Id: stringmgr.cpp 2115 2007-10-16 18:29:00Z scribe $
+ * stringmgr.cpp - implementation of class StringMgr
*
- * Copyright 1998 CrossWire Bible Society (http://www.crosswire.org)
+ * $Id: stringmgr.cpp 2980 2013-09-14 21:51:47Z scribe $
+ *
+ * Copyright 2004-2013 CrossWire Bible Society (http://www.crosswire.org)
* CrossWire Bible Society
* P. O. Box 2528
* Tempe, AZ 85280-2528
@@ -38,8 +39,10 @@
#endif
+
SWORD_NAMESPACE_START
+
StringMgr *StringMgr::systemStringMgr = 0;
class __staticsystemStringMgr {
@@ -48,70 +51,74 @@ public:
~__staticsystemStringMgr() { if (StringMgr::systemStringMgr) delete StringMgr::systemStringMgr; StringMgr::systemStringMgr = 0; }
} _staticsystemStringMgr;
-/**
- * Determine whether the string contains a valid unicode sequence. The following table give the pattern of a valid UTF-8 character.
- * Unicode Range 1st 2nd 3rd 4th 5th 6th
- * U-00000000 - U-0000007F 0nnnnnnn
- * U-00000080 - U-000007FF 110nnnnn 10nnnnnn
- * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn
- * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
- * U-00200000 - U-03FFFFFF 111110nn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn
- * U-04000000 - U-7FFFFFFF 1111110n 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn
- * Note:
- * The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
- * The number of bits of the leading byte before the first 0 is the total number of bytes
- * The "n" are the bits of the unicode codepoint.
- *
- * This routine does not check to see if the code point is in the range. It could.
- *
- * @param txt the text to check
- * @return 1 if all high order characters form a valid unicode sequence
- * -1 if there are no high order characters
- * 0 if there are high order characters that do not form a valid unicode sequence
- * @author DM Smith [dmsmith555 at yahoo dot com]
- */
-int isValidUTF8(unsigned char *txt) {
- unsigned int countUTF8 = 0;
-#if 0
- unsigned char parts = 0;
-
-
- unsigned char *p = txt;
- while (*p) {
- // Is the high order bit set?
- if (*p & 0x80) {
- // then count the number of high order bits that are set
- // this determines the number of following bytes need to have high order bits set
- unsigned char i = *p;
- for (parts = 0; i & 0x80; parts++) {
- i <<= 1;
- }
-
- // The pattern 10nnnnnn is not a unicode character
- if (parts == 1) {
- return 0;
- }
- else {
- while (--parts && ++*p) {
- // The pattern of each following character must be: 10nnnnnn
- if (0xc0 & *p != 0x80) {
- return 0;
- }
+namespace {
+
+ /**
+ * Determine whether the string contains a valid unicode sequence. The following table gives the pattern of a valid UTF-8 character.
+ * Unicode Range 1st 2nd 3rd 4th 5th 6th
+ * U-00000000 - U-0000007F 0nnnnnnn
+ * U-00000080 - U-000007FF 110nnnnn 10nnnnnn
+ * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn
+ * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * U-00200000 - U-03FFFFFF 111110nn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * U-04000000 - U-7FFFFFFF 1111110n 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * Note:
+ * The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6.
+ * The number of bits of the leading byte before the first 0 is the total number of bytes
+ * The "n" are the bits of the unicode codepoint.
+ *
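+ * This routine does not check to see if the code point is in the range. It could. Note: the validation loop below is currently compiled out by "#if 0", so as written the routine always returns -1.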
+ *
+ * @param txt the text to check
+ * @return 1 if all high order characters form a valid unicode sequence
+ * -1 if there are no high order characters
+ * 0 if there are high order characters that do not form a valid unicode sequence
+ * @author DM Smith [dmsmith555 at yahoo dot com]
+ */
+ int isValidUTF8(unsigned char *txt) {
+ unsigned int countUTF8 = 0;
+ #if 0
+ unsigned char parts = 0;
+
+
+ unsigned char *p = txt;
+ while (*p) {
+ // Is the high order bit set?
+ if (*p & 0x80) {
+ // then count the number of high order bits that are set
+ // this determines the number of following bytes that need to have high order bits set
+ unsigned char i = *p;
+ for (parts = 0; i & 0x80; parts++) {
+ i <<= 1;
}
- // Oops, we've run out of bytes too soon: Cannot be UTF-8
- if (parts) {
+
+ // The pattern 10nnnnnn is not a unicode character
+ if (parts == 1) {
return 0;
}
+ else {
+ while (--parts && ++*p) {
+ // The pattern of each following character must be: 10nnnnnn
+ if (0xc0 & *p != 0x80) {
+ return 0;
+ }
+ }
+
+ // Oops, we've run out of bytes too soon: Cannot be UTF-8
+ if (parts) {
+ return 0;
+ }
+ }
+ countUTF8++;
}
- countUTF8++;
}
- }
- // At this point it is either UTF-8 or ascii
-#endif
- return countUTF8 ? 1 : -1;
+ // At this point it is either UTF-8 or ascii
+ #endif
+ return countUTF8 ? 1 : -1;
+ }
}