diff options
Diffstat (limited to 'src/mgr/stringmgr.cpp')
-rw-r--r-- | src/mgr/stringmgr.cpp | 125 |
1 files changed, 66 insertions, 59 deletions
diff --git a/src/mgr/stringmgr.cpp b/src/mgr/stringmgr.cpp index c4a994e..0390905 100644 --- a/src/mgr/stringmgr.cpp +++ b/src/mgr/stringmgr.cpp @@ -1,9 +1,10 @@ /****************************************************************************** - * stringmgr.cpp - implementation of class StringMgr * - * $Id: stringmgr.cpp 2115 2007-10-16 18:29:00Z scribe $ + * stringmgr.cpp - implementation of class StringMgr * - * Copyright 1998 CrossWire Bible Society (http://www.crosswire.org) + * $Id: stringmgr.cpp 2980 2013-09-14 21:51:47Z scribe $ + * + * Copyright 2004-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 @@ -38,8 +39,10 @@ #endif + SWORD_NAMESPACE_START + StringMgr *StringMgr::systemStringMgr = 0; class __staticsystemStringMgr { @@ -48,70 +51,74 @@ public: ~__staticsystemStringMgr() { if (StringMgr::systemStringMgr) delete StringMgr::systemStringMgr; StringMgr::systemStringMgr = 0; } } _staticsystemStringMgr; -/** - * Determine whether the string contains a valid unicode sequence. The following table give the pattern of a valid UTF-8 character. - * Unicode Range 1st 2nd 3rd 4th 5th 6th - * U-00000000 - U-0000007F 0nnnnnnn - * U-00000080 - U-000007FF 110nnnnn 10nnnnnn - * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn - * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn - * U-00200000 - U-03FFFFFF 111110nn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn - * U-04000000 - U-7FFFFFFF 1111110n 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn - * Note: - * The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6. - * The number of bits of the leading byte before the first 0 is the total number of bytes - * The "n" are the bits of the unicode codepoint. - * - * This routine does not check to see if the code point is in the range. It could. - * - * @param txt the text to check - * @return 1 if all high order characters form a valid unicode sequence - * -1 if there are no high order characters - * 0 if there are high order characters that do not form a valid unicode sequence - * @author DM Smith [dmsmith555 at yahoo dot com] - */ -int isValidUTF8(unsigned char *txt) { - unsigned int countUTF8 = 0; -#if 0 - unsigned char parts = 0; - - - unsigned char *p = txt; - while (*p) { - // Is the high order bit set? - if (*p & 0x80) { - // then count the number of high order bits that are set - // this determines the number of following bytes need to have high order bits set - unsigned char i = *p; - for (parts = 0; i & 0x80; parts++) { - i <<= 1; - } - - // The pattern 10nnnnnn is not a unicode character - if (parts == 1) { - return 0; - } - else { - while (--parts && ++*p) { - // The pattern of each following character must be: 10nnnnnn - if (0xc0 & *p != 0x80) { - return 0; - } +namespace { + + /** + * Determine whether the string contains a valid unicode sequence. The following table give the pattern of a valid UTF-8 character. + * Unicode Range 1st 2nd 3rd 4th 5th 6th + * U-00000000 - U-0000007F 0nnnnnnn + * U-00000080 - U-000007FF 110nnnnn 10nnnnnn + * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn + * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn + * U-00200000 - U-03FFFFFF 111110nn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn + * U-04000000 - U-7FFFFFFF 1111110n 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn 10nnnnnn + * Note: + * The latest UTF-8 RFC allows for a max of 4 bytes. Earlier allowed 6. + * The number of bits of the leading byte before the first 0 is the total number of bytes + * The "n" are the bits of the unicode codepoint. + * + * This routine does not check to see if the code point is in the range. It could. + * + * @param txt the text to check + * @return 1 if all high order characters form a valid unicode sequence + * -1 if there are no high order characters + * 0 if there are high order characters that do not form a valid unicode sequence + * @author DM Smith [dmsmith555 at yahoo dot com] + */ + int isValidUTF8(unsigned char *txt) { + unsigned int countUTF8 = 0; + #if 0 + unsigned char parts = 0; + + + unsigned char *p = txt; + while (*p) { + // Is the high order bit set? + if (*p & 0x80) { + // then count the number of high order bits that are set + // this determines the number of following bytes need to have high order bits set + unsigned char i = *p; + for (parts = 0; i & 0x80; parts++) { + i <<= 1; } - // Oops, we've run out of bytes too soon: Cannot be UTF-8 - if (parts) { + + // The pattern 10nnnnnn is not a unicode character + if (parts == 1) { return 0; } + else { + while (--parts && ++*p) { + // The pattern of each following character must be: 10nnnnnn + if (0xc0 & *p != 0x80) { + return 0; + } + } + + // Oops, we've run out of bytes too soon: Cannot be UTF-8 + if (parts) { + return 0; + } + } + countUTF8++; } - countUTF8++; } - } - // At this point it is either UTF-8 or ascii -#endif - return countUTF8 ? 1 : -1; + // At this point it is either UTF-8 or ascii + #endif + return countUTF8 ? 1 : -1; + } } |