From 03134fa5f6f25d92724ce4c183f9bbe12a9e37dc Mon Sep 17 00:00:00 2001 From: "Roberto C. Sanchez" Date: Sat, 29 Mar 2014 10:53:59 -0400 Subject: Imported Upstream version 1.5.11 --- src/modules/filters/utf8utf16.cpp | 78 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/modules/filters/utf8utf16.cpp (limited to 'src/modules/filters/utf8utf16.cpp') diff --git a/src/modules/filters/utf8utf16.cpp b/src/modules/filters/utf8utf16.cpp new file mode 100644 index 0000000..5c1614c --- /dev/null +++ b/src/modules/filters/utf8utf16.cpp @@ -0,0 +1,78 @@ +/****************************************************************************** + * + * UTF8UTF16 - SWFilter descendant to convert UTF-8 to UTF-16 + * + */ + +#include +#include + +#include +#include + +SWORD_NAMESPACE_START + +UTF8UTF16::UTF8UTF16() { +} + +char UTF8UTF16::processText(SWBuf &text, const SWKey *key, const SWModule *module) { + const unsigned char *from; + unsigned long ch; + signed short utf16; + unsigned char from2[7]; + + SWBuf orig = text; + + from = (const unsigned char *)orig.c_str(); + + // ------------------------------- + for (text = ""; *from; from++) { + ch = 0; + //case: ANSI + if ((*from & 128) != 128) { + text.setSize(text.size()+2); + *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)*from; + continue; + } + //case: Invalid UTF-8 (illegal continuing byte in initial position) + if ((*from & 128) && ((*from & 64) != 64)) { + continue; + } + //case: 2+ byte codepoint + from2[0] = *from; + from2[0] <<= 1; + int subsequent; + for (subsequent = 1; (from2[0] & 128) && (subsequent < 7); subsequent++) { + from2[0] <<= 1; + from2[subsequent] = from[subsequent]; + from2[subsequent] &= 63; + ch <<= 6; + ch |= from2[subsequent]; + } + subsequent--; + from2[0] <<= 1; + char significantFirstBits = 8 - (2+subsequent); + + ch |= (((short)from2[0]) << (((6*subsequent)+significantFirstBits)-8)); + from += subsequent; + if (ch < 0x10000) { + text.setSize(text.size()+2); + *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)ch; + } + else { + utf16 = (signed short)((ch - 0x10000) / 0x400 + 0xD800); + text.setSize(text.size()+2); + *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)utf16; + utf16 = (signed short)((ch - 0x10000) % 0x400 + 0xDC00); + text.setSize(text.size()+2); + *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)utf16; + } + } + text.setSize(text.size()+2); + *((unsigned short *)(text.getRawData()+(text.size()-2))) = (unsigned short)0; + + return 0; + +} + +SWORD_NAMESPACE_END -- cgit v1.2.3