diff options
Diffstat (limited to 'utilities/tei2mod.cpp')
-rw-r--r-- | utilities/tei2mod.cpp | 531 |
1 files changed, 531 insertions, 0 deletions
diff --git a/utilities/tei2mod.cpp b/utilities/tei2mod.cpp new file mode 100644 index 0000000..a5ae6f4 --- /dev/null +++ b/utilities/tei2mod.cpp @@ -0,0 +1,531 @@ +/** + * This program handles xml files of the form: + * <TEI.2> + * <text> + * <body> + * <entry key="xxx">...</entry> + * <entryFree key="yyy">...</entryFree> + * <superentry key="zzz">...</superentry> + * </body> + * </text> + * </TEI.2> + * The document is assumed to be well-formed and valid. + * Three kinds of entries are allowed, + * <entry> - a very restricted form of a dictionary entry. + * <entryFree> - a very unrestricted form of a dictionary entry. + * <superentry> - an entry which can have other entries. + * The value of the key attribute is used as the key for the entry in the module. + * Note, for a <superentry> only it's key becomes a SWORD key. + * Keys of entries internal to it are not used. + * + * The entries must be sorted according to an ASCII collation of their bytes. + * This should be the same for Latin-1 and for UTF-8 + * + * Sword will allow for any tags, but only a few have any styling. + * + * author DM Smith + */ +#include <string> +#include <vector> +#include <fstream> +#include <iostream> +#include <swbuf.h> +#include <utilxml.h> +#include <rawld.h> +#include <rawld4.h> +#include <zld.h> +#include <zipcomprs.h> +#include <lzsscomprs.h> +#include <stdio.h> +#include <cipherfil.h> + +#ifdef _ICU_ +#include <utf8nfc.h> +#include <latin1utf8.h> +#endif + +#ifndef NO_SWORD_NAMESPACE +using namespace sword; +#endif + +using namespace std; + +#ifdef _ICU_ +UTF8NFC normalizer; +int normalized = 0; + +Latin1UTF8 converter; +int converted = 0; +#endif + +//#define DEBUG + +SWLD *module = NULL; +SWKey *currentKey = NULL; +bool normalize = true; + +/** + * Determine whether the string contains a valid unicode sequence. + * The following table give the pattern of a valid UTF-8 character. + * Unicode Range 1st 2nd 3rd 4th + * U-00000000 - U-0000007F 0nnnnnnn + * U-00000080 - U-000007FF 110nnnnn 10nnnnnn + * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn + * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn + * Note: + * 1. The latest UTF-8 RFC allows for a max of 4 bytes. + * Earlier allowed 6. + * 2. The number of bits of the leading byte before the first 0 + * is the total number of bytes. + * 3. The "n" are the bits of the unicode codepoint. + * This routine does not check to see if the code point is in the range. + * It could. + * + * param txt the text to check + * return 1 if all high order characters form a valid unicode sequence + * -1 if there are no high order characters. + * Note: this is also a valid unicode sequence + * 0 if there are high order characters that do not form + * a valid unicode sequence + * author DM Smith + */ +int detectUTF8(const char *txt) { + unsigned int countUTF8 = 0; + int count = 0; + + // Cast it to make masking and shifting easier + const unsigned char *p = (const unsigned char*) txt; + while (*p) { + // Is the high order bit set? + if (*p & 0x80) { + // Then count the number of high order bits that are set. + // This determines the number of following bytes + // that are a part of the unicode character + unsigned char i = *p; + for (count = 0; i & 0x80; count++) { + i <<= 1; + } + + // Validate count: + // Count 0: bug in code that would cause core walking + // Count 1: is a pattern of 10nnnnnn, + // which does not signal the start of a unicode character + // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111 + // are not legal starts, either + if (count < 2 || count > 4) return 0; + + // At this point we expect (count - 1) following characters + // of the pattern 10nnnnnn + while (--count && *++p) { + // The pattern of each following character must be: 10nnnnnn + // So, compare the top 2 bits. + if ((0xc0 & *p) != 0x80) return 0; + } + + // Oops, we've run out of bytes too soon: Cannot be UTF-8 + if (count) return 0; + + // We have a valid UTF-8 character, so count it + countUTF8++; + } + + // Advance to the next character to examine. + p++; + } + + // At this point it is either UTF-8 or 7-bit ascii + return countUTF8 ? 1 : -1; +} + +void normalizeInput(SWKey &key, SWBuf &text) { +#ifdef _ICU_ + int utf8State = detectUTF8(text.c_str()); + if (normalize) { + // Don't need to normalize text that is ASCII + // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8 + if (!utf8State) { + cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl; + converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks + converted++; + + // Prepare for double check. This probably can be removed. + // But for now we are running the check again. + // This is to determine whether we need to normalize output of the conversion. + utf8State = detectUTF8(text.c_str()); + } + + // Double check. This probably can be removed. + if (!utf8State) { + cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl; + } + + if (utf8State > 0) { + SWBuf before = text; + normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks + if (before != text) { + normalized++; + } + } + } +#endif +} + +void writeEntry(SWKey &key, SWBuf &text) { +#ifdef DEBUG + cout << key << endl; +#endif + + module->setKey(key); + + normalizeInput(key, text); + + module->setEntry(text); +} + +void linkToEntry(SWBuf &keyBuf, vector<string> &linkBuf) { + +/* + char links = linkBuf.size(); + for (int i = 0; i < links; i++) { + SWKey tmpkey = linkBuf[i].c_str(); + module->linkEntry(&tmpkey); + cout << "Linking: " << linkBuf[i] << endl; + } +*/ +} + +// Return true if the content was handled or is to be ignored. +// false if the what has been seen is to be accumulated and considered later. +bool handleToken(SWBuf &text, XMLTag *token) { + // The start token for the current entry; + static XMLTag startTag; + static SWBuf keyBuf; + + // Flags to indicate whether we are in a entry, entryFree or superentry + static bool inEntry = false; + static bool inEntryFree = false; + static bool inSuperEntry = false; + + const char *tokenName = token->getName(); +//-- START TAG ------------------------------------------------------------------------- + if (!token->isEndTag()) { + + // If we are not in an "entry" and we see one, then enter it. + if (!inEntry && !inEntryFree && !inSuperEntry) { + inEntry = !strcmp(tokenName, "entry"); + inEntryFree = !strcmp(tokenName, "entryFree"); + inSuperEntry = !strcmp(tokenName, "superentry"); + if (inEntry || inEntryFree || inSuperEntry) { +#ifdef DEBUG + cout << "Entering " << tokenName << endl; +#endif + startTag = *token; + text = ""; + *currentKey = token->getAttribute("key"); + + return false; // make tag be part of the output + } + } + } + +//-- EMPTY and END TAG --------------------------------------------------------------------------------------------- + else { + + // ENTRY end + // If we see the end of an entry that we are in, then leave it + if ((inEntry && !strcmp(tokenName, "entry" )) || + (inEntryFree && !strcmp(tokenName, "entryFree" )) || + (inSuperEntry && !strcmp(tokenName, "superentry"))) { +#ifdef DEBUG + cout << "Leaving " << tokenName << endl; +#endif + // Only one is false coming into here, + // but all must be on leaving. + inEntry = false; + inEntryFree = false; + inSuperEntry = false; + text += token->toString(); + writeEntry(*currentKey, text); + + // Since we consumed the text, clear it + // and tell the caller that the tag was consumed. + text = ""; + return true; + } + } + return false; +} + +void usage(const char *app, const char *error = 0) { + + if (error) fprintf(stderr, "\n%s: %s\n", app, error); + + fprintf(stderr, "TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for the SWORD Project\n"); + fprintf(stderr, "\nusage: %s <output/path> <teiDoc> [OPTIONS]\n", app); + fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n"); + fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n"); + fprintf(stderr, " -s <2|4>\t\t max text size per entry(default 4):\n"); + fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n"); + fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); + fprintf(stderr, " -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n"); + fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC"); + fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n"); + fprintf(stderr, "-z, -Z, and -s are mutually exclusive\n"); + exit(-1); +} + +int main(int argc, char **argv) { + + SWBuf program = argv[0]; + fprintf(stderr, "You are running %s: $Rev: 2138 $\n", argv[0]); + + // Let's test our command line arguments + if (argc < 3) { + usage(*argv); + } + + // variables for arguments, holding defaults + SWBuf path = argv[1]; + SWBuf teiDoc = argv[2]; + SWBuf compType = ""; + SWBuf modDrv = ""; + SWBuf recommendedPath = "./modules/lexdict/"; + SWBuf cipherKey = ""; + SWCompress *compressor = 0; + + for (int i = 3; i < argc; i++) { + if (!strcmp(argv[i], "-z")) { + if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); + if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s"); + compType = "ZIP"; + modDrv = "zLD"; + recommendedPath += "zld/"; + } + else if (!strcmp(argv[i], "-Z")) { + if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); + if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s"); + compType = "LZSS"; + recommendedPath += "zld/"; + } + else if (!strcmp(argv[i], "-s")) { + if (compType.size()) usage(*argv, "Cannot specify both -s and -z or -Z"); + if (i+1 < argc) { + int size = atoi(argv[++i]); + if (size == 2) { + modDrv = "RawLD"; + recommendedPath += "rawld/"; + continue; + } + if (size == 4) { + modDrv = "RawLD4"; + recommendedPath += "rawld4/"; + continue; + } + } + usage(*argv, "-s requires one of <2|4>"); + } + else if (!strcmp(argv[i], "-N")) { + normalize = false; + } + else if (!strcmp(argv[i], "-c")) { + if (i+1 < argc) cipherKey = argv[++i]; + else usage(*argv, "-c requires <cipher_key>"); + } + else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); + } + if (!modDrv.size()) { + modDrv = "RawLD4"; + recommendedPath += "rawld4/"; + } + +#ifndef _ICU_ + if (normalize) { + normalize = false; + cout << program << " is not compiled with support for ICU. Setting -N flag." << endl; + } +#endif + + if (compType == "ZIP") { + compressor = new ZipCompress(); + } + else if (compType = "LZSS") { + compressor = new LZSSCompress(); + } + +#ifdef DEBUG + // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n"; + cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n"; + cout << ""; +// exit(-3); +#endif + + SWBuf modName = path; + int pathlen = path.length(); + char lastChar = path[pathlen - 1]; + if (lastChar != '/' && lastChar != '\\') { + modName += "/"; + } + modName += "dict"; + + SWBuf keyBuf; + SWBuf entBuf; + SWBuf lineBuf; + vector<string> linkBuf; + + if (modDrv == "zLD") { + if (zLD::createModule(modName)) { + fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); + exit(-3); + } + module = new zLD(modName, 0, 0, 30, compressor); + } + else if (modDrv == "RawLD") { + if (RawLD::createModule(modName)) { + fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); + exit(-3); + } + module = new RawLD(modName); + } + else { + if (RawLD4::createModule(modName)) { + fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str()); + exit(-3); + } + module = new RawLD4(modName); + } + + SWFilter *cipherFilter = 0; + + if (cipherKey.size()) { + fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); + cipherFilter = new CipherFilter(cipherKey.c_str()); + module->AddRawFilter(cipherFilter); + } + + if (!module->isWritable()) { + fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" ); + exit(-1); + } + + // Let's see if we can open our input file + ifstream infile(teiDoc); + if (infile.fail()) { + fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str()); + exit(-2); + } + + currentKey = module->CreateKey(); + currentKey->Persist(1); + module->setKey(*currentKey); + + (*module) = TOP; + + SWBuf token; + SWBuf text; + bool intoken = false; + char curChar = '\0'; + + while (infile.good()) { + + curChar = infile.get(); + + // skip the character if it is bad. infile.good() will catch the problem + if (curChar == -1) { + continue; + } + + if (!intoken && curChar == '<') { + intoken = true; + token = "<"; + continue; + } + + if (intoken && curChar == '>') { + intoken = false; + token.append('>'); + + XMLTag *t = new XMLTag(token.c_str()); + if (!handleToken(text, t)) { + text.append(*t); + } + continue; + } + + if (intoken) + token.append(curChar); + else + switch (curChar) { + case '>' : text.append(">"); break; + case '<' : text.append("<"); break; + default : text.append(curChar); break; + } + } + + // Force the last entry from the text buffer. + //text = ""; + //writeEntry(*currentKey, text); + + delete module; + delete currentKey; + if (cipherFilter) + delete cipherFilter; + infile.close(); + +#ifdef _ICU_ + if (converted) fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted); + if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized); +#endif + + /* + * Suggested module name detection. + * Only used for suggesting a conf. + * + * Various forms of path. + * . and .. - no module name given, use "dict". + * Or one of the following where z is the module name + * and x may be . or .. + * z + * x/y/z + * x/y/z/ + * x/y/z/z + */ + SWBuf suggestedModuleName = path; + if (lastChar == '/' || lastChar == '\\') { + suggestedModuleName.setSize(--pathlen); + } + + lastChar = suggestedModuleName[pathlen - 1]; + if (lastChar == '.') { + suggestedModuleName = "???"; + } + else { + /* At this point the suggestion is either + * what follows the last / or \ + * or the entire string + */ + const char *m = strrchr(suggestedModuleName.c_str(), '/'); + if (!m) { + m = strrchr(suggestedModuleName.c_str(), '\\'); + } + if (m) { + suggestedModuleName = m+1; + } + } + + recommendedPath += suggestedModuleName; + recommendedPath += "/dict"; + + fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n"); + fprintf(stderr, "[%s]\n", suggestedModuleName.c_str()); + fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str()); + fprintf(stderr, "Description=???\n"); + fprintf(stderr, "SourceType=TEI\n"); + fprintf(stderr, "Encoding=%s\n", (normalize ? "UTF-8" : "???")); + fprintf(stderr, "ModDrv=%s\n", modDrv.c_str()); + if (compressor) { + fprintf(stderr, "CompressType=%s\n", compType.c_str()); + } + if (cipherKey.size()) { + fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str()); + } +} |