summaryrefslogtreecommitdiff
path: root/utilities/tei2mod.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'utilities/tei2mod.cpp')
-rw-r--r--utilities/tei2mod.cpp531
1 files changed, 531 insertions, 0 deletions
diff --git a/utilities/tei2mod.cpp b/utilities/tei2mod.cpp
new file mode 100644
index 0000000..a5ae6f4
--- /dev/null
+++ b/utilities/tei2mod.cpp
@@ -0,0 +1,531 @@
+/**
+ * This program handles xml files of the form:
+ * <TEI.2>
+ * <text>
+ * <body>
+ * <entry key="xxx">...</entry>
+ * <entryFree key="yyy">...</entryFree>
+ * <superentry key="zzz">...</superentry>
+ * </body>
+ * </text>
+ * </TEI.2>
+ * The document is assumed to be well-formed and valid.
+ * Three kinds of entries are allowed,
+ * <entry> - a very restricted form of a dictionary entry.
+ * <entryFree> - a very unrestricted form of a dictionary entry.
+ * <superentry> - an entry which can have other entries.
+ * The value of the key attribute is used as the key for the entry in the module.
+ * Note, for a <superentry> only its key becomes a SWORD key.
+ * Keys of entries internal to it are not used.
+ *
+ * The entries must be sorted according to an ASCII collation of their bytes.
+ * This should be the same for Latin-1 and for UTF-8
+ *
+ * Sword will allow for any tags, but only a few have any styling.
+ *
+ * author DM Smith
+ */
+#include <string>
+#include <vector>
+#include <fstream>
+#include <iostream>
+#include <swbuf.h>
+#include <utilxml.h>
+#include <rawld.h>
+#include <rawld4.h>
+#include <zld.h>
+#include <zipcomprs.h>
+#include <lzsscomprs.h>
+#include <stdio.h>
+#include <cipherfil.h>
+
+#ifdef _ICU_
+#include <utf8nfc.h>
+#include <latin1utf8.h>
+#endif
+
+#ifndef NO_SWORD_NAMESPACE
+using namespace sword;
+#endif
+
+using namespace std;
+
+#ifdef _ICU_
+UTF8NFC normalizer;
+int normalized = 0;
+
+Latin1UTF8 converter;
+int converted = 0;
+#endif
+
+//#define DEBUG
+
+SWLD *module = NULL;
+SWKey *currentKey = NULL;
+bool normalize = true;
+
/**
 * Determine whether the string contains a valid unicode sequence.
 * The following table gives the pattern of a valid UTF-8 character.
 * Unicode Range               1st       2nd       3rd       4th
 * U-00000000 - U-0000007F  0nnnnnnn
 * U-00000080 - U-000007FF  110nnnnn  10nnnnnn
 * U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
 * U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
 * Note:
 *   1. The latest UTF-8 RFC allows for a max of 4 bytes.
 *      Earlier allowed 6.
 *   2. The number of bits of the leading byte before the first 0
 *      is the total number of bytes.
 *   3. The "n" are the bits of the unicode codepoint.
 * Only the byte structure is validated. The decoded code point is not
 * range checked, so overlong forms and surrogates are accepted.
 *
 * param txt the NUL terminated text to check; may be NULL
 * return  1 if all high order characters form a valid unicode sequence
 *        -1 if there are no high order characters (also for NULL or
 *           empty input).
 *           Note: this is also a valid unicode sequence
 *         0 if there are high order characters that do not form
 *           a valid unicode sequence
 * author DM Smith
 */
int detectUTF8(const char *txt) {
	// Nothing to scan: report "no high order characters" instead of
	// dereferencing a null pointer.
	if (!txt) return -1;

	unsigned int countUTF8 = 0;

	// Work on unsigned bytes to make masking and shifting easier
	const unsigned char *p = (const unsigned char *) txt;
	while (*p) {
		// Is the high order bit set?
		if (*p & 0x80) {
			// The number of leading 1 bits is the total length in bytes
			// of the character that starts here.
			int count = 0;
			for (unsigned char bits = *p; bits & 0x80; bits <<= 1) {
				count++;
			}

			// Count 1: 10nnnnnn is a continuation byte, not a start byte.
			// Count 5 to 8: 111110nn, 1111110n, 11111110 and 11111111
			// are not legal starts, either.
			if (count < 2 || count > 4) return 0;

			// Expect (count - 1) following bytes of the pattern 10nnnnnn.
			// A terminating NUL fails the mask test as well, so a truncated
			// character is rejected without reading past the end.
			for (int remaining = count - 1; remaining > 0; --remaining) {
				++p;
				if ((0xc0 & *p) != 0x80) return 0;
			}

			// We have a valid UTF-8 character, so count it
			countUTF8++;
		}

		// Advance to the next character to examine.
		p++;
	}

	// At this point it is either UTF-8 or 7-bit ascii
	return countUTF8 ? 1 : -1;
}
+
+void normalizeInput(SWKey &key, SWBuf &text) {
+#ifdef _ICU_
+ int utf8State = detectUTF8(text.c_str());
+ if (normalize) {
+ // Don't need to normalize text that is ASCII
+ // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
+ if (!utf8State) {
+ cout << "Warning: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
+ converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ converted++;
+
+ // Prepare for double check. This probably can be removed.
+ // But for now we are running the check again.
+ // This is to determine whether we need to normalize output of the conversion.
+ utf8State = detectUTF8(text.c_str());
+ }
+
+ // Double check. This probably can be removed.
+ if (!utf8State) {
+ cout << "Error: " << key << ": Converting to UTF-8 (" << text << ")" << endl;
+ }
+
+ if (utf8State > 0) {
+ SWBuf before = text;
+ normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ if (before != text) {
+ normalized++;
+ }
+ }
+ }
+#endif
+}
+
+void writeEntry(SWKey &key, SWBuf &text) {
+#ifdef DEBUG
+ cout << key << endl;
+#endif
+
+ module->setKey(key);
+
+ normalizeInput(key, text);
+
+ module->setEntry(text);
+}
+
+void linkToEntry(SWBuf &keyBuf, vector<string> &linkBuf) {
+
+/*
+ char links = linkBuf.size();
+ for (int i = 0; i < links; i++) {
+ SWKey tmpkey = linkBuf[i].c_str();
+ module->linkEntry(&tmpkey);
+ cout << "Linking: " << linkBuf[i] << endl;
+ }
+*/
+}
+
+// Return true if the content was handled or is to be ignored.
+// false if the what has been seen is to be accumulated and considered later.
+bool handleToken(SWBuf &text, XMLTag *token) {
+ // The start token for the current entry;
+ static XMLTag startTag;
+ static SWBuf keyBuf;
+
+ // Flags to indicate whether we are in a entry, entryFree or superentry
+ static bool inEntry = false;
+ static bool inEntryFree = false;
+ static bool inSuperEntry = false;
+
+ const char *tokenName = token->getName();
+//-- START TAG -------------------------------------------------------------------------
+ if (!token->isEndTag()) {
+
+ // If we are not in an "entry" and we see one, then enter it.
+ if (!inEntry && !inEntryFree && !inSuperEntry) {
+ inEntry = !strcmp(tokenName, "entry");
+ inEntryFree = !strcmp(tokenName, "entryFree");
+ inSuperEntry = !strcmp(tokenName, "superentry");
+ if (inEntry || inEntryFree || inSuperEntry) {
+#ifdef DEBUG
+ cout << "Entering " << tokenName << endl;
+#endif
+ startTag = *token;
+ text = "";
+ *currentKey = token->getAttribute("key");
+
+ return false; // make tag be part of the output
+ }
+ }
+ }
+
+//-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
+ else {
+
+ // ENTRY end
+ // If we see the end of an entry that we are in, then leave it
+ if ((inEntry && !strcmp(tokenName, "entry" )) ||
+ (inEntryFree && !strcmp(tokenName, "entryFree" )) ||
+ (inSuperEntry && !strcmp(tokenName, "superentry"))) {
+#ifdef DEBUG
+ cout << "Leaving " << tokenName << endl;
+#endif
+ // Only one is false coming into here,
+ // but all must be on leaving.
+ inEntry = false;
+ inEntryFree = false;
+ inSuperEntry = false;
+ text += token->toString();
+ writeEntry(*currentKey, text);
+
+ // Since we consumed the text, clear it
+ // and tell the caller that the tag was consumed.
+ text = "";
+ return true;
+ }
+ }
+ return false;
+}
+
/**
 * Print the command line help to stderr and terminate the program.
 * This function does not return; it always calls exit(-1).
 *
 * param app   the program name shown in the help text
 * param error optional error message printed before the help text
 */
void usage(const char *app, const char *error = 0) {

	if (error) fprintf(stderr, "\n%s: %s\n", app, error);

	fprintf(stderr, "TEI Lexicon/Dictionary/Daily Devotional/Glossary module creation tool for the SWORD Project\n");
	fprintf(stderr, "\nusage: %s <output/path> <teiDoc> [OPTIONS]\n", app);
	fprintf(stderr, "  -z\t\t\t use ZIP compression (default no compression)\n");
	fprintf(stderr, "  -Z\t\t\t use LZSS compression (default no compression)\n");
	fprintf(stderr, "  -s <2|4>\t\t max text size per entry(default 4):\n");
	fprintf(stderr, "  -c <cipher_key>\t encipher module using supplied key\n");
	fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
	fprintf(stderr, "  -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n");
	// This line was missing its closing parenthesis and newline, which made
	// it run into the note that follows.
	fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC)\n");
	fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n");
	fprintf(stderr, "-z, -Z, and -s are mutually exclusive\n");
	exit(-1);
}
+
+int main(int argc, char **argv) {
+
+ SWBuf program = argv[0];
+ fprintf(stderr, "You are running %s: $Rev: 2138 $\n", argv[0]);
+
+ // Let's test our command line arguments
+ if (argc < 3) {
+ usage(*argv);
+ }
+
+ // variables for arguments, holding defaults
+ SWBuf path = argv[1];
+ SWBuf teiDoc = argv[2];
+ SWBuf compType = "";
+ SWBuf modDrv = "";
+ SWBuf recommendedPath = "./modules/lexdict/";
+ SWBuf cipherKey = "";
+ SWCompress *compressor = 0;
+
+ for (int i = 3; i < argc; i++) {
+ if (!strcmp(argv[i], "-z")) {
+ if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+ if (modDrv.size()) usage(*argv, "Cannot specify both -z and -s");
+ compType = "ZIP";
+ modDrv = "zLD";
+ recommendedPath += "zld/";
+ }
+ else if (!strcmp(argv[i], "-Z")) {
+ if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+ if (modDrv.size()) usage(*argv, "Cannot specify both -Z and -s");
+ compType = "LZSS";
+ recommendedPath += "zld/";
+ }
+ else if (!strcmp(argv[i], "-s")) {
+ if (compType.size()) usage(*argv, "Cannot specify both -s and -z or -Z");
+ if (i+1 < argc) {
+ int size = atoi(argv[++i]);
+ if (size == 2) {
+ modDrv = "RawLD";
+ recommendedPath += "rawld/";
+ continue;
+ }
+ if (size == 4) {
+ modDrv = "RawLD4";
+ recommendedPath += "rawld4/";
+ continue;
+ }
+ }
+ usage(*argv, "-s requires one of <2|4>");
+ }
+ else if (!strcmp(argv[i], "-N")) {
+ normalize = false;
+ }
+ else if (!strcmp(argv[i], "-c")) {
+ if (i+1 < argc) cipherKey = argv[++i];
+ else usage(*argv, "-c requires <cipher_key>");
+ }
+ else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
+ }
+ if (!modDrv.size()) {
+ modDrv = "RawLD4";
+ recommendedPath += "rawld4/";
+ }
+
+#ifndef _ICU_
+ if (normalize) {
+ normalize = false;
+ cout << program << " is not compiled with support for ICU. Setting -N flag." << endl;
+ }
+#endif
+
+ if (compType == "ZIP") {
+ compressor = new ZipCompress();
+ }
+ else if (compType = "LZSS") {
+ compressor = new LZSSCompress();
+ }
+
+#ifdef DEBUG
+ // cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n";
+ cout << "path: " << path << " teiDoc: " << teiDoc << " compressType: " << compType << " ldType: " << modDrv << " normalize: " << normalize << "\n";
+ cout << "";
+// exit(-3);
+#endif
+
+ SWBuf modName = path;
+ int pathlen = path.length();
+ char lastChar = path[pathlen - 1];
+ if (lastChar != '/' && lastChar != '\\') {
+ modName += "/";
+ }
+ modName += "dict";
+
+ SWBuf keyBuf;
+ SWBuf entBuf;
+ SWBuf lineBuf;
+ vector<string> linkBuf;
+
+ if (modDrv == "zLD") {
+ if (zLD::createModule(modName)) {
+ fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
+ exit(-3);
+ }
+ module = new zLD(modName, 0, 0, 30, compressor);
+ }
+ else if (modDrv == "RawLD") {
+ if (RawLD::createModule(modName)) {
+ fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
+ exit(-3);
+ }
+ module = new RawLD(modName);
+ }
+ else {
+ if (RawLD4::createModule(modName)) {
+ fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program.c_str(), modName.c_str());
+ exit(-3);
+ }
+ module = new RawLD4(modName);
+ }
+
+ SWFilter *cipherFilter = 0;
+
+ if (cipherKey.size()) {
+ fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
+ cipherFilter = new CipherFilter(cipherKey.c_str());
+ module->AddRawFilter(cipherFilter);
+ }
+
+ if (!module->isWritable()) {
+ fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
+ exit(-1);
+ }
+
+ // Let's see if we can open our input file
+ ifstream infile(teiDoc);
+ if (infile.fail()) {
+ fprintf(stderr, "error: %s: couldn't open input file: %s \n", program.c_str(), teiDoc.c_str());
+ exit(-2);
+ }
+
+ currentKey = module->CreateKey();
+ currentKey->Persist(1);
+ module->setKey(*currentKey);
+
+ (*module) = TOP;
+
+ SWBuf token;
+ SWBuf text;
+ bool intoken = false;
+ char curChar = '\0';
+
+ while (infile.good()) {
+
+ curChar = infile.get();
+
+ // skip the character if it is bad. infile.good() will catch the problem
+ if (curChar == -1) {
+ continue;
+ }
+
+ if (!intoken && curChar == '<') {
+ intoken = true;
+ token = "<";
+ continue;
+ }
+
+ if (intoken && curChar == '>') {
+ intoken = false;
+ token.append('>');
+
+ XMLTag *t = new XMLTag(token.c_str());
+ if (!handleToken(text, t)) {
+ text.append(*t);
+ }
+ continue;
+ }
+
+ if (intoken)
+ token.append(curChar);
+ else
+ switch (curChar) {
+ case '>' : text.append("&gt;"); break;
+ case '<' : text.append("&lt;"); break;
+ default : text.append(curChar); break;
+ }
+ }
+
+ // Force the last entry from the text buffer.
+ //text = "";
+ //writeEntry(*currentKey, text);
+
+ delete module;
+ delete currentKey;
+ if (cipherFilter)
+ delete cipherFilter;
+ infile.close();
+
+#ifdef _ICU_
+ if (converted) fprintf(stderr, "tei2mod converted %d verses to UTF-8\n", converted);
+ if (normalized) fprintf(stderr, "tei2mod normalized %d verses to NFC\n", normalized);
+#endif
+
+ /*
+ * Suggested module name detection.
+ * Only used for suggesting a conf.
+ *
+ * Various forms of path.
+ * . and .. - no module name given, use "dict".
+ * Or one of the following where z is the module name
+ * and x may be . or ..
+ * z
+ * x/y/z
+ * x/y/z/
+ * x/y/z/z
+ */
+ SWBuf suggestedModuleName = path;
+ if (lastChar == '/' || lastChar == '\\') {
+ suggestedModuleName.setSize(--pathlen);
+ }
+
+ lastChar = suggestedModuleName[pathlen - 1];
+ if (lastChar == '.') {
+ suggestedModuleName = "???";
+ }
+ else {
+ /* At this point the suggestion is either
+ * what follows the last / or \
+ * or the entire string
+ */
+ const char *m = strrchr(suggestedModuleName.c_str(), '/');
+ if (!m) {
+ m = strrchr(suggestedModuleName.c_str(), '\\');
+ }
+ if (m) {
+ suggestedModuleName = m+1;
+ }
+ }
+
+ recommendedPath += suggestedModuleName;
+ recommendedPath += "/dict";
+
+ fprintf(stderr, "\nSuggested conf (replace ??? with appropriate values)\n\n");
+ fprintf(stderr, "[%s]\n", suggestedModuleName.c_str());
+ fprintf(stderr, "DataPath=%s\n", recommendedPath.c_str());
+ fprintf(stderr, "Description=???\n");
+ fprintf(stderr, "SourceType=TEI\n");
+ fprintf(stderr, "Encoding=%s\n", (normalize ? "UTF-8" : "???"));
+ fprintf(stderr, "ModDrv=%s\n", modDrv.c_str());
+ if (compressor) {
+ fprintf(stderr, "CompressType=%s\n", compType.c_str());
+ }
+ if (cipherKey.size()) {
+ fprintf(stderr, "CipherKey=%s\n", cipherKey.c_str());
+ }
+}