diff options
Diffstat (limited to 'utilities/osis2mod.cpp')
-rw-r--r-- | utilities/osis2mod.cpp | 548 |
1 files changed, 465 insertions, 83 deletions
diff --git a/utilities/osis2mod.cpp b/utilities/osis2mod.cpp index b8514b1..7ffe4ff 100644 --- a/utilities/osis2mod.cpp +++ b/utilities/osis2mod.cpp @@ -1,13 +1,13 @@ /****************************************************************************** * - * osis2mod.cpp - Utility to import a module in OSIS format + * osis2mod.cpp - Utility to import a module in OSIS format * - * $Id: osis2mod.cpp 3177 2014-04-17 04:24:37Z greg.hellings $ + * $Id: osis2mod.cpp 3431 2016-08-16 22:46:19Z refdoc $ * * Copyright 2003-2014 CrossWire Bible Society (http://www.crosswire.org) - * CrossWire Bible Society - * P. O. Box 2528 - * Tempe, AZ 85280-2528 + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -44,14 +44,27 @@ #include <versekey.h> #include <ztext.h> +#include <ztext4.h> #include <lzsscomprs.h> +#ifndef EXCLUDEZLIB #include <zipcomprs.h> +#endif +#ifndef EXCLUDEBZIP2 +#include <bz2comprs.h> +#endif +#ifndef EXCLUDEXZ +#include <xzcomprs.h> +#endif #include <cipherfil.h> #ifdef _ICU_ #include <utf8nfc.h> #include <latin1utf8.h> +#include <utf8scsu.h> +#include <scsuutf8.h> #endif +#include <utf8utf16.h> +#include <utf16utf8.h> #ifndef NO_SWORD_NAMESPACE using namespace sword; @@ -82,6 +95,9 @@ const int EXIT_BAD_NESTING = 5; // BSP or BCV nesting is bad UTF8NFC normalizer; Latin1UTF8 converter; #endif +SWFilter* outputEncoder = NULL; +SWFilter* outputDecoder = NULL; + int normalized = 0; int converted = 0; @@ -113,7 +129,8 @@ bool isOSISAbbrev(const char *buf) { * U-00000000 - U-0000007F 0nnnnnnn * U-00000080 - U-000007FF 110nnnnn 10nnnnnn * U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn - * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn + * U-00010000 - U-0010FFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn + * * Note: * 1. The latest UTF-8 RFC allows for a max of 4 bytes. * Earlier allowed 6. @@ -351,7 +368,7 @@ void prepareSWVerseKey(SWBuf &buf) { * Determine whether a verse as given is valid for the versification. * This is done by comparing the before and after of normalization. */ -bool isValidRef(const char *buf) { +bool isValidRef(const char *buf, const char *caller) { // Create a VerseKey that does not do auto normalization // Note: need to turn on headings so that a heading does not get normalized anyway // And set it to the reference under question @@ -382,7 +399,7 @@ bool isValidRef(const char *buf) { // If we have gotten here the reference is not in the selected versification. // cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl; if (debug & DEBUG_REV11N) { - cout << "DEBUG(V11N): " << before << " normalizes to " << after << endl; + cout << "DEBUG(V11N)[" << caller << "]: " << before << " normalizes to " << after << endl; } return false; @@ -465,7 +482,7 @@ void makeValidRef(VerseKey &key) { void writeEntry(SWBuf &text, bool force = false) { char keyOsisID[255]; - static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 3177 $\"/>"; + static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 3431 $\"/>"; static bool firstOT = true; static bool firstNT = true; @@ -494,7 +511,7 @@ void writeEntry(SWBuf &text, bool force = false) { // If we have seen a verse and the supplied one is different then we output the collected one. if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) { - if (!isValidRef(lastKey)) { + if (!isValidRef(lastKey, "writeEntry")) { makeValidRef(lastKey); } @@ -525,6 +542,11 @@ void writeEntry(SWBuf &text, bool force = false) { } } + // If the desired output encoding is non-UTF-8, convert to that encoding + if (outputEncoder) { + outputEncoder->processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks + } + // If the entry already exists, then append this entry to the text. // This is for verses that are outside the chosen versification. They are appended to the prior verse. // The space should not be needed if we retained verse tags. @@ -532,7 +554,16 @@ void writeEntry(SWBuf &text, bool force = false) { module->flush(); SWBuf currentText = module->getRawEntry(); cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl; + + // If we have a non-UTF-8 encoding, we should decode it before concatenating, then re-encode it + if (outputDecoder) { + outputDecoder->processText(activeVerseText, (SWKey *)2); + outputDecoder->processText(currentText, (SWKey *)2); + } activeVerseText = currentText + " " + activeVerseText; + if (outputEncoder) { + outputEncoder->processText(activeVerseText, (SWKey *)2); + } } if (debug & DEBUG_WRITE) { @@ -563,7 +594,7 @@ void writeEntry(SWBuf &text, bool force = false) { void linkToEntry(VerseKey &linkKey, VerseKey &dest) { // Only link verses that are in the versification. - if (!isValidRef(linkKey)) { + if (!isValidRef(linkKey, "linkToEntry")) { return; } @@ -581,7 +612,7 @@ void linkToEntry(VerseKey &linkKey, VerseKey &dest) { } // Return true if the content was handled or is to be ignored. -// false if the what has been seen is to be accumulated and considered later. +// false if the what has been seen is to be accumulated and considered later. bool handleToken(SWBuf &text, XMLTag token) { // Everything between the begin book tag and the first begin chapter tag is inBookIntro @@ -671,7 +702,7 @@ bool handleToken(SWBuf &text, XMLTag token) { // BOOK START, <div type="book" ...> if (tokenName == "div" && typeAttr == "book") { - if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case + if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case if (debug & DEBUG_TITLE) { cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS INTRO " << endl; @@ -802,10 +833,13 @@ bool handleToken(SWBuf &text, XMLTag token) { // At that point we will output links. // This can be done by incrementing, which will produce an error // if there is only one verse. - verseKeys.setPosition(TOP); - verseKeys.increment(1); - if (!verseKeys.popError()) { - linkedVerses.push_back(verseKeys); + if (memberKeyCount > 1) { + verseKeys.setPosition(TOP); + verseKeys.increment(1); + if (!verseKeys.popError()) { + cout << "DEBUG(LINK): " << currentVerse.getOSISRef() << endl; + linkedVerses.push_back(verseKeys); + } } } else { @@ -852,6 +886,39 @@ bool handleToken(SWBuf &text, XMLTag token) { // Now consider everything else. +/* + // "majorSection" is code for the Book 1-5 of Psalms // This is incorrect assumption - majorSection can appear in any large book and can start and end inside chapters + if (tokenName == "div" && typeAttr == "majorSection") { + if (inBookIntro) { + if (debug & DEBUG_TITLE) { + cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl; + } + writeEntry(text); + } + + if (debug & DEBUG_OTHER) { + cout << "DEBUG(FOUND): majorSection found " << currentVerse.getOSISRef() << endl; + } + + strcpy(currentOsisID, currentVerse.getOSISRef()); + +// as a result of the incorrect assumption these flags are set also incorrectly and cause problems in situations where majorSections do not follow the assumptions made during creation of this patch + + inChapter = false; + inVerse = false; + inPreVerse = false; + inBookIntro = false; + inChapterIntro = true; + + if (debug & DEBUG_TITLE) { + cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl; + } + + verseDepth = 0; + + return false; + } +*/ // Handle WOC quotes. // Note this requires transformBSP to make them into milestones // Otherwise have to do it here @@ -897,8 +964,8 @@ bool handleToken(SWBuf &text, XMLTag token) { if (inChapterIntro) { // Determine when we are no longer in a chapter heading, but in pre-verse material: // If we see one of the following: - // a section div - // a title that is not main, chapter or sub or unclassified (no type attribute) + // a section div + // a title that is not main, chapter or sub or unclassified (no type attribute) if ((tokenName == "div" && typeAttr == "section") || (tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub") ) { @@ -961,7 +1028,7 @@ bool handleToken(SWBuf &text, XMLTag token) { if (tokenName != topToken.getName()) { cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl; -// exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here. +// exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a bug somewhere that's killing the converter here. // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway. // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting. } @@ -1149,6 +1216,7 @@ XMLTag transformBSP(XMLTag t) { static std::stack<XMLTag> bspTagStack; static int sID = 1; char buf[11]; + SWBuf typeAttr = t.getAttribute("type"); // Support simplification transformations if (t.isEmpty()) { @@ -1173,12 +1241,13 @@ XMLTag transformBSP(XMLTag t) { // The following containers are milestoneable. // abbr, closer, div, foreign, l, lg, salute, signed, speech // Leaving out: - // abbr When would this ever cross a boundary? - // seg as it is used for a divineName hack - // foreign so that it can be easily italicized + // abbr When would this ever cross a boundary? + // seg as it is used for a divineName hack + // foreign so that it can be easily italicized + // div type="colophon" so that it can be treated as a block else if (tagName == "chapter" || tagName == "closer" || - tagName == "div" || + (tagName == "div" && typeAttr != "colophon") || tagName == "l" || tagName == "lg" || tagName == "q" || @@ -1208,11 +1277,13 @@ XMLTag transformBSP(XMLTag t) { } bspTagStack.pop(); + SWBuf topTypeAttr = topToken.getAttribute("type"); // Look for the milestoneable container tags handled above. + // Have to treat div type="colophon" differently if (tagName == "chapter" || tagName == "closer" || - tagName == "div" || + (tagName == "div" && topTypeAttr != "colophon") || tagName == "l" || tagName == "lg" || tagName == "p" || @@ -1272,8 +1343,8 @@ void writeLinks() while (!verseKeys.popError()) { linkKey = verseKeys.getElement(); - verseKeys.increment(1); linkToEntry(linkKey, destKey); + verseKeys.increment(1); } } } @@ -1288,15 +1359,18 @@ void usage(const char *app, const char *error = 0, const bool verboseHelp = fals fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to\n"); fprintf(stderr, "\t\t\t\t read from standard input\n"); fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n"); - fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n"); - fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n"); - fprintf(stderr, " -b <2|3|4>\t\t compression block size (default 4):\n"); + fprintf(stderr, " -z <l|z|b|x>\t\t compression type (default: none)\n"); + fprintf(stderr, "\t\t\t\t l - LZSS; z - ZIP; b - bzip2; x - xz\n"); + fprintf(stderr, " -b <2|3|4>\t\t compression block size (default: 4)\n"); fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n"); + fprintf(stderr, " -l <1-9>\t\t compression level (default varies by compression type)\n"); fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n"); fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); -#ifdef _ICU_ - fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n"); +#ifdef _ICU_ + fprintf(stderr, " -e <1|2|s>\t\t convert Unicode encoding (default: 1)\n"); + fprintf(stderr, "\t\t\t\t 1 - UTF-8 ; 2 - UTF-16 ; s - SCSU\n"); + fprintf(stderr, " -N\t\t\t do not normalize to NFC\n"); if (verboseHelp) { fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n"); fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n"); @@ -1353,7 +1427,7 @@ void usage(const char *app, const char *error = 0, const bool verboseHelp = fals void processOSIS(istream& infile) { typedef enum { - CS_NOT_IN_COMMENT, // or seen starting "<" + CS_NOT_IN_COMMENT, // or seen starting "<" CS_SEEN_STARTING_EXCLAMATION, CS_SEEN_STARTING_HYPHEN, CS_IN_COMMENT, @@ -1362,13 +1436,21 @@ void processOSIS(istream& infile) { CS_SEEN_ENDING_GREATER_THAN } t_commentstate; + typedef enum { + ET_NUM, + ET_HEX, + ET_CHAR, + ET_NONE, + ET_ERR + } t_entitytype; + activeOsisID[0] = '\0'; strcpy(currentOsisID,"N/A"); currentVerse.setVersificationSystem(v11n); currentVerse.setAutoNormalize(false); - currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings + currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings currentVerse.setPersist(true); module->setKey(currentVerse); @@ -1382,6 +1464,13 @@ void processOSIS(istream& infile) { bool inWhitespace = false; bool seeingSpace = false; unsigned char curChar = '\0'; + SWBuf entityToken; + bool inentity = false; + t_entitytype entitytype = ET_NONE; + unsigned char attrQuoteChar = '\0'; + bool inattribute = false; + unsigned int linePos = 1; + unsigned int charPos = 0; while (infile.good()) { @@ -1398,16 +1487,221 @@ void processOSIS(istream& infile) { // Does a SWORD module actually require this? if (curChar == '\n') { curChar = ' '; + charPos = 0; + linePos++; + } + charPos++; + + // Look for entities: + // These are of the form &#dddd;, &xHHHH; or &llll; + // where dddd is a sequence of digits + // HHHH is a sequence of [A-Fa-f0-9] + // llll is amp, lt, gt, quot or apos + // but we will look for a sequence of [A-Za-z0-9] + // All but &, <, >, ", ' will produce a WARNING + // In the future: + // &#dddd; and &xHHHH; should be converted to UTF-8, + // with a WARNING if the text is not UTF-8 + // &llll; other than the xml standard 5 should produce a WARNING + + // For entity diagnostics track whether the text is an attribute value + if (inattribute && (curChar == '\'' || curChar == '"')) { + if (attrQuoteChar == curChar) { + inattribute = false; + attrQuoteChar = '\0'; + } + else { + attrQuoteChar = curChar; + } + } + if (intoken && curChar == '=') { + inattribute = true; + attrQuoteChar = '\0'; + } + + if (!inentity && curChar == '&') { + inentity = true; + entitytype = ET_NONE; + entityToken = "&"; + continue; + } + + if (inentity) { + if (curChar == ';') { + inentity = false; + } + else { + switch (entitytype) { + case ET_NONE: + // A hex entity cannot start with X in XML, but it can in HTML + // Allow for it here and complain later + if (curChar == 'x' || curChar == 'X') { + entitytype = ET_HEX; + } + else + if (curChar == '#') { + entitytype = ET_NUM; + } + else + if ((curChar >= 'A' && curChar <= 'Z') || + (curChar >= 'a' && curChar <= 'z') || + (curChar >= '0' && curChar <= '9')) { + entitytype = ET_CHAR; + } + else { + inentity = false; + entitytype = ET_ERR; + } + break; + + case ET_NUM : + if (!(curChar >= '0' && curChar <= '9')) { + inentity = false; + entitytype = ET_ERR; + } + break; + case ET_HEX : + if ((curChar >= 'G' && curChar <= 'Z') || + (curChar >= 'g' && curChar <= 'z')) { + // Starts out as a HEX entity, but it isn't one + entitytype = ET_CHAR; + } + else + if (!((curChar >= 'A' && curChar <= 'F') || + (curChar >= 'a' && curChar <= 'f') || + (curChar >= '0' && curChar <= '9'))) { + inentity = false; + entitytype = ET_ERR; + } + break; + case ET_CHAR : + if (!((curChar >= 'A' && curChar <= 'Z') || + (curChar >= 'a' && curChar <= 'z') || + (curChar >= '0' && curChar <= '9'))) { + inentity = false; + entitytype = ET_ERR; + } + break; + default: + cout << "FATAL(ENTITY): unknown entitytype on entity end: " << entitytype << endl; + exit(EXIT_BAD_NESTING); + } + } + + if (entitytype != ET_ERR) { + entityToken.append((char) curChar); + } + + // It is an entity, perhaps invalid, if curChar is ';', error otherwise + // Test to see if we now have an entity or a failure + // It may not be a valid entity. + if (!inentity) { + switch (entitytype) { + case ET_ERR : + // Remove the leading & + entityToken << 1; + cout << "WARNING(PARSE): malformed entity, replacing &" << entityToken << " with &" << entityToken << endl; + if (intoken) { + token.append("&"); + token.append(entityToken); + } + else { + text.append("&"); + text.append(entityToken); + } + break; + case ET_HEX : + if (entityToken[1] != 'x') { + cout << "WARNING(PARSE): HEX entity must begin with &x, found " << entityToken << endl; + } + else { + cout << "WARNING(PARSE): SWORD does not search HEX entities, found " << entityToken << endl; + } + break; + case ET_CHAR : + if (strcmp(entityToken, "&") && + strcmp(entityToken, "<") && + strcmp(entityToken, ">") && + strcmp(entityToken, """) && + strcmp(entityToken, "'")) { + cout << "WARNING(PARSE): XML only supports 5 Character entities &, <, >, " and ', found " << entityToken << endl; + } + else + if (!strcmp(entityToken, "'")) { + cout << "WARNING(PARSE): While valid for XML, XHTML does not support '." << endl; + if (!inattribute) { + cout << "WARNING(PARSE): ' is unnecessary outside of attribute values. Replacing with '. " << endl; + entityToken = "'"; + } + else { + switch (attrQuoteChar) { + case '"' : + cout << "WARNING(PARSE): ' is unnecessary inside double quoted attribute values. Replacing with '. " << endl; + entityToken = "'"; + break; + case '\'' : + cout << "WARNING(PARSE): ' is only needed within single quoted attribute values. Considering using double quoted attribute and replacing with '." << endl; + break; + } + } + } + else + if (!strcmp(entityToken, """)) { + cout << "WARNING(PARSE): While valid for XML, " is only needed within double quoted attribute values" << endl; + if (!inattribute) { + cout << "WARNING(PARSE): " is unnecessary outside of attribute values. Replace with \"." << endl; + entityToken = "\""; + } + else { + switch (attrQuoteChar) { + case '"' : + cout << "WARNING(PARSE): " is only needed within double quoted attribute values. Considering using single quoted attribute and replacing with \"." << endl; + break; + case '\'' : + cout << "WARNING(PARSE): " is unnecessary inside single quoted attribute values. Replace with \"." << endl; + entityToken = "\""; + break; + } + } + } + break; + case ET_NUM : + cout << "WARNING(PARSE): SWORD does not search numeric entities, found " << entityToken << endl; + break; + case ET_NONE : + default: + break; + } + + // Put the entity into the stream. + if (intoken) { + token.append(entityToken); + } + else { + text.append(entityToken); + } + + if (curChar == ';') { + // The character was handled, so go get the next one. + continue; + } + } + else { + // The character was handled, so go get the next one. + continue; + } } + if (!intoken && curChar == '<') { intoken = true; token = "<"; + inattribute = false; + attrQuoteChar = '\0'; continue; } // Handle XML comments starting with "<!--", ending with "-->" - if (intoken && !incomment) { switch (commentstate) { case CS_NOT_IN_COMMENT : @@ -1532,8 +1826,8 @@ void processOSIS(istream& infile) { } else { switch (curChar) { - case '>' : text.append(">"); break; - case '<' : text.append("<"); break; + case '>' : cout << "WARNING(PARSE): > should be >" << endl; text.append(">"); break; + case '<' : cout << "WARNING(PARSE): < should be <" << endl; text.append("<"); break; default : text.append((char) curChar); break; } } @@ -1552,7 +1846,7 @@ void processOSIS(istream& infile) { int main(int argc, char **argv) { - fprintf(stderr, "You are running osis2mod: $Rev: 3177 $\n"); + fprintf(stderr, "You are running osis2mod: $Rev: 3431 $\n"); if (argc > 1) { for (int i = 1; i < argc; i++) { @@ -1578,19 +1872,25 @@ int main(int argc, char **argv) { int entrySize = 0; SWBuf cipherKey = ""; SWCompress *compressor = 0; + int compLevel = 0; for (int i = 3; i < argc; i++) { if (!strcmp(argv[i], "-a")) { append = 1; } else if (!strcmp(argv[i], "-z")) { - if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); - if (entrySize) usage(*argv, "Cannot specify both -z and -s"); compType = "ZIP"; + if (i+1 < argc && argv[i+1][0] != '-') { + switch (argv[++i][0]) { + case 'l': compType = "LZSS"; break; + case 'z': compType = "ZIP"; break; + case 'b': compType = "BZIP2"; break; + case 'x': compType = "XZ"; break; + } + } } else if (!strcmp(argv[i], "-Z")) { if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); - if (entrySize) usage(*argv, "Cannot specify both -Z and -s"); compType = "LZSS"; } else if (!strcmp(argv[i], "-b")) { @@ -1603,6 +1903,30 @@ int main(int argc, char **argv) { else if (!strcmp(argv[i], "-N")) { normalize = false; } + else if (!strcmp(argv[i], "-e")) { + if (i+1 < argc) { + switch (argv[++i][0]) { + case '1': // leave as UTF-8 + outputEncoder = NULL; + outputDecoder = NULL; + break; + + case '2': + outputEncoder = new UTF8UTF16(); + outputDecoder = new UTF16UTF8(); + break; +#ifdef _ICU_ + case 's': + outputEncoder = new UTF8SCSU(); + outputDecoder = new SCSUUTF8(); + break; +#endif + default: + outputEncoder = NULL; + outputDecoder = NULL; + } + } + } else if (!strcmp(argv[i], "-c")) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires <cipher_key>"); @@ -1612,7 +1936,6 @@ int main(int argc, char **argv) { else usage(*argv, "-v requires <v11n>"); } else if (!strcmp(argv[i], "-s")) { - if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z"); if (i+1 < argc) { entrySize = atoi(argv[++i]); if (entrySize == 2 || entrySize == 4) { @@ -1628,20 +1951,48 @@ int main(int argc, char **argv) { if (i+1 < argc) debug |= atoi(argv[++i]); else usage(*argv, "-d requires <flags>"); } + else if (!strcmp(argv[i], "-l")) { + if (i+1 < argc) { + compLevel = atoi(argv[++i]); + } + else usage(*argv, "-l requires a value from 1-9"); + + if (compLevel < 0 || compLevel > 10) { + usage(*argv, "-l requires a value from 1-9"); + } + } else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } - if (isCommentary) isCommentary = true; // avoid unused warning for now + if (isCommentary) isCommentary = true; // avoid unused warning for now - if (compType == "ZIP") { + if (compType == "LZSS") { + compressor = new LZSSCompress(); + } + else if (compType == "ZIP") { #ifndef EXCLUDEZLIB compressor = new ZipCompress(); #else - usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library"); + usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libz is available when compiling SWORD library"); #endif } - else if (compType == "LZSS") { - compressor = new LZSSCompress(); + else if (compType == "BZIP2") { +#ifndef EXCLUDEBZIP2 + compressor = new Bzip2Compress(); +#else + usage(*argv, "ERROR: SWORD library not compiled with bzip2 compression support.\n\tBe sure libbz2 is available when compiling SWORD library"); +#endif + } + else if (compType == "XZ") { +#ifndef EXCLUDEXZ + compressor = new XzCompress(); +#else + usage(*argv, "ERROR: SWORD library not compiled with xz compression support.\n\tBe sure liblzma is available when compiling SWORD library"); +#endif + } + + if (compressor && compLevel > 0) { + compressor->setLevel(compLevel); } #ifndef _ICU_ @@ -1652,16 +2003,24 @@ int main(int argc, char **argv) { #endif if (debug & DEBUG_OTHER) { - cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl; + cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcompressLevel: " << compLevel << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl; } - if (!append) { // == 0 then create module + if (!append) { // == 0 then create module // Try to initialize a default set of datafiles and indicies at our // datapath location passed to us from the user. if (compressor) { - if (zText::createModule(path, iType, v11n)) { - fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); - exit(EXIT_NO_CREATE); + if (entrySize == 4) { + if (zText4::createModule(path, iType, v11n)) { + fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); + exit(EXIT_NO_CREATE); + } + } + else { + if (zText::createModule(path, iType, v11n)) { + fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); + exit(EXIT_NO_CREATE); + } } } else if (entrySize == 4) { @@ -1680,50 +2039,69 @@ int main(int argc, char **argv) { // Do some initialization stuff if (compressor) { - // Create a compressed text module allowing very large entries - // Taking defaults except for first, fourth, fifth and last argument - module = new zText( - path, // ipath - 0, // iname - 0, // idesc - iType, // iblockType - compressor, // icomp - 0, // idisp - ENC_UNKNOWN, // enc - DIRECTION_LTR, // dir - FMT_UNKNOWN, // markup - 0, // lang - v11n // versification + if (entrySize == 4) { + // Create a compressed text module allowing very large entries + // Taking defaults except for first, fourth, fifth and last argument + module = new zText4( + path, // ipath + 0, // iname + 0, // idesc + iType, // iblockType + compressor, // icomp + 0, // idisp + ENC_UNKNOWN, // enc + DIRECTION_LTR, // dir + FMT_UNKNOWN, // markup + 0, // lang + v11n // versification ); + } + else { + // Create a compressed text module allowing reasonable sized entries + // Taking defaults except for first, fourth, fifth and last argument + module = new zText( + path, // ipath + 0, // iname + 0, // idesc + iType, // iblockType + compressor, // icomp + 0, // idisp + ENC_UNKNOWN, // enc + DIRECTION_LTR, // dir + FMT_UNKNOWN, // markup + 0, // lang + v11n // versification + ); + } } else if (entrySize == 4) { // Create a raw text module allowing very large entries // Taking defaults except for first and last argument module = new RawText4( - path, // ipath - 0, // iname - 0, // idesc - 0, // idisp - ENC_UNKNOWN, // encoding - DIRECTION_LTR, // dir - FMT_UNKNOWN, // markup - 0, // ilang - v11n // versification + path, // ipath + 0, // iname + 0, // idesc + 0, // idisp + ENC_UNKNOWN, // encoding + DIRECTION_LTR, // dir + FMT_UNKNOWN, // markup + 0, // ilang + v11n // versification ); } else { // Create a raw text module allowing reasonable sized entries // Taking defaults except for first and last argument module = new RawText( - path, // ipath - 0, // iname - 0, // idesc - 0, // idisp - ENC_UNKNOWN, // encoding - DIRECTION_LTR, // dir - FMT_UNKNOWN, // markup - 0, // ilang - v11n // versification + path, // ipath + 0, // iname + 0, // idesc + 0, // idisp + ENC_UNKNOWN, // encoding + DIRECTION_LTR, // dir + FMT_UNKNOWN, // markup + 0, // ilang + v11n // versification ); } @@ -1759,6 +2137,10 @@ int main(int argc, char **argv) { delete module; if (cipherFilter) delete cipherFilter; + if (outputEncoder) + delete outputEncoder; + if (outputDecoder) + delete outputDecoder; fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program); exit(0); // success |