summaryrefslogtreecommitdiff
path: root/utilities/osis2mod.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'utilities/osis2mod.cpp')
-rw-r--r--utilities/osis2mod.cpp548
1 files changed, 465 insertions, 83 deletions
diff --git a/utilities/osis2mod.cpp b/utilities/osis2mod.cpp
index b8514b1..7ffe4ff 100644
--- a/utilities/osis2mod.cpp
+++ b/utilities/osis2mod.cpp
@@ -1,13 +1,13 @@
/******************************************************************************
*
- * osis2mod.cpp - Utility to import a module in OSIS format
+ * osis2mod.cpp - Utility to import a module in OSIS format
*
- * $Id: osis2mod.cpp 3177 2014-04-17 04:24:37Z greg.hellings $
+ * $Id: osis2mod.cpp 3431 2016-08-16 22:46:19Z refdoc $
*
* Copyright 2003-2014 CrossWire Bible Society (http://www.crosswire.org)
- * CrossWire Bible Society
- * P. O. Box 2528
- * Tempe, AZ 85280-2528
+ * CrossWire Bible Society
+ * P. O. Box 2528
+ * Tempe, AZ 85280-2528
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
@@ -44,14 +44,27 @@
#include <versekey.h>
#include <ztext.h>
+#include <ztext4.h>
#include <lzsscomprs.h>
+#ifndef EXCLUDEZLIB
#include <zipcomprs.h>
+#endif
+#ifndef EXCLUDEBZIP2
+#include <bz2comprs.h>
+#endif
+#ifndef EXCLUDEXZ
+#include <xzcomprs.h>
+#endif
#include <cipherfil.h>
#ifdef _ICU_
#include <utf8nfc.h>
#include <latin1utf8.h>
+#include <utf8scsu.h>
+#include <scsuutf8.h>
#endif
+#include <utf8utf16.h>
+#include <utf16utf8.h>
#ifndef NO_SWORD_NAMESPACE
using namespace sword;
@@ -82,6 +95,9 @@ const int EXIT_BAD_NESTING = 5; // BSP or BCV nesting is bad
UTF8NFC normalizer;
Latin1UTF8 converter;
#endif
+SWFilter* outputEncoder = NULL;
+SWFilter* outputDecoder = NULL;
+
int normalized = 0;
int converted = 0;
@@ -113,7 +129,8 @@ bool isOSISAbbrev(const char *buf) {
* U-00000000 - U-0000007F 0nnnnnnn
* U-00000080 - U-000007FF 110nnnnn 10nnnnnn
* U-00000800 - U-0000FFFF 1110nnnn 10nnnnnn 10nnnnnn
- * U-00010000 - U-001FFFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
+ * U-00010000 - U-0010FFFF 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
+ *
* Note:
* 1. The latest UTF-8 RFC allows for a max of 4 bytes.
* Earlier allowed 6.
@@ -351,7 +368,7 @@ void prepareSWVerseKey(SWBuf &buf) {
* Determine whether a verse as given is valid for the versification.
* This is done by comparing the before and after of normalization.
*/
-bool isValidRef(const char *buf) {
+bool isValidRef(const char *buf, const char *caller) {
// Create a VerseKey that does not do auto normalization
// Note: need to turn on headings so that a heading does not get normalized anyway
// And set it to the reference under question
@@ -382,7 +399,7 @@ bool isValidRef(const char *buf) {
// If we have gotten here the reference is not in the selected versification.
// cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl;
if (debug & DEBUG_REV11N) {
- cout << "DEBUG(V11N): " << before << " normalizes to " << after << endl;
+ cout << "DEBUG(V11N)[" << caller << "]: " << before << " normalizes to " << after << endl;
}
return false;
@@ -465,7 +482,7 @@ void makeValidRef(VerseKey &key) {
void writeEntry(SWBuf &text, bool force = false) {
char keyOsisID[255];
- static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 3177 $\"/>";
+ static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 3431 $\"/>";
static bool firstOT = true;
static bool firstNT = true;
@@ -494,7 +511,7 @@ void writeEntry(SWBuf &text, bool force = false) {
// If we have seen a verse and the supplied one is different then we output the collected one.
if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) {
- if (!isValidRef(lastKey)) {
+ if (!isValidRef(lastKey, "writeEntry")) {
makeValidRef(lastKey);
}
@@ -525,6 +542,11 @@ void writeEntry(SWBuf &text, bool force = false) {
}
}
+ // If the desired output encoding is non-UTF-8, convert to that encoding
+ if (outputEncoder) {
+ outputEncoder->processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ }
+
// If the entry already exists, then append this entry to the text.
// This is for verses that are outside the chosen versification. They are appended to the prior verse.
// The space should not be needed if we retained verse tags.
@@ -532,7 +554,16 @@ void writeEntry(SWBuf &text, bool force = false) {
module->flush();
SWBuf currentText = module->getRawEntry();
cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
+
+ // If we have a non-UTF-8 encoding, we should decode it before concatenating, then re-encode it
+ if (outputDecoder) {
+ outputDecoder->processText(activeVerseText, (SWKey *)2);
+ outputDecoder->processText(currentText, (SWKey *)2);
+ }
activeVerseText = currentText + " " + activeVerseText;
+ if (outputEncoder) {
+ outputEncoder->processText(activeVerseText, (SWKey *)2);
+ }
}
if (debug & DEBUG_WRITE) {
@@ -563,7 +594,7 @@ void writeEntry(SWBuf &text, bool force = false) {
void linkToEntry(VerseKey &linkKey, VerseKey &dest) {
// Only link verses that are in the versification.
- if (!isValidRef(linkKey)) {
+ if (!isValidRef(linkKey, "linkToEntry")) {
return;
}
@@ -581,7 +612,7 @@ void linkToEntry(VerseKey &linkKey, VerseKey &dest) {
}
// Return true if the content was handled or is to be ignored.
-// false if the what has been seen is to be accumulated and considered later.
+// false if what has been seen is to be accumulated and considered later.
bool handleToken(SWBuf &text, XMLTag token) {
// Everything between the begin book tag and the first begin chapter tag is inBookIntro
@@ -671,7 +702,7 @@ bool handleToken(SWBuf &text, XMLTag token) {
// BOOK START, <div type="book" ...>
if (tokenName == "div" && typeAttr == "book") {
- if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case
+ if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case
if (debug & DEBUG_TITLE) {
cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS INTRO " << endl;
@@ -802,10 +833,13 @@ bool handleToken(SWBuf &text, XMLTag token) {
// At that point we will output links.
// This can be done by incrementing, which will produce an error
// if there is only one verse.
- verseKeys.setPosition(TOP);
- verseKeys.increment(1);
- if (!verseKeys.popError()) {
- linkedVerses.push_back(verseKeys);
+ if (memberKeyCount > 1) {
+ verseKeys.setPosition(TOP);
+ verseKeys.increment(1);
+ if (!verseKeys.popError()) {
+ cout << "DEBUG(LINK): " << currentVerse.getOSISRef() << endl;
+ linkedVerses.push_back(verseKeys);
+ }
}
}
else {
@@ -852,6 +886,39 @@ bool handleToken(SWBuf &text, XMLTag token) {
// Now consider everything else.
+/*
+ // "majorSection" is code for the Book 1-5 of Psalms // This is an incorrect assumption - majorSection can appear in any large book and can start and end inside chapters
+ if (tokenName == "div" && typeAttr == "majorSection") {
+ if (inBookIntro) {
+ if (debug & DEBUG_TITLE) {
+ cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl;
+ }
+ writeEntry(text);
+ }
+
+ if (debug & DEBUG_OTHER) {
+ cout << "DEBUG(FOUND): majorSection found " << currentVerse.getOSISRef() << endl;
+ }
+
+ strcpy(currentOsisID, currentVerse.getOSISRef());
+
+// As a result of the incorrect assumption, these flags are also set incorrectly and cause problems in situations where majorSections do not follow the assumptions made during creation of this patch
+
+ inChapter = false;
+ inVerse = false;
+ inPreVerse = false;
+ inBookIntro = false;
+ inChapterIntro = true;
+
+ if (debug & DEBUG_TITLE) {
+ cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl;
+ }
+
+ verseDepth = 0;
+
+ return false;
+ }
+*/
// Handle WOC quotes.
// Note this requires transformBSP to make them into milestones
// Otherwise have to do it here
@@ -897,8 +964,8 @@ bool handleToken(SWBuf &text, XMLTag token) {
if (inChapterIntro) {
// Determine when we are no longer in a chapter heading, but in pre-verse material:
// If we see one of the following:
- // a section div
- // a title that is not main, chapter or sub or unclassified (no type attribute)
+ // a section div
+ // a title that is not main, chapter or sub or unclassified (no type attribute)
if ((tokenName == "div" && typeAttr == "section") ||
(tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub")
) {
@@ -961,7 +1028,7 @@ bool handleToken(SWBuf &text, XMLTag token) {
if (tokenName != topToken.getName()) {
cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl;
-// exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here.
+// exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a bug somewhere that's killing the converter here.
// So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
// (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting.
}
@@ -1149,6 +1216,7 @@ XMLTag transformBSP(XMLTag t) {
static std::stack<XMLTag> bspTagStack;
static int sID = 1;
char buf[11];
+ SWBuf typeAttr = t.getAttribute("type");
// Support simplification transformations
if (t.isEmpty()) {
@@ -1173,12 +1241,13 @@ XMLTag transformBSP(XMLTag t) {
// The following containers are milestoneable.
// abbr, closer, div, foreign, l, lg, salute, signed, speech
// Leaving out:
- // abbr When would this ever cross a boundary?
- // seg as it is used for a divineName hack
- // foreign so that it can be easily italicized
+ // abbr When would this ever cross a boundary?
+ // seg as it is used for a divineName hack
+ // foreign so that it can be easily italicized
+ // div type="colophon" so that it can be treated as a block
else if (tagName == "chapter" ||
tagName == "closer" ||
- tagName == "div" ||
+ (tagName == "div" && typeAttr != "colophon") ||
tagName == "l" ||
tagName == "lg" ||
tagName == "q" ||
@@ -1208,11 +1277,13 @@ XMLTag transformBSP(XMLTag t) {
}
bspTagStack.pop();
+ SWBuf topTypeAttr = topToken.getAttribute("type");
// Look for the milestoneable container tags handled above.
+ // Have to treat div type="colophon" differently
if (tagName == "chapter" ||
tagName == "closer" ||
- tagName == "div" ||
+ (tagName == "div" && topTypeAttr != "colophon") ||
tagName == "l" ||
tagName == "lg" ||
tagName == "p" ||
@@ -1272,8 +1343,8 @@ void writeLinks()
while (!verseKeys.popError()) {
linkKey = verseKeys.getElement();
- verseKeys.increment(1);
linkToEntry(linkKey, destKey);
+ verseKeys.increment(1);
}
}
}
@@ -1288,15 +1359,18 @@ void usage(const char *app, const char *error = 0, const bool verboseHelp = fals
fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to\n");
fprintf(stderr, "\t\t\t\t read from standard input\n");
fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n");
- fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n");
- fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n");
- fprintf(stderr, " -b <2|3|4>\t\t compression block size (default 4):\n");
+ fprintf(stderr, " -z <l|z|b|x>\t\t compression type (default: none)\n");
+ fprintf(stderr, "\t\t\t\t l - LZSS; z - ZIP; b - bzip2; x - xz\n");
+ fprintf(stderr, " -b <2|3|4>\t\t compression block size (default: 4)\n");
fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
+ fprintf(stderr, " -l <1-9>\t\t compression level (default varies by compression type)\n");
fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n");
fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
-#ifdef _ICU_
- fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n");
+#ifdef _ICU_
+ fprintf(stderr, " -e <1|2|s>\t\t convert Unicode encoding (default: 1)\n");
+ fprintf(stderr, "\t\t\t\t 1 - UTF-8 ; 2 - UTF-16 ; s - SCSU\n");
+ fprintf(stderr, " -N\t\t\t do not normalize to NFC\n");
if (verboseHelp) {
fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n");
@@ -1353,7 +1427,7 @@ void usage(const char *app, const char *error = 0, const bool verboseHelp = fals
void processOSIS(istream& infile) {
typedef enum {
- CS_NOT_IN_COMMENT, // or seen starting "<"
+ CS_NOT_IN_COMMENT, // or seen starting "<"
CS_SEEN_STARTING_EXCLAMATION,
CS_SEEN_STARTING_HYPHEN,
CS_IN_COMMENT,
@@ -1362,13 +1436,21 @@ void processOSIS(istream& infile) {
CS_SEEN_ENDING_GREATER_THAN
} t_commentstate;
+ typedef enum { // Classification of the entity currently being scanned after a '&'
+ ET_NUM, // '&' followed by '#'; remaining characters must be decimal digits
+ ET_HEX, // '&' followed by 'x' or 'X'; stays HEX while characters are hex digits, becomes ET_CHAR on G-Z/g-z
+ ET_CHAR, // '&' followed by letters/digits (named entity such as amp, lt, gt, quot, apos)
+ ET_NONE, // '&' just seen; the next character has not been classified yet
+ ET_ERR // an invalid character ended the scan; the leading '&' is re-emitted as &amp;
+ } t_entitytype;
+
activeOsisID[0] = '\0';
strcpy(currentOsisID,"N/A");
currentVerse.setVersificationSystem(v11n);
currentVerse.setAutoNormalize(false);
- currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings
+ currentVerse.setIntros(true); // turn on mod/testmnt/book/chap headings
currentVerse.setPersist(true);
module->setKey(currentVerse);
@@ -1382,6 +1464,13 @@ void processOSIS(istream& infile) {
bool inWhitespace = false;
bool seeingSpace = false;
unsigned char curChar = '\0';
+ SWBuf entityToken;
+ bool inentity = false;
+ t_entitytype entitytype = ET_NONE;
+ unsigned char attrQuoteChar = '\0';
+ bool inattribute = false;
+ unsigned int linePos = 1;
+ unsigned int charPos = 0;
while (infile.good()) {
@@ -1398,16 +1487,221 @@ void processOSIS(istream& infile) {
// Does a SWORD module actually require this?
if (curChar == '\n') {
curChar = ' ';
+ charPos = 0;
+ linePos++;
+ }
+ charPos++;
+
+ // Look for entities:
+ // These are of the form &#dddd;, &xHHHH; or &llll;
+ // where dddd is a sequence of digits
+ // HHHH is a sequence of [A-Fa-f0-9]
+ // llll is amp, lt, gt, quot or apos
+ // but we will look for a sequence of [A-Za-z0-9]
+ // All but &amp;, &lt;, &gt;, &quot;, &apos; will produce a WARNING
+ // In the future:
+ // &#dddd; and &xHHHH; should be converted to UTF-8,
+ // with a WARNING if the text is not UTF-8
+ // &llll; other than the xml standard 5 should produce a WARNING
+
+ // For entity diagnostics track whether the text is an attribute value
+ if (inattribute && (curChar == '\'' || curChar == '"')) {
+ if (attrQuoteChar == curChar) {
+ inattribute = false;
+ attrQuoteChar = '\0';
+ }
+ else {
+ attrQuoteChar = curChar;
+ }
+ }
+ if (intoken && curChar == '=') {
+ inattribute = true;
+ attrQuoteChar = '\0';
+ }
+
+ if (!inentity && curChar == '&') {
+ inentity = true;
+ entitytype = ET_NONE;
+ entityToken = "&";
+ continue;
+ }
+
+ if (inentity) {
+ if (curChar == ';') {
+ inentity = false;
+ }
+ else {
+ switch (entitytype) {
+ case ET_NONE:
+ // A hex entity cannot start with X in XML, but it can in HTML
+ // Allow for it here and complain later
+ if (curChar == 'x' || curChar == 'X') {
+ entitytype = ET_HEX;
+ }
+ else
+ if (curChar == '#') {
+ entitytype = ET_NUM;
+ }
+ else
+ if ((curChar >= 'A' && curChar <= 'Z') ||
+ (curChar >= 'a' && curChar <= 'z') ||
+ (curChar >= '0' && curChar <= '9')) {
+ entitytype = ET_CHAR;
+ }
+ else {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+
+ case ET_NUM :
+ if (!(curChar >= '0' && curChar <= '9')) {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+ case ET_HEX :
+ if ((curChar >= 'G' && curChar <= 'Z') ||
+ (curChar >= 'g' && curChar <= 'z')) {
+ // Starts out as a HEX entity, but it isn't one
+ entitytype = ET_CHAR;
+ }
+ else
+ if (!((curChar >= 'A' && curChar <= 'F') ||
+ (curChar >= 'a' && curChar <= 'f') ||
+ (curChar >= '0' && curChar <= '9'))) {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+ case ET_CHAR :
+ if (!((curChar >= 'A' && curChar <= 'Z') ||
+ (curChar >= 'a' && curChar <= 'z') ||
+ (curChar >= '0' && curChar <= '9'))) {
+ inentity = false;
+ entitytype = ET_ERR;
+ }
+ break;
+ default:
+ cout << "FATAL(ENTITY): unknown entitytype on entity end: " << entitytype << endl;
+ exit(EXIT_BAD_NESTING);
+ }
+ }
+
+ if (entitytype != ET_ERR) {
+ entityToken.append((char) curChar);
+ }
+
+ // It is an entity, perhaps invalid, if curChar is ';', error otherwise
+ // Test to see if we now have an entity or a failure
+ // It may not be a valid entity.
+ if (!inentity) {
+ switch (entitytype) {
+ case ET_ERR :
+ // Remove the leading &
+ entityToken << 1;
+ cout << "WARNING(PARSE): malformed entity, replacing &" << entityToken << " with &amp;" << entityToken << endl;
+ if (intoken) {
+ token.append("&amp;");
+ token.append(entityToken);
+ }
+ else {
+ text.append("&amp;");
+ text.append(entityToken);
+ }
+ break;
+ case ET_HEX :
+ if (entityToken[1] != 'x') {
+ cout << "WARNING(PARSE): HEX entity must begin with &x, found " << entityToken << endl;
+ }
+ else {
+ cout << "WARNING(PARSE): SWORD does not search HEX entities, found " << entityToken << endl;
+ }
+ break;
+ case ET_CHAR :
+ if (strcmp(entityToken, "&amp;") &&
+ strcmp(entityToken, "&lt;") &&
+ strcmp(entityToken, "&gt;") &&
+ strcmp(entityToken, "&quot;") &&
+ strcmp(entityToken, "&apos;")) {
+ cout << "WARNING(PARSE): XML only supports 5 Character entities &amp;, &lt;, &gt;, &quot; and &apos;, found " << entityToken << endl;
+ }
+ else
+ if (!strcmp(entityToken, "&apos;")) {
+ cout << "WARNING(PARSE): While valid for XML, XHTML does not support &apos;." << endl;
+ if (!inattribute) {
+ cout << "WARNING(PARSE): &apos; is unnecessary outside of attribute values. Replacing with '. " << endl;
+ entityToken = "'";
+ }
+ else {
+ switch (attrQuoteChar) {
+ case '"' :
+ cout << "WARNING(PARSE): &apos; is unnecessary inside double quoted attribute values. Replacing with '. " << endl;
+ entityToken = "'";
+ break;
+ case '\'' :
+ cout << "WARNING(PARSE): &apos; is only needed within single quoted attribute values. Considering using double quoted attribute and replacing with '." << endl;
+ break;
+ }
+ }
+ }
+ else
+ if (!strcmp(entityToken, "&quot;")) {
+ cout << "WARNING(PARSE): While valid for XML, &quot; is only needed within double quoted attribute values" << endl;
+ if (!inattribute) {
+ cout << "WARNING(PARSE): &quot; is unnecessary outside of attribute values. Replace with \"." << endl;
+ entityToken = "\"";
+ }
+ else {
+ switch (attrQuoteChar) {
+ case '"' :
+ cout << "WARNING(PARSE): &quot; is only needed within double quoted attribute values. Considering using single quoted attribute and replacing with \"." << endl;
+ break;
+ case '\'' :
+ cout << "WARNING(PARSE): &quot; is unnecessary inside single quoted attribute values. Replace with \"." << endl;
+ entityToken = "\"";
+ break;
+ }
+ }
+ }
+ break;
+ case ET_NUM :
+ cout << "WARNING(PARSE): SWORD does not search numeric entities, found " << entityToken << endl;
+ break;
+ case ET_NONE :
+ default:
+ break;
+ }
+
+ // Put the entity into the stream.
+ if (intoken) {
+ token.append(entityToken);
+ }
+ else {
+ text.append(entityToken);
+ }
+
+ if (curChar == ';') {
+ // The character was handled, so go get the next one.
+ continue;
+ }
+ }
+ else {
+ // The character was handled, so go get the next one.
+ continue;
+ }
}
+
if (!intoken && curChar == '<') {
intoken = true;
token = "<";
+ inattribute = false;
+ attrQuoteChar = '\0';
continue;
}
// Handle XML comments starting with "<!--", ending with "-->"
-
if (intoken && !incomment) {
switch (commentstate) {
case CS_NOT_IN_COMMENT :
@@ -1532,8 +1826,8 @@ void processOSIS(istream& infile) {
}
else {
switch (curChar) {
- case '>' : text.append("&gt;"); break;
- case '<' : text.append("&lt;"); break;
+ case '>' : cout << "WARNING(PARSE): > should be &gt;" << endl; text.append("&gt;"); break;
+ case '<' : cout << "WARNING(PARSE): < should be &lt;" << endl; text.append("&lt;"); break;
default : text.append((char) curChar); break;
}
}
@@ -1552,7 +1846,7 @@ void processOSIS(istream& infile) {
int main(int argc, char **argv) {
- fprintf(stderr, "You are running osis2mod: $Rev: 3177 $\n");
+ fprintf(stderr, "You are running osis2mod: $Rev: 3431 $\n");
if (argc > 1) {
for (int i = 1; i < argc; i++) {
@@ -1578,19 +1872,25 @@ int main(int argc, char **argv) {
int entrySize = 0;
SWBuf cipherKey = "";
SWCompress *compressor = 0;
+ int compLevel = 0;
for (int i = 3; i < argc; i++) {
if (!strcmp(argv[i], "-a")) {
append = 1;
}
else if (!strcmp(argv[i], "-z")) {
- if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
- if (entrySize) usage(*argv, "Cannot specify both -z and -s");
compType = "ZIP";
+ if (i+1 < argc && argv[i+1][0] != '-') {
+ switch (argv[++i][0]) {
+ case 'l': compType = "LZSS"; break;
+ case 'z': compType = "ZIP"; break;
+ case 'b': compType = "BZIP2"; break;
+ case 'x': compType = "XZ"; break;
+ }
+ }
}
else if (!strcmp(argv[i], "-Z")) {
if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
- if (entrySize) usage(*argv, "Cannot specify both -Z and -s");
compType = "LZSS";
}
else if (!strcmp(argv[i], "-b")) {
@@ -1603,6 +1903,30 @@ int main(int argc, char **argv) {
else if (!strcmp(argv[i], "-N")) {
normalize = false;
}
+ else if (!strcmp(argv[i], "-e")) {
+ if (i+1 < argc) {
+ switch (argv[++i][0]) {
+ case '1': // leave as UTF-8
+ outputEncoder = NULL;
+ outputDecoder = NULL;
+ break;
+
+ case '2':
+ outputEncoder = new UTF8UTF16();
+ outputDecoder = new UTF16UTF8();
+ break;
+#ifdef _ICU_
+ case 's':
+ outputEncoder = new UTF8SCSU();
+ outputDecoder = new SCSUUTF8();
+ break;
+#endif
+ default:
+ outputEncoder = NULL;
+ outputDecoder = NULL;
+ }
+ }
+ }
else if (!strcmp(argv[i], "-c")) {
if (i+1 < argc) cipherKey = argv[++i];
else usage(*argv, "-c requires <cipher_key>");
@@ -1612,7 +1936,6 @@ int main(int argc, char **argv) {
else usage(*argv, "-v requires <v11n>");
}
else if (!strcmp(argv[i], "-s")) {
- if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z");
if (i+1 < argc) {
entrySize = atoi(argv[++i]);
if (entrySize == 2 || entrySize == 4) {
@@ -1628,20 +1951,48 @@ int main(int argc, char **argv) {
if (i+1 < argc) debug |= atoi(argv[++i]);
else usage(*argv, "-d requires <flags>");
}
+ else if (!strcmp(argv[i], "-l")) {
+ if (i+1 < argc) {
+ compLevel = atoi(argv[++i]);
+ }
+ else usage(*argv, "-l requires a value from 1-9");
+
+ if (compLevel < 0 || compLevel > 10) {
+ usage(*argv, "-l requires a value from 1-9");
+ }
+ }
else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
}
- if (isCommentary) isCommentary = true; // avoid unused warning for now
+ if (isCommentary) isCommentary = true; // avoid unused warning for now
- if (compType == "ZIP") {
+ if (compType == "LZSS") {
+ compressor = new LZSSCompress();
+ }
+ else if (compType == "ZIP") {
#ifndef EXCLUDEZLIB
compressor = new ZipCompress();
#else
- usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library");
+ usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libz is available when compiling SWORD library");
#endif
}
- else if (compType == "LZSS") {
- compressor = new LZSSCompress();
+ else if (compType == "BZIP2") {
+#ifndef EXCLUDEBZIP2
+ compressor = new Bzip2Compress();
+#else
+ usage(*argv, "ERROR: SWORD library not compiled with bzip2 compression support.\n\tBe sure libbz2 is available when compiling SWORD library");
+#endif
+ }
+ else if (compType == "XZ") {
+#ifndef EXCLUDEXZ
+ compressor = new XzCompress();
+#else
+ usage(*argv, "ERROR: SWORD library not compiled with xz compression support.\n\tBe sure liblzma is available when compiling SWORD library");
+#endif
+ }
+
+ if (compressor && compLevel > 0) {
+ compressor->setLevel(compLevel);
}
#ifndef _ICU_
@@ -1652,16 +2003,24 @@ int main(int argc, char **argv) {
#endif
if (debug & DEBUG_OTHER) {
- cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
+ cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcompressLevel: " << compLevel << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
}
- if (!append) { // == 0 then create module
+ if (!append) { // == 0 then create module
// Try to initialize a default set of datafiles and indicies at our
// datapath location passed to us from the user.
if (compressor) {
- if (zText::createModule(path, iType, v11n)) {
- fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
- exit(EXIT_NO_CREATE);
+ if (entrySize == 4) {
+ if (zText4::createModule(path, iType, v11n)) {
+ fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
+ exit(EXIT_NO_CREATE);
+ }
+ }
+ else {
+ if (zText::createModule(path, iType, v11n)) {
+ fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
+ exit(EXIT_NO_CREATE);
+ }
}
}
else if (entrySize == 4) {
@@ -1680,50 +2039,69 @@ int main(int argc, char **argv) {
// Do some initialization stuff
if (compressor) {
- // Create a compressed text module allowing very large entries
- // Taking defaults except for first, fourth, fifth and last argument
- module = new zText(
- path, // ipath
- 0, // iname
- 0, // idesc
- iType, // iblockType
- compressor, // icomp
- 0, // idisp
- ENC_UNKNOWN, // enc
- DIRECTION_LTR, // dir
- FMT_UNKNOWN, // markup
- 0, // lang
- v11n // versification
+ if (entrySize == 4) {
+ // Create a compressed text module allowing very large entries
+ // Taking defaults except for first, fourth, fifth and last argument
+ module = new zText4(
+ path, // ipath
+ 0, // iname
+ 0, // idesc
+ iType, // iblockType
+ compressor, // icomp
+ 0, // idisp
+ ENC_UNKNOWN, // enc
+ DIRECTION_LTR, // dir
+ FMT_UNKNOWN, // markup
+ 0, // lang
+ v11n // versification
);
+ }
+ else {
+ // Create a compressed text module allowing reasonable sized entries
+ // Taking defaults except for first, fourth, fifth and last argument
+ module = new zText(
+ path, // ipath
+ 0, // iname
+ 0, // idesc
+ iType, // iblockType
+ compressor, // icomp
+ 0, // idisp
+ ENC_UNKNOWN, // enc
+ DIRECTION_LTR, // dir
+ FMT_UNKNOWN, // markup
+ 0, // lang
+ v11n // versification
+ );
+ }
}
else if (entrySize == 4) {
// Create a raw text module allowing very large entries
// Taking defaults except for first and last argument
module = new RawText4(
- path, // ipath
- 0, // iname
- 0, // idesc
- 0, // idisp
- ENC_UNKNOWN, // encoding
- DIRECTION_LTR, // dir
- FMT_UNKNOWN, // markup
- 0, // ilang
- v11n // versification
+ path, // ipath
+ 0, // iname
+ 0, // idesc
+ 0, // idisp
+ ENC_UNKNOWN, // encoding
+ DIRECTION_LTR, // dir
+ FMT_UNKNOWN, // markup
+ 0, // ilang
+ v11n // versification
);
}
else {
// Create a raw text module allowing reasonable sized entries
// Taking defaults except for first and last argument
module = new RawText(
- path, // ipath
- 0, // iname
- 0, // idesc
- 0, // idisp
- ENC_UNKNOWN, // encoding
- DIRECTION_LTR, // dir
- FMT_UNKNOWN, // markup
- 0, // ilang
- v11n // versification
+ path, // ipath
+ 0, // iname
+ 0, // idesc
+ 0, // idisp
+ ENC_UNKNOWN, // encoding
+ DIRECTION_LTR, // dir
+ FMT_UNKNOWN, // markup
+ 0, // ilang
+ v11n // versification
);
}
@@ -1759,6 +2137,10 @@ int main(int argc, char **argv) {
delete module;
if (cipherFilter)
delete cipherFilter;
+ if (outputEncoder)
+ delete outputEncoder;
+ if (outputDecoder)
+ delete outputDecoder;
fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program);
exit(0); // success