1 files changed, 465 insertions, 83 deletions
diff --git a/utilities/osis2mod.cpp b/utilities/osis2mod.cpp
index b8514b1..7ffe4ff 100644
--- a/utilities/osis2mod.cpp
+++ b/utilities/osis2mod.cpp
@@ -1,13 +1,13 @@
 /******************************************************************************
  *
- *  osis2mod.cpp -	Utility to import a module in OSIS format
+ *  osis2mod.cpp - Utility to import a module in OSIS format
  *
- * $Id: osis2mod.cpp 3177 2014-04-17 04:24:37Z greg.hellings $
+ * $Id: osis2mod.cpp 3431 2016-08-16 22:46:19Z refdoc $
  *
  * Copyright 2003-2014 CrossWire Bible Society (http://www.crosswire.org)
- *	CrossWire Bible Society
- *	P. O. Box 2528
- *	Tempe, AZ  85280-2528
+ *      CrossWire Bible Society
+ *      P. O. Box 2528
+ *      Tempe, AZ  85280-2528
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -44,14 +44,27 @@
 #include <versekey.h>
 
 #include <ztext.h>
+#include <ztext4.h>
 #include <lzsscomprs.h>
+#ifndef EXCLUDEZLIB
 #include <zipcomprs.h>
+#endif
+#ifndef EXCLUDEBZIP2
+#include <bz2comprs.h>
+#endif
+#ifndef EXCLUDEXZ
+#include <xzcomprs.h>
+#endif
 #include <cipherfil.h>
 
 #ifdef _ICU_
 #include <utf8nfc.h>
 #include <latin1utf8.h>
+#include <utf8scsu.h>
+#include <scsuutf8.h>
 #endif
+#include <utf8utf16.h>
+#include <utf16utf8.h>
 
 #ifndef NO_SWORD_NAMESPACE
 using namespace sword;
@@ -82,6 +95,9 @@ const int EXIT_BAD_NESTING =   5; // BSP or BCV nesting is bad
 UTF8NFC    normalizer;
 Latin1UTF8 converter;
 #endif
+SWFilter*  outputEncoder = NULL;
+SWFilter*  outputDecoder = NULL;
+
 int normalized = 0;
 int converted  = 0;
 
@@ -113,7 +129,8 @@ bool isOSISAbbrev(const char *buf) {
  * U-00000000 - U-0000007F  0nnnnnnn
  * U-00000080 - U-000007FF  110nnnnn  10nnnnnn
  * U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
- * U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * U-00010000 - U-0010FFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ *
  * Note:
  *   1.  The latest UTF-8 RFC allows for a max of 4 bytes.
  *       Earlier allowed 6.
@@ -351,7 +368,7 @@ void prepareSWVerseKey(SWBuf &buf) {
  * Determine whether a verse as given is valid for the versification.
  * This is done by comparing the before and after of normalization.
  */
-bool isValidRef(const char *buf) {
+bool isValidRef(const char *buf, const char *caller) {
 	// Create a VerseKey that does not do auto normalization
 	// Note: need to turn on headings so that a heading does not get normalized anyway
 	// And set it to the reference under question
@@ -382,7 +399,7 @@ bool isValidRef(const char *buf) {
 	// If we have gotten here the reference is not in the selected versification.
 	// cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl;
 	if (debug & DEBUG_REV11N) {
-		cout << "DEBUG(V11N): " << before << " normalizes to "  << after << endl;
+		cout << "DEBUG(V11N)[" << caller << "]: " << before << " normalizes to "  << after << endl;
 	}
 
 	return false;
@@ -465,7 +482,7 @@ void makeValidRef(VerseKey &key) {
 void writeEntry(SWBuf &text, bool force = false) {
 	char keyOsisID[255];
 
-	static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 3177 $\"/>";
+	static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 3431 $\"/>";
 	static bool firstOT = true;
 	static bool firstNT = true;
 
@@ -494,7 +511,7 @@ void writeEntry(SWBuf &text, bool force = false) {
 	// If we have seen a verse and the supplied one is different then we output the collected one.
 	if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) {
 
-		if (!isValidRef(lastKey)) {
+		if (!isValidRef(lastKey, "writeEntry")) {
 			makeValidRef(lastKey);
 		}
 
@@ -525,6 +542,11 @@ void writeEntry(SWBuf &text, bool force = false) {
 			}
 		}
 
+		// If the desired output encoding is non-UTF-8, convert to that encoding
+		if (outputEncoder) {
+			outputEncoder->processText(activeVerseText, (SWKey *)2);  // note the hack of 2 to mimic a real key. TODO: remove all hacks
+		}
+
 		// If the entry already exists, then append this entry to the text.
 		// This is for verses that are outside the chosen versification. They are appended to the prior verse.
 		// The space should not be needed if we retained verse tags.
@@ -532,7 +554,16 @@ void writeEntry(SWBuf &text, bool force = false) {
 			module->flush();
 			SWBuf currentText = module->getRawEntry();
 			cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
+
+			// If we have a non-UTF-8 encoding, we should decode it before concatenating, then re-encode it
+			if (outputDecoder) {
+				outputDecoder->processText(activeVerseText, (SWKey *)2);
+				outputDecoder->processText(currentText, (SWKey *)2);
+			}
 			activeVerseText = currentText + " " + activeVerseText;
+			if (outputEncoder) {
+				outputEncoder->processText(activeVerseText, (SWKey *)2);
+			}
 		}
 
 		if (debug & DEBUG_WRITE) {
@@ -563,7 +594,7 @@ void writeEntry(SWBuf &text, bool force = false) {
 void linkToEntry(VerseKey &linkKey, VerseKey &dest) {
 
 	// Only link verses that are in the versification.
-	if (!isValidRef(linkKey)) {
+	if (!isValidRef(linkKey, "linkToEntry")) {
 		return;
 	}
 
@@ -581,7 +612,7 @@ void linkToEntry(VerseKey &linkKey, VerseKey &dest) {
 }
 
 // Return true if the content was handled or is to be ignored.
-//		false if the what has been seen is to be accumulated and considered later.
+//        false if what has been seen is to be accumulated and considered later.
 bool handleToken(SWBuf &text, XMLTag token) {
 
 	// Everything between the begin book tag and the first begin chapter tag is inBookIntro
@@ -671,7 +702,7 @@ bool handleToken(SWBuf &text, XMLTag token) {
 
 			// BOOK START, <div type="book" ...>
 			if (tokenName == "div" && typeAttr == "book") {
-				if (inBookIntro || inChapterIntro) {	// this one should never happen, but just in case
+				if (inBookIntro || inChapterIntro) { // this one should never happen, but just in case
 
 					if (debug & DEBUG_TITLE) {
 						cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS INTRO " << endl;
@@ -802,10 +833,13 @@ bool handleToken(SWBuf &text, XMLTag token) {
 					// At that point we will output links.
 					// This can be done by incrementing, which will produce an error
 					// if there is only one verse.
-					verseKeys.setPosition(TOP);
-					verseKeys.increment(1);
-					if (!verseKeys.popError()) {
-						linkedVerses.push_back(verseKeys);
+					if (memberKeyCount > 1) {
+						verseKeys.setPosition(TOP);
+						verseKeys.increment(1);
+						if (!verseKeys.popError()) {
+							cout << "DEBUG(LINK): " << currentVerse.getOSISRef() << endl;
+							linkedVerses.push_back(verseKeys);
+						}
 					}
 				}
 				else {
@@ -852,6 +886,39 @@ bool handleToken(SWBuf &text, XMLTag token) {
 
 		// Now consider everything else.
 
+/*
+		// "majorSection" is code for Books 1-5 of Psalms // This is an incorrect assumption - majorSection can appear in any large book and can start and end inside chapters
+		if (tokenName == "div" && typeAttr == "majorSection") {
+			if (inBookIntro) {
+				if (debug & DEBUG_TITLE) {
+					cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK INTRO "<< text << endl;
+				}
+				writeEntry(text);
+			}
+
+			if (debug & DEBUG_OTHER) {
+				cout << "DEBUG(FOUND): majorSection found " << currentVerse.getOSISRef() << endl;
+			}
+
+			strcpy(currentOsisID, currentVerse.getOSISRef());
+
+// As a result of the incorrect assumption, these flags are also set incorrectly and cause problems in situations where majorSections do not follow the assumptions made during creation of this patch
+
+			inChapter       = false;
+			inVerse         = false;
+			inPreVerse      = false;
+			inBookIntro     = false;
+			inChapterIntro  = true;
+
+			if (debug & DEBUG_TITLE) {
+				cout << "DEBUG(TITLE): " << currentOsisID << ": Looking for chapter introduction" << endl;
+			}
+
+			verseDepth      = 0;
+
+			return false;
+		}
+*/
 		// Handle WOC quotes.
 		// Note this requires transformBSP to make them into milestones
 		// Otherwise have to do it here
@@ -897,8 +964,8 @@ bool handleToken(SWBuf &text, XMLTag token) {
 			if (inChapterIntro) {
 				// Determine when we are no longer in a chapter heading, but in pre-verse material:
 				// If we see one of the following:
-				// 	a section div
-				// 	a title that is not main, chapter or sub or unclassified (no type attribute)
+				//     a section div
+				//     a title that is not main, chapter or sub or unclassified (no type attribute)
 				if ((tokenName == "div" && typeAttr == "section") ||
 				    (tokenName == "title" && typeAttr.length() != 0 && typeAttr != "main" && typeAttr != "chapter" && typeAttr != "sub")
 				) {
@@ -961,7 +1028,7 @@ bool handleToken(SWBuf &text, XMLTag token) {
 
 			if (tokenName != topToken.getName()) {
 				cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl;
-//				exit(EXIT_BAD_NESTING);	// (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here.
+//				exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a bug somewhere that's killing the converter here.
 						// So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
 						// (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting.
 			}
@@ -1149,6 +1216,7 @@ XMLTag transformBSP(XMLTag t) {
 	static std::stack<XMLTag> bspTagStack;
 	static int sID = 1;
 	char buf[11];
+	SWBuf typeAttr = t.getAttribute("type");
 
 	// Support simplification transformations
 	if (t.isEmpty()) {
@@ -1173,12 +1241,13 @@ XMLTag transformBSP(XMLTag t) {
 		// The following containers are milestoneable.
 		// abbr, closer, div, foreign, l, lg, salute, signed, speech
 		// Leaving out:
-		//   abbr	When would this ever cross a boundary?
-		//   seg	as it is used for a divineName hack
-		//   foreign	so that it can be easily italicized
+		//   abbr       When would this ever cross a boundary?
+		//   seg        as it is used for a divineName hack
+		//   foreign    so that it can be easily italicized
+		//   div type="colophon" so that it can be treated as a block
 		else if (tagName == "chapter" ||
 			 tagName == "closer"  ||
-			 tagName == "div"     ||
+			 (tagName == "div" && typeAttr != "colophon") ||
 			 tagName == "l"       ||
 			 tagName == "lg"      ||
 			 tagName == "q"       ||
@@ -1208,11 +1277,13 @@ XMLTag transformBSP(XMLTag t) {
 			}
 
 			bspTagStack.pop();
+			SWBuf topTypeAttr = topToken.getAttribute("type");
 
 			// Look for the milestoneable container tags handled above.
+			// Have to treat div type="colophon" differently
 			if (tagName == "chapter" ||
 			    tagName == "closer"  ||
-			    tagName == "div"     ||
+			    (tagName == "div" && topTypeAttr != "colophon") ||
 			    tagName == "l"       ||
 			    tagName == "lg"      ||
 			    tagName == "p"       ||
@@ -1272,8 +1343,8 @@ void writeLinks()
 
 		while (!verseKeys.popError()) {
 			linkKey = verseKeys.getElement();
-			verseKeys.increment(1);
 			linkToEntry(linkKey, destKey);
+			verseKeys.increment(1);
 		}
 	}
 }
@@ -1288,15 +1359,18 @@ void usage(const char *app, const char *error = 0, const bool verboseHelp = fals
 	fprintf(stderr, "  <osisDoc>\t\t path to the validated OSIS document, or '-' to\n");
 	fprintf(stderr, "\t\t\t\t read from standard input\n");
 	fprintf(stderr, "  -a\t\t\t augment module if exists (default is to create new)\n");
-	fprintf(stderr, "  -z\t\t\t use ZIP compression (default no compression)\n");
-	fprintf(stderr, "  -Z\t\t\t use LZSS compression (default no compression)\n");
-	fprintf(stderr, "  -b <2|3|4>\t\t compression block size (default 4):\n");
+	fprintf(stderr, "  -z <l|z|b|x>\t\t compression type (default: none)\n");
+	fprintf(stderr, "\t\t\t\t l - LZSS; z - ZIP; b - bzip2; x - xz\n");
+	fprintf(stderr, "  -b <2|3|4>\t\t compression block size (default: 4)\n");
 	fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
+	fprintf(stderr, "  -l <1-9>\t\t compression level (default varies by compression type)\n");
 	fprintf(stderr, "  -c <cipher_key>\t encipher module using supplied key\n");
 	fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
 
-#ifdef _ICU_       
-	fprintf(stderr, "  -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n");
+#ifdef _ICU_
+	fprintf(stderr, "  -e <1|2|s>\t\t convert Unicode encoding (default: 1)\n");
+	fprintf(stderr, "\t\t\t\t 1 - UTF-8 ; 2 - UTF-16 ; s - SCSU\n");
+	fprintf(stderr, "  -N\t\t\t do not normalize to NFC\n");
 	if (verboseHelp) {
 		fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
 		fprintf(stderr, "\t\t\t\t  and then normalize to NFC)\n");
@@ -1353,7 +1427,7 @@ void usage(const char *app, const char *error = 0, const bool verboseHelp = fals
 
 void processOSIS(istream& infile) {
 	typedef enum {
-		CS_NOT_IN_COMMENT,		// or seen starting "<"
+		CS_NOT_IN_COMMENT,            // or seen starting "<"
 		CS_SEEN_STARTING_EXCLAMATION,
 		CS_SEEN_STARTING_HYPHEN,
 		CS_IN_COMMENT,
@@ -1362,13 +1436,21 @@ void processOSIS(istream& infile) {
 		CS_SEEN_ENDING_GREATER_THAN
 	} t_commentstate;
 
+	typedef enum {
+		ET_NUM,
+		ET_HEX,
+		ET_CHAR,
+		ET_NONE,
+		ET_ERR
+	} t_entitytype;
+
 	activeOsisID[0] = '\0';
 
 	strcpy(currentOsisID,"N/A");
 
 	currentVerse.setVersificationSystem(v11n);
 	currentVerse.setAutoNormalize(false);
-	currentVerse.setIntros(true);	// turn on mod/testmnt/book/chap headings
+	currentVerse.setIntros(true);  // turn on mod/testmnt/book/chap headings
 	currentVerse.setPersist(true);
 
 	module->setKey(currentVerse);
@@ -1382,6 +1464,13 @@ void processOSIS(istream& infile) {
 	bool inWhitespace = false;
 	bool seeingSpace = false;
 	unsigned char curChar = '\0';
+	SWBuf entityToken;
+	bool inentity = false;
+	t_entitytype entitytype = ET_NONE;
+	unsigned char attrQuoteChar = '\0';
+	bool inattribute = false;
+	unsigned int linePos = 1;
+	unsigned int charPos = 0;
 
 	while (infile.good()) {
 
@@ -1398,16 +1487,221 @@ void processOSIS(istream& infile) {
 		// Does a SWORD module actually require this?
 		if (curChar == '\n') {
 			curChar = ' ';
+			charPos = 0;
+			linePos++;
+		}
+		charPos++;
+
+		// Look for entities:
+		// These are of the form &#dddd;, &xHHHH; or &llll; (note: XML itself spells hex references &#xHHHH;)
+		// where dddd is a sequence of digits
+		//       HHHH is a sequence of [A-Fa-f0-9]
+		//       llll is amp, lt, gt, quot or apos
+		//            but we will look for a sequence of [A-Za-z0-9]
+		// All but &amp;, &lt;, &gt;, &quot;, &apos; will produce a WARNING
+		// In the future:
+		//    &#dddd; and &xHHHH; should be converted to UTF-8,
+		//        with a WARNING if the text is not UTF-8
+		//    &llll; other than the xml standard 5 should produce a WARNING
+
+		// For entity diagnostics track whether the text is an attribute value
+		if (inattribute && (curChar == '\'' || curChar == '"')) {
+			if (attrQuoteChar == curChar) {
+				inattribute = false;
+				attrQuoteChar = '\0';
+			}
+			else {
+				attrQuoteChar = curChar;
+			}
+		}
+		if (intoken && curChar == '=') {
+			inattribute = true;
+			attrQuoteChar = '\0';
+		}
+
+		if (!inentity && curChar == '&') {
+			inentity = true;
+			entitytype = ET_NONE;
+			entityToken = "&";
+			continue;
+		}
+
+		if (inentity) {
+			if (curChar == ';') {
+				inentity = false;
+			}
+			else {
+				switch (entitytype) {
+				    case ET_NONE:
+					// A hex entity cannot start with X in XML, but it can in HTML
+					// Allow for it here and complain later
+					if (curChar == 'x' || curChar == 'X') {
+						entitytype = ET_HEX;
+					}
+					else
+					if (curChar == '#') {
+						entitytype = ET_NUM;
+					}
+					else
+					if ((curChar >= 'A' && curChar <= 'Z') ||
+					    (curChar >= 'a' && curChar <= 'z') ||
+					    (curChar >= '0' && curChar <= '9')) {
+						entitytype = ET_CHAR;
+					}
+					else {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+
+				    case ET_NUM :
+					if (!(curChar >= '0' && curChar <= '9')) {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+				    case ET_HEX :
+					if ((curChar >= 'G' && curChar <= 'Z') ||
+					    (curChar >= 'g' && curChar <= 'z')) {
+						// Starts out as a HEX entity, but it isn't one
+						entitytype = ET_CHAR;
+					}
+					else
+					if (!((curChar >= 'A' && curChar <= 'F') ||
+					      (curChar >= 'a' && curChar <= 'f') ||
+					      (curChar >= '0' && curChar <= '9'))) {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+				    case ET_CHAR :
+					if (!((curChar >= 'A' && curChar <= 'Z') ||
+					      (curChar >= 'a' && curChar <= 'z') ||
+					      (curChar >= '0' && curChar <= '9'))) {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+				    default:
+					cout << "FATAL(ENTITY): unknown entitytype on entity end: " << entitytype << endl;
+					exit(EXIT_BAD_NESTING);
+				}
+			}
+
+			if (entitytype != ET_ERR) {
+				entityToken.append((char) curChar);
+			}
+
+			// It is an entity, perhaps invalid, if curChar is ';', error otherwise
+			// Test to see if we now have an entity or a failure
+			// It may not be a valid entity.
+			if (!inentity) {
+				switch (entitytype) {
+				    case ET_ERR :
+					// Remove the leading &
+					entityToken << 1;
+					cout << "WARNING(PARSE): malformed entity, replacing &" << entityToken << " with &amp;" << entityToken << endl;
+					if (intoken) {
+						token.append("&amp;");
+						token.append(entityToken);
+					}
+					else {
+						text.append("&amp;");
+						text.append(entityToken);
+					}
+					break;
+				    case ET_HEX :
+					if (entityToken[1] != 'x') {
+						cout << "WARNING(PARSE): HEX entity must begin with &x, found " << entityToken << endl;
+					}
+					else {
+						cout << "WARNING(PARSE): SWORD does not search HEX entities, found " << entityToken << endl;
+					}
+					break;
+				    case ET_CHAR :
+					if (strcmp(entityToken, "&amp;")  &&
+				            strcmp(entityToken, "&lt;")   &&
+				            strcmp(entityToken, "&gt;")   &&
+				            strcmp(entityToken, "&quot;") &&
+				            strcmp(entityToken, "&apos;")) {
+						cout << "WARNING(PARSE): XML only supports 5 Character entities &amp;, &lt;, &gt;, &quot; and &apos;, found " << entityToken << endl;
+					}
+					else
+					if (!strcmp(entityToken, "&apos;")) {
+						cout << "WARNING(PARSE): While valid for XML, XHTML does not support &apos;." << endl;
+						if (!inattribute) {
+							cout << "WARNING(PARSE): &apos; is unnecessary outside of attribute values. Replacing with '. " << endl;
+							entityToken = "'";
+						}
+						else {
+							switch (attrQuoteChar) {
+							    case '"' :
+								cout << "WARNING(PARSE): &apos; is unnecessary inside double quoted attribute values. Replacing with '. " << endl;
+								entityToken = "'";
+								break;
+							    case '\'' :
+								cout << "WARNING(PARSE): &apos; is only needed within single quoted attribute values. Considering using double quoted attribute and replacing with '." << endl;
+								break;
+							}
+						}
+					}
+					else
+					if (!strcmp(entityToken, "&quot;")) {
+						cout << "WARNING(PARSE): While valid for XML, &quot; is only needed within double quoted attribute values" << endl;
+						if (!inattribute) {
+							cout << "WARNING(PARSE): &quot; is unnecessary outside of attribute values. Replace with \"." << endl;
+							entityToken = "\"";
+						}
+						else {
+							switch (attrQuoteChar) {
+							    case '"' :
+								cout << "WARNING(PARSE): &quot; is only needed within double quoted attribute values. Considering using single quoted attribute and replacing with \"." << endl;
+								break;
+							    case '\'' :
+								cout << "WARNING(PARSE): &quot; is unnecessary inside single quoted attribute values. Replace with \"." << endl;
+								entityToken = "\"";
+								break;
+							}
+						}
+					}
+					break;
+				    case ET_NUM :
+					cout << "WARNING(PARSE): SWORD does not search numeric entities, found " << entityToken << endl;
+					break;
+				    case ET_NONE :
+				    default:
+					break;
+				}
+
+				// Put the entity into the stream.
+				if (intoken) {
+					token.append(entityToken);
+				}
+				else {
+					text.append(entityToken);
+				}
+
+				if (curChar == ';') {
+					// The character was handled, so go get the next one.
+					continue;
+				}
+			}
+			else {
+				// The character was handled, so go get the next one.
+				continue;
+			}
 		}
 
+
 		if (!intoken && curChar == '<') {
 			intoken = true;
 			token = "<";
+			inattribute = false;
+			attrQuoteChar = '\0';
 			continue;
 		}
 
 		// Handle XML comments starting with "<!--", ending with "-->"
-
 		if (intoken && !incomment) {
 			switch (commentstate) {
 				case CS_NOT_IN_COMMENT :
@@ -1532,8 +1826,8 @@ void processOSIS(istream& infile) {
 		}
 		else {
 			switch (curChar) {
-				case '>' : text.append("&gt;"); break;
-				case '<' : text.append("&lt;"); break;
+				case '>' : cout << "WARNING(PARSE): > should be &gt;" << endl; text.append("&gt;"); break;
+				case '<' : cout << "WARNING(PARSE): < should be &lt;" << endl; text.append("&lt;"); break;
 				default  : text.append((char) curChar); break;
 			}
 		}
@@ -1552,7 +1846,7 @@ void processOSIS(istream& infile) {
 
 int main(int argc, char **argv) {
 
-	fprintf(stderr, "You are running osis2mod: $Rev: 3177 $\n");
+	fprintf(stderr, "You are running osis2mod: $Rev: 3431 $\n");
 	
 	if (argc > 1) {
 		for (int i = 1; i < argc; i++) {
@@ -1578,19 +1872,25 @@ int main(int argc, char **argv) {
 	int entrySize          = 0;
 	SWBuf cipherKey        = "";
 	SWCompress *compressor = 0;
+	int compLevel      = 0;
 
 	for (int i = 3; i < argc; i++) {
 		if (!strcmp(argv[i], "-a")) {
 			append = 1;
 		}
 		else if (!strcmp(argv[i], "-z")) {
-			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
-			if (entrySize) usage(*argv, "Cannot specify both -z and -s");
 			compType = "ZIP";
+			if (i+1 < argc && argv[i+1][0] != '-') {
+				switch (argv[++i][0]) {
+				case 'l': compType = "LZSS"; break;
+				case 'z': compType = "ZIP"; break;
+				case 'b': compType = "BZIP2"; break;
+				case 'x': compType = "XZ"; break;
+				}
+			}
 		}
 		else if (!strcmp(argv[i], "-Z")) {
 			if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
-			if (entrySize) usage(*argv, "Cannot specify both -Z and -s");
 			compType = "LZSS";
 		}
 		else if (!strcmp(argv[i], "-b")) {
@@ -1603,6 +1903,30 @@ int main(int argc, char **argv) {
 		else if (!strcmp(argv[i], "-N")) {
 			normalize = false;
 		}
+		else if (!strcmp(argv[i], "-e")) {
+			if (i+1 < argc) {
+				switch (argv[++i][0]) {
+				case '1': // leave as UTF-8
+					outputEncoder = NULL;
+					outputDecoder = NULL;
+					break;
+
+				case '2':
+					outputEncoder = new UTF8UTF16();
+					outputDecoder = new UTF16UTF8();
+					break;
+#ifdef _ICU_
+				case 's':
+					outputEncoder = new UTF8SCSU();
+					outputDecoder = new SCSUUTF8();
+					break;
+#endif
+				default:
+					outputEncoder = NULL;
+					outputDecoder = NULL;
+				}
+			}
+		}
 		else if (!strcmp(argv[i], "-c")) {
 			if (i+1 < argc) cipherKey = argv[++i];
 			else usage(*argv, "-c requires <cipher_key>");
@@ -1612,7 +1936,6 @@ int main(int argc, char **argv) {
 			else usage(*argv, "-v requires <v11n>");
 		}
 		else if (!strcmp(argv[i], "-s")) {
-			if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z");
 			if (i+1 < argc) {
 				entrySize = atoi(argv[++i]);
 				if (entrySize == 2 || entrySize == 4) {
@@ -1628,20 +1951,48 @@ int main(int argc, char **argv) {
 			if (i+1 < argc) debug |= atoi(argv[++i]);
 			else usage(*argv, "-d requires <flags>");
 		}
+		else if (!strcmp(argv[i], "-l")) {
+			if (i+1 < argc) {
+				compLevel = atoi(argv[++i]);
+			}
+			else usage(*argv, "-l requires a value from 1-9");
+			
+			if (compLevel < 0 || compLevel > 10) {
+				usage(*argv, "-l requires a value from 1-9");
+			}
+		}
 		else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
 	}
 
-	if (isCommentary) isCommentary = true;	// avoid unused warning for now
+	if (isCommentary) isCommentary = true;  // avoid unused warning for now
 
-	if (compType == "ZIP") {
+	if (compType == "LZSS") {
+		compressor = new LZSSCompress();
+	}
+	else if (compType == "ZIP") {
 #ifndef EXCLUDEZLIB
 		compressor = new ZipCompress();
 #else
-		usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libzip is available when compiling SWORD library");
+		usage(*argv, "ERROR: SWORD library not compiled with ZIP compression support.\n\tBe sure libz is available when compiling SWORD library");
 #endif
 	}
-	else if (compType == "LZSS") {
-		compressor = new LZSSCompress();
+	else if (compType == "BZIP2") {
+#ifndef EXCLUDEBZIP2
+		compressor = new Bzip2Compress();
+#else
+		usage(*argv, "ERROR: SWORD library not compiled with bzip2 compression support.\n\tBe sure libbz2 is available when compiling SWORD library");
+#endif
+	}
+	else if (compType == "XZ") {
+#ifndef EXCLUDEXZ
+		compressor = new XzCompress();
+#else
+		usage(*argv, "ERROR: SWORD library not compiled with xz compression support.\n\tBe sure liblzma is available when compiling SWORD library");
+#endif		
+	}
+
+	if (compressor && compLevel > 0) {
+		compressor->setLevel(compLevel);
 	}
 
 #ifndef _ICU_
@@ -1652,16 +2003,24 @@ int main(int argc, char **argv) {
 #endif
 
 	if (debug & DEBUG_OTHER) {
-		cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
+		cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcompressLevel: " << compLevel << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
 	}
 
-	if (!append) {	// == 0 then create module
+	if (!append) {  // == 0 then create module
 	// Try to initialize a default set of datafiles and indicies at our
 	// datapath location passed to us from the user.
 		if (compressor) {
-			if (zText::createModule(path, iType, v11n)) {
-				fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
-				exit(EXIT_NO_CREATE);
+			if (entrySize == 4) {
+				if (zText4::createModule(path, iType, v11n)) {
+					fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
+					exit(EXIT_NO_CREATE);
+				}
+			}
+			else {
+				if (zText::createModule(path, iType, v11n)) {
+					fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
+					exit(EXIT_NO_CREATE);
+				}
 			}
 		}
 		else if (entrySize == 4) {
@@ -1680,50 +2039,69 @@ int main(int argc, char **argv) {
 
 	// Do some initialization stuff
 	if (compressor) {
-		// Create a compressed text module allowing very large entries
-		// Taking defaults except for first, fourth, fifth and last argument
-		module = new zText(
-				path,		// ipath
-				0,		// iname
-				0,		// idesc
-				iType,		// iblockType
-				compressor,	// icomp
-				0,		// idisp
-				ENC_UNKNOWN,	// enc
-				DIRECTION_LTR,	// dir
-				FMT_UNKNOWN,	// markup
-				0,		// lang
-				v11n		// versification
+		if (entrySize == 4) {
+			// Create a compressed text module allowing very large entries
+			// Taking defaults except for first, fourth, fifth and last argument
+			module = new zText4(
+				path,           // ipath
+				0,              // iname
+				0,              // idesc
+				iType,          // iblockType
+				compressor,     // icomp
+				0,              // idisp
+				ENC_UNKNOWN,    // enc
+				DIRECTION_LTR,  // dir
+				FMT_UNKNOWN,    // markup
+				0,              // lang
+				v11n            // versification
 		       );
+		}
+		else {
+			// Create a compressed text module allowing reasonable sized entries
+			// Taking defaults except for first, fourth, fifth and last argument
+			module = new zText(
+				path,           // ipath
+				0,              // iname
+				0,              // idesc
+				iType,          // iblockType
+				compressor,     // icomp
+				0,              // idisp
+				ENC_UNKNOWN,    // enc
+				DIRECTION_LTR,  // dir
+				FMT_UNKNOWN,    // markup
+				0,              // lang
+				v11n            // versification
+		       );
+		}
 	}
 	else if (entrySize == 4) {
 		// Create a raw text module allowing very large entries
 		// Taking defaults except for first and last argument
 		module = new RawText4(
-				path,		// ipath
-				0,		// iname
-				0,		// idesc
-				0,		// idisp
-				ENC_UNKNOWN,	// encoding
-				DIRECTION_LTR,	// dir
-				FMT_UNKNOWN,	// markup
-				0,		// ilang
-				v11n		// versification
+				path,           // ipath
+				0,              // iname
+				0,              // idesc
+				0,              // idisp
+				ENC_UNKNOWN,    // encoding
+				DIRECTION_LTR,  // dir
+				FMT_UNKNOWN,    // markup
+				0,              // ilang
+				v11n            // versification
 			);
 	}
 	else {
 		// Create a raw text module allowing reasonable sized entries
 		// Taking defaults except for first and last argument
 		module = new RawText(
-				path,		// ipath
-				0,		// iname
-				0,		// idesc
-				0,		// idisp
-				ENC_UNKNOWN,	// encoding
-				DIRECTION_LTR,	// dir
-				FMT_UNKNOWN,	// markup
-				0,		// ilang
-				v11n		// versification
+				path,           // ipath
+				0,              // iname
+				0,              // idesc
+				0,              // idisp
+				ENC_UNKNOWN,    // encoding
+				DIRECTION_LTR,  // dir
+				FMT_UNKNOWN,    // markup
+				0,              // ilang
+				v11n            // versification
 			);
 	}
 
@@ -1759,6 +2137,10 @@ int main(int argc, char **argv) {
 	delete module;
 	if (cipherFilter)
 		delete cipherFilter;
+	if (outputEncoder)
+		delete outputEncoder;
+	if (outputDecoder)
+		delete outputDecoder;
 
 	fprintf(stderr, "SUCCESS: %s: has finished its work and will now rest\n", program);
 	exit(0); // success