summaryrefslogtreecommitdiff
path: root/utilities/osis2mod.cpp
diff options
context:
space:
mode:
author Roberto C. Sanchez <roberto@connexer.com> 2014-03-29 10:54:01 -0400
committer Roberto C. Sanchez <roberto@connexer.com> 2014-03-29 10:54:01 -0400
commit 71a39f4652cd51df814c930dd268f3c9ad2aee86 (patch)
tree 5994350a603908c4e4d660bc9d72c4ec43dd648e /utilities/osis2mod.cpp
parent 03134fa5f6f25d92724ce4c183f9bbe12a9e37dc (diff)
Imported Upstream version 1.6.0+dfsg
Diffstat (limited to 'utilities/osis2mod.cpp')
-rw-r--r-- utilities/osis2mod.cpp 1798
1 files changed, 1154 insertions, 644 deletions
diff --git a/utilities/osis2mod.cpp b/utilities/osis2mod.cpp
index 69d984d..473a90f 100644
--- a/utilities/osis2mod.cpp
+++ b/utilities/osis2mod.cpp
@@ -1,24 +1,40 @@
+/*
+ * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org)
+ * CrossWire Bible Society
+ * P. O. Box 2528
+ * Tempe, AZ 85280-2528
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
#include <ctype.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
-#include <string>
#include <stack>
+#include <vector>
#include <iostream>
#include <fstream>
#include <utilstr.h>
#include <swmgr.h>
#include <rawtext.h>
+#include <rawtext4.h>
#include <swbuf.h>
#include <utilxml.h>
#include <listkey.h>
#include <versekey.h>
#include <ztext.h>
-// #include <zld.h>
-// #include <zcom.h>
#include <lzsscomprs.h>
#include <zipcomprs.h>
#include <cipherfil.h>
@@ -28,61 +44,61 @@
#include <latin1utf8.h>
#endif
-//#define DEBUG
-
-// Debug for simple transformation stack
-//#define DEBUG_XFORM
-
-// Debug for parsing osisRefs
-//#define DEBUG_REF
-
-// Debug for tag stack
-//#define DEBUG_STACK
-
#ifndef NO_SWORD_NAMESPACE
using namespace sword;
#endif
using namespace std;
-#ifdef _ICU_
-UTF8NFC normalizer;
-int normalized = 0;
+// Turn debugging on and off
+//#define DEBUG
+int debug = 0;
+const int DEBUG_WRITE = 1; // writing to module
+const int DEBUG_VERSE = 2; // verse start and end
+const int DEBUG_QUOTE = 4; // quotes, especially Words of Christ (WOC)
+const int DEBUG_TITLE = 8; // titles
+const int DEBUG_INTERVERSE = 16; // inter-verse material
+const int DEBUG_XFORM = 32; // transformations
+const int DEBUG_REV11N = 64; // versification
+const int DEBUG_REF = 128; // parsing of osisID and osisRef
+const int DEBUG_STACK = 256; // cleanup of references
+const int DEBUG_OTHER = 512; // ins and outs of books, chapters and verses
+
+// Exit codes
+const int EXIT_BAD_ARG = 1; // Bad parameter given for program
+const int EXIT_NO_WRITE = 2; // Could not open the module for writing
+const int EXIT_NO_CREATE = 3; // Could not create the module
+const int EXIT_NO_READ = 4; // Could not open the input file for reading.
+const int EXIT_BAD_NESTING = 5; // BSP or BCV nesting is bad
+#ifdef _ICU_
+UTF8NFC normalizer;
Latin1UTF8 converter;
-int converted = 0;
#endif
+int normalized = 0;
+int converted = 0;
SWText *module = 0;
-VerseKey *currentVerse = 0;
+VerseKey currentVerse;
+SWBuf v11n = "KJV";
char activeOsisID[255];
char currentOsisID[255];
-const char *osisabbrevs[] = {"Gen", "Exod", "Lev", "Num", "Deut", "Josh", "Judg",
- "Ruth", "1Sam", "2Sam", "1Kgs", "2Kgs", "1Chr", "2Chr", "Ezra", "Neh",
- "Esth", "Job", "Ps", "Prov", "Eccl", "Song", "Isa", "Jer", "Lam", "Ezek",
- "Dan", "Hos", "Joel", "Amos", "Obad", "Jonah", "Mic", "Nah", "Hab",
- "Zeph", "Hag", "Zech", "Mal",
- "Matt", "Mark", "Luke", "John", "Acts", "Rom", "1Cor", "2Cor", "Gal",
- "Eph", "Phil", "Col", "1Thess", "2Thess", "1Tim", "2Tim", "Titus",
- "Phlm", "Heb", "Jas", "1Pet", "2Pet", "1John", "2John", "3John",
- "Jude", "Rev"};
+SWBuf activeVerseText;
+
+ListKey currentKeyIDs = ListKey();
+
+std::vector<ListKey> linkedVerses;
static bool inCanonicalOSISBook = true; // osisID is for a book that is not in Sword's canon
-static bool normalize = true; // Whether to normalize UTF-8 to NFC
+static bool normalize = true; // Whether to normalize UTF-8 to NFC
bool isOSISAbbrev(const char *buf) {
- bool match = false;
- for (int i = 0; i < 66; i++) {
- if (!strcmp(buf, osisabbrevs[i])) {
- match = true;
- break;
- }
- }
- return match;
+ VerseMgr *vmgr = VerseMgr::getSystemVerseMgr();
+ const VerseMgr::System *av11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem());
+ return av11n->getBookNumberByOSISName(buf) >= 0;
}
-
/**
* Determine whether the string contains a valid unicode sequence.
* The following table give the pattern of a valid UTF-8 character.
@@ -109,51 +125,92 @@ bool isOSISAbbrev(const char *buf) {
* author DM Smith
*/
int detectUTF8(const char *txt) {
- unsigned int countUTF8 = 0;
- int count = 0;
-
- // Cast it to make masking and shifting easier
- const unsigned char *p = (const unsigned char*) txt;
- while (*p) {
- // Is the high order bit set?
- if (*p & 0x80) {
- // Then count the number of high order bits that are set.
- // This determines the number of following bytes
- // that are a part of the unicode character
- unsigned char i = *p;
- for (count = 0; i & 0x80; count++) {
- i <<= 1;
- }
-
- // Validate count:
- // Count 0: bug in code that would cause core walking
- // Count 1: is a pattern of 10nnnnnn,
- // which does not signal the start of a unicode character
- // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111
- // are not legal starts, either
- if (count < 2 || count > 4) return 0;
-
- // At this point we expect (count - 1) following characters
- // of the pattern 10nnnnnn
- while (--count && *++p) {
- // The pattern of each following character must be: 10nnnnnn
- // So, compare the top 2 bits.
- if ((0xc0 & *p) != 0x80) return 0;
- }
-
- // Oops, we've run out of bytes too soon: Cannot be UTF-8
- if (count) return 0;
-
- // We have a valid UTF-8 character, so count it
- countUTF8++;
- }
+ unsigned int countUTF8 = 0;
+ int count = 0;
+
+ // Cast it to make masking and shifting easier
+ const unsigned char *p = (const unsigned char*) txt;
+ while (*p) {
+ // Is the high order bit set?
+ if (*p & 0x80) {
+ // Then count the number of high order bits that are set.
+ // This determines the number of following bytes
+ // that are a part of the unicode character
+ unsigned char i = *p;
+ for (count = 0; i & 0x80; count++) {
+ i <<= 1;
+ }
+
+ // Validate count:
+ // Count 0: bug in code that would cause core walking
+ // Count 1: is a pattern of 10nnnnnn,
+ // which does not signal the start of a unicode character
+ // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111
+ // are not legal starts, either
+ if (count < 2 || count > 4) return 0;
+
+ // At this point we expect (count - 1) following characters
+ // of the pattern 10nnnnnn
+ while (--count && *++p) {
+ // The pattern of each following character must be: 10nnnnnn
+ // So, compare the top 2 bits.
+ if ((0xc0 & *p) != 0x80) return 0;
+ }
+
+ // Oops, we've run out of bytes too soon: Cannot be UTF-8
+ if (count) return 0;
+
+ // We have a valid UTF-8 character, so count it
+ countUTF8++;
+ }
- // Advance to the next character to examine.
- p++;
- }
-
- // At this point it is either UTF-8 or 7-bit ascii
- return countUTF8 ? 1 : -1;
+ // Advance to the next character to examine.
+ p++;
+ }
+
+ // At this point it is either UTF-8 or 7-bit ascii
+ return countUTF8 ? 1 : -1;
+}
+
+void prepareSWText(const char *osisID, SWBuf &text)
+{
+ // Always check on UTF8 and report on non-UTF8 entries
+ int utf8State = detectUTF8(text.c_str());
+
+ // Trust, but verify.
+ if (!normalize && !utf8State) {
+ cout << "WARNING(UTF8): " << osisID << ": Should be converted to UTF-8 (" << text << ")" << endl;
+ }
+
+#ifdef _ICU_
+ if (normalize) {
+ // Don't need to normalize text that is ASCII
+ // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
+ if (!utf8State) {
+ cout << "INFO(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
+ converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ converted++;
+
+ // Prepare for double check. This probably can be removed.
+ // But for now we are running the check again.
+ // This is to determine whether we need to normalize output of the conversion.
+ utf8State = detectUTF8(text.c_str());
+ }
+
+ // Double check. This probably can be removed.
+ if (!utf8State) {
+ cout << "ERROR(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
+ }
+
+ if (utf8State > 0) {
+ SWBuf before = text;
+ normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ if (before != text) {
+ normalized++;
+ }
+ }
+ }
+#endif
}
// This routine converts an osisID or osisRef into one that SWORD can parse into a verse list
@@ -175,8 +232,10 @@ void prepareSWVerseKey(SWBuf &buf) {
bool inRange = false;
while (*p) {
if (inRange) {
-#ifdef DEBUG_REF
- cout << "Copy range marker:" << *p << endl;;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << "DEBUG(REF): Copy range marker:" << *p << endl;;
+ }
#endif
// Range markers are copied as is
*s++ = *p++;
@@ -193,28 +252,36 @@ void prepareSWVerseKey(SWBuf &buf) {
if (*n == ':') {
// set p to skip the work prefix
p = n + 1;
-#ifdef DEBUG_REF
- cout << "Found a work prefix ";
- for (char *x = s; x <= n; x++) {
- cout << *x;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << "DEBUG(REF): Found a work prefix ";
+ for (char *x = s; x <= n; x++) {
+ cout << *x;
+ }
+ cout << endl;
}
- cout << endl;
#endif
}
// Now we are in the meat of an osisID.
// Copy it to its end but stop on a grain marker of '!'
-#ifdef DEBUG_REF
- cout << "Copy osisID:";
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << "DEBUG(REF): Copy osisID:";
+ }
#endif
while (*p && *p != '!' && *p != ' ' && *p != '-') {
-#ifdef DEBUG_REF
- cout << *p;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << *p;
+ }
#endif
*s++ = *p++;
}
-#ifdef DEBUG_REF
- cout << endl;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << endl;
+ }
#endif
// The ! and everything following until we hit
@@ -224,12 +291,14 @@ void prepareSWVerseKey(SWBuf &buf) {
while (*n && *n != ' ' && *n != '-') {
n++;
}
-#ifdef DEBUG_REF
- cout << "Found a grain suffix ";
- for (char *x = p; x < n; x++) {
- cout << *x;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << "DEBUG(REF): Found a grain suffix ";
+ for (char *x = p; x < n; x++) {
+ cout << *x;
+ }
+ cout << endl;
}
- cout << endl;
#endif
p = n;
}
@@ -240,9 +309,11 @@ void prepareSWVerseKey(SWBuf &buf) {
// then we are entering a range
inRange = !inRange && *p == '-';
-#ifdef DEBUG_REF
- if (inRange) {
- cout << "Found a range" << endl;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ if (inRange) {
+ cout << "DEBUG(REF): Found a range" << endl;
+ }
}
#endif
@@ -254,8 +325,10 @@ void prepareSWVerseKey(SWBuf &buf) {
}
// replacing them all with a ';'
*s++ = ';';
-#ifdef DEBUG_REF
- cout << "replacing space with ;. Remaining: " << p << endl;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << "DEBUG(REF): replacing space with ;. Remaining: " << p << endl;
+ }
#endif
}
}
@@ -267,607 +340,921 @@ void prepareSWVerseKey(SWBuf &buf) {
*s = '\0';
// Since we modified the swbuf, we need to tell it what we have done
buf.setSize(s - buf.c_str());
-#ifdef DEBUG_REF
- cout << "shortended keyVal to`" << buf.c_str() << "`"<< endl;
+#ifdef DEBUG
+ if (debug & DEBUG_REF) {
+ cout << "DEBUG(REF): shortended keyVal to`" << buf.c_str() << "`"<< endl;
+ }
#endif
}
}
-bool isKJVRef(const char *buf) {
- VerseKey vk, test;
- vk.AutoNormalize(0);
- vk.Headings(1); // turn on mod/testmnt/book/chap headings
- vk.Persist(1);
- // lets do some tests on the verse --------------
- vk = buf;
- test = buf;
+/**
+ * Determine whether a verse as given is valid for the versification.
+ * This is done by comparing the before and after of normalization.
+ */
+bool isValidRef(const char *buf) {
+ // Create a VerseKey that does not do auto normalization
+ // Note: need to turn on headings so that a heading does not get normalized anyway
+ // And set it to the reference under question
+ VerseKey before;
+ before.setVersificationSystem(currentVerse.getVersificationSystem());
+ before.AutoNormalize(0);
+ before.Headings(1);
+ before.setText(buf);
+
+ // If we are a heading we must bail
+ // These will autonormalize to the last verse of the prior chapter
+ if (!before.Testament() || !before.Book() || !before.Chapter() || !before.Verse()) {
+ return true;
+ }
+
+ // Create a VerseKey that does do auto normalization
+ // And set it to the reference under question
+ VerseKey after;
+ after.setVersificationSystem(currentVerse.getVersificationSystem());
+ after.AutoNormalize(1);
+ after.setText(buf);
+
+ if (before == after)
+ {
+ return true;
+ }
+
+ // If we have gotten here the reference is not in the selected versification.
+ cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl;
- if (vk.Testament() && vk.Book() && vk.Chapter() && vk.Verse()) { // if we're not a heading
#ifdef DEBUG
- cout << (const char*)vk << " == " << (const char*)test << endl;
-#endif
- return (vk == test);
+ if (debug & DEBUG_REV11N) {
+ cout << "DEBUG(V11N): " << before << " normalizes to " << after << endl;
}
- else return true; // no check if we're a heading... Probably bad.
+#endif
+
+ return false;
}
+/**
+ * This routine is used to ensure that all the text in the input is saved to the module.
+ * Assumption: The input orders all the verses for a chapter in numerical order. Thus, any
+ * verses that are not in the chosen versification (v11n) follow those that are.
+ *
+ * The prior implementation of this adjusted the verse to the last one that is in the chosen v11n.
+ * If the chapter were extra, then it is appended to the last verse of the last
+ * chapter in the chosen v11n for that book. If it is just extra verses for a chapter, then it is
+ * appended to the last verse of the chapter.
+ *
+ * The problem with this is when an OSIS verse refers to more than one verse, e.g.
+ * osisID="Gen.1.29 Gen.1.30 Gen.1.31" (Gen.1.31 is the last verse of the chapter in the chosen v11n)
+ * and then it is followed by Gen.1.32.
+ *
+ * This routine assumes that linking is postponed to the end so that in the example Gen.1.30-31
+ * are not linked but rather empty. This routine will then find the last verse in the computed
+ * chapter that has content.
+ *
+ * Alternatively, we could have done linking as we went, but this routine would have needed
+ * to find the first entry in the link set and elsewhere in the code when appending to a
+ * verse, it would need to be checked for adjacent links and those would have needed to be adjusted.
+ *
+ * param key the key that may need to be adjusted; it is modified in place
+ */
+void makeValidRef(VerseKey &key) {
+
+ int chapterMax = key.getChapterMax();
+ int verseMax = key.getVerseMax(); // read before any clamping; only reported in the debug trace below
-void makeKJVRef(VerseKey &key) {
- cout << "re-versified " << key;
#ifdef DEBUG
- cout << "\tC" << (int)(key.builtin_books[key.Testament()-1][key.Book()-1].chapmax) << ":V" << (int)(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]);
+ if (debug & DEBUG_REV11N) {
+ cout << "DEBUG(V11N) Chapter max:" << chapterMax << ", Verse Max:" << verseMax << endl;
+ }
#endif
- if (key.Chapter() > key.builtin_books[key.Testament()-1][key.Book()-1].chapmax) {
- key.Chapter(key.builtin_books[key.Testament()-1][key.Book()-1].chapmax);
- key.Verse(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]);
+
+ cout << "INFO(V11N): " << key.getOSISRef() << " is not in the " << key.getVersificationSystem() << " versification.";
+ // Since isValidRef returned false constrain the key to the nearest prior reference.
+ // If we are past the last chapter set the reference to the last chapter
+ if (key.Chapter() > chapterMax) {
+ key.Chapter(chapterMax);
}
- else if (key.Verse() > key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]) {
- key.Verse(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]);
+
+ // Either we set the chapter to the last chapter and now need to set to the last verse in the chapter
+ // Or the verse is beyond the end of the chapter.
+ // In any case we need to constrain the verse to its chapter; re-query the max since the chapter may have just changed.
+ key.Verse(key.getVerseMax());
+
+ // There are three cases we want to handle:
+ // In the examples we are using the KJV versification where the last verse of Matt.7 is Matt.7.29.
+ // In each of these cases the out-of-versification, extra verse is Matt.7.30.
+ // 1) The "extra" verse follows the last verse in the chapter.
+ // <verse osisID="Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
+ // In this case re-versify Matt.7.30 as Matt.7.29.
+ //
+ // 2) The "extra" verse follows a range (a set of linked verses).
+ // <verse osisID="Matt.7.28-Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
+ // In this case, re-versify Matt.7.30 as Matt.7.28, the first verse in the linked set.
+ // Since we are postponing linking, we want to re-versify to the last entry in the module.
+ //
+ // 3) The last verse in the chapter is not in the input. There may be other verses missing as well.
+ // <verse osisID="Matt.7.8">...</verse><verse osisID="Matt.7.30">...</verse>
+ // In this case we should re-versify Matt.7.30 as Matt.7.29.
+ // However, since this and 2) are ambiguous, we'll re-versify to the last entry in the module.
+
+ while (!key.Error() && !module->hasEntry(&key)) {
+ key.decrement(1);
}
- cout << "\tas " << key << endl;
-}
+ cout << " Appending content to " << key.getOSISRef() << endl;
+}
-void writeEntry(VerseKey &key, SWBuf &text, bool force = false) {
- static SWBuf activeVerseText;
+void writeEntry(SWBuf &text, bool force = false) {
char keyOsisID[255];
- if (inCanonicalOSISBook) {
- strcpy(keyOsisID, key.getOSISRef());
+ static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 2400 $\"/>";
+ static bool firstOT = true;
+ static bool firstNT = true;
- // set keyOsisID to anything that an osisID cannot be.
- if (force) {
- strcpy(keyOsisID, "-force");
- }
-
- static VerseKey lastKey;
- lastKey.AutoNormalize(0);
- lastKey.Headings(1);
+ if (!inCanonicalOSISBook) {
+ return;
+ }
- VerseKey saveKey;
- saveKey.AutoNormalize(0);
- saveKey.Headings(1);
- saveKey = key;
+ strcpy(keyOsisID, currentVerse.getOSISRef());
- // If we have seen a verse and the supplied one is different then we output the collected one.
- if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) {
+ // set keyOsisID to anything that an osisID cannot be.
+ if (force) {
+ strcpy(keyOsisID, "-force");
+ }
- key = lastKey;
+ static VerseKey lastKey;
+ lastKey.setVersificationSystem(currentVerse.getVersificationSystem());
+ lastKey.AutoNormalize(0);
+ lastKey.Headings(1);
- if (!isKJVRef(key)) {
- makeKJVRef(key);
- }
+ VerseKey saveKey;
+ saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
+ saveKey.AutoNormalize(0);
+ saveKey.Headings(1);
+ saveKey = currentVerse;
-#ifdef _ICU_
- int utf8State = detectUTF8(activeVerseText.c_str());
- if (normalize) {
- // Don't need to normalize text that is ASCII
- // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
- if (!utf8State) {
- cout << "Warning: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl;
- converter.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
- converted++;
-
- // Prepare for double check. This probably can be removed.
- // But for now we are running the check again.
- // This is to determine whether we need to normalize output of the conversion.
- utf8State = detectUTF8(activeVerseText.c_str());
- }
+ // If we have seen a verse and the supplied one is different then we output the collected one.
+ if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) {
- // Double check. This probably can be removed.
- if (!utf8State) {
- cout << "Error: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl;
- }
+ if (!isValidRef(lastKey)) {
+ makeValidRef(lastKey);
+ }
- if (utf8State > 0) {
- SWBuf before = activeVerseText;
- normalizer.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
- if (before != activeVerseText) {
- normalized++;
- }
- }
+ currentVerse = lastKey;
+
+ prepareSWText(activeOsisID, activeVerseText);
+
+ // Put the revision into the module
+ int testmt = currentVerse.Testament();
+ if ((testmt == 1 && firstOT) || (testmt == 2 && firstNT)) {
+ VerseKey t;
+ t.setVersificationSystem(currentVerse.getVersificationSystem());
+ t.AutoNormalize(0);
+ t.Headings(1);
+ t = currentVerse;
+ currentVerse.Book(0);
+ currentVerse.Chapter(0);
+ currentVerse.Verse(0);
+ module->setEntry(revision);
+ currentVerse = t;
+ switch (testmt) {
+ case 1:
+ firstOT = false;
+ break;
+ case 2:
+ firstNT = false;
+ break;
}
-#endif
+ }
- SWBuf currentText = module->getRawEntry();
- if (currentText.length()) {
- cout << "Appending entry: " << key.getOSISRef() << ": " << activeVerseText << endl;
- activeVerseText = currentText + " " + activeVerseText;
- }
+ // If the entry already exists, then append this entry to the text.
+ // This is for verses that are outside the chosen versification. They are appended to the prior verse.
+ // The space should not be needed if we retained verse tags.
+ SWBuf currentText = module->getRawEntry();
+ if (currentText.length()) {
+ cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
+ activeVerseText = currentText + " " + activeVerseText;
+ }
#ifdef DEBUG
- cout << "Write: " << activeOsisID << ":" << key.getOSISRef() << ": " << activeVerseText << endl;
-#endif
-
- module->setEntry(activeVerseText);
- activeVerseText = "";
+ if (debug & DEBUG_WRITE) {
+ cout << "DEBUG(WRITE): " << activeOsisID << ":" << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
}
+#endif
- // eliminate leading whitespace on the beginning of each verse and
- // before we append to current content, since we just added one
- text.trimStart();
- if (activeVerseText.length()) {
- activeVerseText += " ";
- activeVerseText += text;
- }
- else {
- activeVerseText = text;
- }
+ module->setEntry(activeVerseText);
+ activeVerseText = "";
+ }
- key = saveKey;
- lastKey = key;
- strcpy(activeOsisID, keyOsisID);
+ // The following is for initial verse content and for appending interverse content.
+ // Eliminate leading whitespace on the beginning of each verse and
+ // before we append to current content, since we just added one
+ text.trimStart();
+ if (activeVerseText.length()) {
+ activeVerseText += " ";
+ activeVerseText += text;
+ }
+ else {
+ activeVerseText = text;
}
+ // text has been consumed so clear it out.
+ text = "";
+
+ currentVerse = saveKey;
+ lastKey = currentVerse;
+ strcpy(activeOsisID, keyOsisID);
}
+void linkToEntry(VerseKey &linkKey, VerseKey &dest) {
+
+ // Only link verses that are in the versification.
+ if (!isValidRef(linkKey)) {
+ return;
+ }
-void linkToEntry(VerseKey& dest) {
- //cout << "Verse: " << key << "\n";
- //cout << "TEXT: " << text << "\n\n";
- //SWBuf currentText = module->getRawEntry();
- //if (currentText.length())
- // text = currentText + " " + text;
VerseKey saveKey;
+ saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
saveKey.AutoNormalize(0);
saveKey.Headings(1);
- saveKey = *currentVerse;
+ saveKey = currentVerse;
+ currentVerse = linkKey;
- if (!isKJVRef(*currentVerse)) {
- makeKJVRef(*currentVerse);
- }
-
- cout << "Linking " << module->KeyText() << " to " << dest.getText() << "\n";
+ cout << "INFO(LINK): Linking " << currentVerse.getOSISRef() << " to " << dest.getOSISRef() << "\n";
module->linkEntry(&dest);
- *currentVerse = saveKey;
+ currentVerse = saveKey;
}
// Return true if the content was handled or is to be ignored.
// false if the what has been seen is to be accumulated and considered later.
-bool handleToken(SWBuf &text, XMLTag *token) {
+bool handleToken(SWBuf &text, XMLTag token) {
// Everything between the begin book tag and the first begin chapter tag is inBookHeader
- static bool inBookHeader = false;
+ static bool inBookHeader = false;
+
// Everything between the begin chapter tag and the first begin verse tag is inChapterHeader
- static bool inChapterHeader = false;
+ static bool inChapterHeader = false;
- // Flags to indicate whether we are in a book, chapter and/or verse
- //static bool inBook = false;
- //static bool inChapter = false;
- static bool inVerse = true;
+ // Flags indicating whether we are processing the content of a chapter
+ static bool inChapter = false;
- static SWBuf header = "";
+ // Flags indicating whether we are processing the content of a verse
+ static bool inVerse = false;
- // Used to remember titles that need to be handle specially
- static SWBuf lastTitle = "";
- static int titleOffset = -1;
- static bool inTitle = false;
- static int titleDepth = 0;
+ // Flag indicating whether we are processing content that is to be prepended to a verse
+ static bool inPreVerse = false;
+ static int genID = 1;
- static ListKey lastVerseIDs = ListKey();
+ // Flag indicating whether we are in "Words of Christ"
+ static bool inWOC = false;
+ // Tag for WOC quotes within a verse
+ static XMLTag wocTag = "<q who=\"Jesus\" marker=\"\">";
+
+ // Flag used to indicate where useful text begins
+ static bool firstDiv = false;
+
+ // Stack of quote elements used to handle Words of Christ
+ static std::stack<XMLTag> quoteStack;
// Stack of elements used to validate that books, chapters and verses are well-formed
// This goes beyond simple xml well-formed and also considers milestoned div, chapter and verse
// to be begin and end tags, too.
// It is an error if books and chapters are not well formed (though not required by OSIS)
// It is a warning that verses are not well formed (because some clients are not ready)
- static std::stack<XMLTag*> tagStack;
- // The following are used to validate well-formedness
- static int chapterDepth = 0;
- static int bookDepth = 0;
- static int verseDepth = 0;
-
- int tagDepth = tagStack.size();
- const char *tokenName = token->getName();
- bool isEndTag = token->isEndTag() || token->getAttribute("eID");
- const char *typeAttr = token->getAttribute("type");
-
- //Titles are treated specially.
- // If the title has an attribute type of "main" or "chapter"
- // it belongs to its <div> or <chapter> and is treated as part of its heading
- // Otherwise if it a title in a chapter before the first the first verse it
- // is put into the verse as a preverse title.
- if (!token->isEmpty() && !isEndTag && titleDepth == 0 && (!strcmp(tokenName, "title")) && (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter")))) {
- titleOffset = text.length(); //start of the title tag
- lastTitle = "";
- inTitle = true;
- tagStack.push(token);
-#ifdef DEBUG_STACK
- cout << currentOsisID << ": push (" << tagStack.size() << ") " << token->getName() << endl;
-#endif
- titleDepth = tagStack.size();
- return false;
- }
- // Check titleDepth since titles can be nested. Don't want to quit too early.
- else if (isEndTag && tagDepth == titleDepth && (!strcmp(tokenName, "title"))) {
- lastTitle.append(text.c_str() + titleOffset); //<title ...> up to the end </title>
- lastTitle.append(*token); //</title>
-
-#ifdef DEBUG
- cout << currentOsisID << ":" << endl;
- cout << "\tlastTitle: " << lastTitle.c_str() << endl;
- cout << "\ttext-lastTitle: " << text.c_str()+titleOffset << endl;
- cout << "\ttext: " << text.c_str() << endl;
-#endif
- inTitle = false;
- titleDepth = 0;
-#ifdef DEBUG_STACK
- cout << currentOsisID << ": pop(" << tagStack.size() << ") " << tagStack.top()->getName() << endl;
-#endif
- tagStack.pop();
- return false; // don't add </title> to the text itself
- }
-
+ static std::stack<XMLTag> tagStack;
+ // The following are used to validate well-formedness
+ static int chapterDepth = 0;
+ static int bookDepth = 0;
+ static int verseDepth = 0;
-//-- START TAG -------------------------------------------------------------------------
+ int tagDepth = tagStack.size();
+ const char *tokenName = token.getName();
+ bool isEndTag = token.isEndTag() || token.getAttribute("eID");
+ const char *typeAttr = token.getAttribute("type");
+ // process start tags
if (!isEndTag) {
// Remember non-empty start tags
- if (!token->isEmpty()) {
+ if (!token.isEmpty()) {
tagStack.push(token);
-#ifdef DEBUG_STACK
- cout << currentOsisID << ": push (" << tagStack.size() << ") " << token->getName() << endl;
+#ifdef DEBUG
+ if (debug & DEBUG_STACK) {
+ cout << "DEBUG(STACK): " << currentOsisID << ": push (" << tagStack.size() << ") " << token.getName() << endl;
+ }
#endif
}
- //-- WITH OSIS ID -------------------------------------------------------------------------
- //-- OR ANNOTATE REF -------------------------------------------------------------------------
- if (token->getAttribute("osisID") || token->getAttribute("annotateRef")) {
+ // throw away everything up to the first div
+ if (!firstDiv) {
+ if (!strcmp(tokenName, "div")) {
+#ifdef DEBUG
+ if (debug & DEBUG_OTHER) {
+ cout << "DEBUG(FOUND): Found first div and pitching prior material: " << text << endl;
+ }
+#endif
+ // TODO: Save off the content to use it to suggest the module's conf.
+ firstDiv = true;
+ text = "";
+ }
+ else {
+ // Collect the content so it can be used to suggest the module's conf.
+ return false;
+ }
+ }
+
+ //-- WITH osisID OR annotateRef -------------------------------------------------------------------------
+ // Handle Book, Chapter, and Verse (or commentary equivalent)
+ if (token.getAttribute("osisID") || token.getAttribute("annotateRef")) {
- // BOOK START
+ // BOOK START, <div type="book" ...>
if ((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "book"))) {
- inVerse = false;
if (inBookHeader || inChapterHeader) { // this one should never happen, but just in case
#ifdef DEBUG
- cout << currentOsisID << ": HEADING ";
+ if (debug & DEBUG_TITLE) {
+ cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS HEADING " << endl;
+ cout << "\tinChapterHeader = " << inChapterHeader << endl;
+ cout << "\tinBookHeader = " << inBookHeader << endl;
+ }
#endif
- currentVerse->Testament(0);
- currentVerse->Book(0);
- currentVerse->Chapter(0);
- currentVerse->Verse(0);
- writeEntry(*currentVerse, text);
+ currentVerse.Testament(0);
+ currentVerse.Book(0);
+ currentVerse.Chapter(0);
+ currentVerse.Verse(0);
+ writeEntry(text);
}
- strcpy(currentOsisID, token->getAttribute("osisID"));
- *currentVerse = currentOsisID;
- currentVerse->Chapter(0);
- currentVerse->Verse(0);
- inBookHeader = true;
+ currentVerse = token.getAttribute("osisID");
+ currentVerse.Chapter(0);
+ currentVerse.Verse(0);
+ strcpy(currentOsisID, currentVerse.getOSISRef());
+
+ inChapter = false;
+ inVerse = false;
+ inPreVerse = false;
+ inBookHeader = true;
inChapterHeader = false;
- lastTitle = "";
- text = "";
- bookDepth = tagStack.size();
- chapterDepth = 0;
- verseDepth = 0;
- inCanonicalOSISBook = isOSISAbbrev(token->getAttribute("osisID"));
+ bookDepth = tagStack.size();
+ chapterDepth = 0;
+ verseDepth = 0;
- return true;
+ inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID"));
+ if (!inCanonicalOSISBook) {
+ cout << "WARNING(V11N): New book is " << token.getAttribute("osisID") << " and is not in " << v11n << " versification, ignoring" << endl;
+ }
+#ifdef DEBUG
+ else if (debug & DEBUG_OTHER) {
+ cout << "DEBUG(FOUND): New book is " << currentVerse.getOSISRef() << endl;
+ }
+#endif
+
+ return false;
}
- // CHAPTER START
- else if (((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "chapter")))
- || (!strcmp(tokenName, "chapter"))
- ) {
- inVerse = false;
+ // CHAPTER START, <div type="chapter" ...> or <chapter ...>
+ if (((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "chapter"))) ||
+ (!strcmp(tokenName, "chapter"))
+ ) {
if (inBookHeader) {
#ifdef DEBUG
- cout << currentOsisID << ": BOOK HEADING "<< text.c_str() << endl;
+ if (debug & DEBUG_TITLE) {
+ cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK HEADING "<< text.c_str() << endl;
+ }
#endif
- writeEntry(*currentVerse, text);
+ writeEntry(text);
}
- strcpy(currentOsisID, token->getAttribute("osisID"));
- *currentVerse = currentOsisID;
- currentVerse->Verse(0);
- inBookHeader = false;
+ currentVerse = token.getAttribute("osisID");
+ currentVerse.Verse(0);
+#ifdef DEBUG
+ if (debug & DEBUG_OTHER) {
+ cout << "DEBUG(FOUND): Current chapter is " << currentVerse.getOSISRef() << " (" << token.getAttribute("osisID") << ")" << endl;
+ }
+#endif
+ strcpy(currentOsisID, currentVerse.getOSISRef());
+
+ inChapter = true;
+ inVerse = false;
+ inPreVerse = false;
+ inBookHeader = false;
inChapterHeader = true;
- lastTitle = "";
- text = "";
- chapterDepth = tagStack.size();
- verseDepth = 0;
- return true;
+ chapterDepth = tagStack.size();
+ verseDepth = 0;
+
+ return false;
}
- // VERSE OR COMMENTARY START
- else if (!strcmp(tokenName, "verse") ||
- (!strcmp(tokenName, "div") &&
- token->getAttribute("annotateType"))) {
+ // VERSE, <verse ...> OR COMMENTARY START, <div annotateType="xxx" ...>
+ if (!strcmp(tokenName, "verse") ||
+ (!strcmp(tokenName, "div") && token.getAttribute("annotateType"))) {
#ifdef DEBUG
- cout << "Entering verse" << endl;
+ if (debug & DEBUG_OTHER) {
+ cout << "DEBUG(FOUND): Entering verse" << endl;
+ }
#endif
- inVerse = true;
if (inChapterHeader) {
SWBuf heading = text;
-
- //make sure we don't insert the preverse title which belongs to the first verse of this chapter!
- // Did we have a preverse title?
- if (lastTitle.length())
- {
- //Was the preVerse title in the header (error if not)?
- const char* header = heading.c_str();
- const char* preVerse = strstr(header, lastTitle);
- if (preVerse) {
- if (preVerse == header) {
- heading = ""; // do nothing
- }
- else {
- // remove everything before the title from the beginning.
- text = preVerse;
- // Remove text from the end of the header.
- heading.setSize(preVerse - header);
- }
- }
- else {
- cout << currentOsisID << ": Warning: Bug in code. Could not find title." << endl;
- }
- }
- else {
- text = "";
- }
+ text = "";
if (heading.length()) {
#ifdef DEBUG
- cout << currentOsisID << ": CHAPTER HEADING "<< heading.c_str() << endl;
+ if (debug & DEBUG_TITLE) {
+ cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER HEADING "<< heading.c_str() << endl;
+ }
#endif
- writeEntry(*currentVerse, heading);
+ writeEntry(heading);
}
inChapterHeader = false;
}
- SWBuf keyVal = token->getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID");
+ // Did we have pre-verse material that needs to be marked?
+ if (inPreVerse) {
+ char genBuf[200];
+ sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" eID=\"pv%d\"/>", genID++);
+ text.append(genBuf);
+ }
+
+ // Get osisID for verse or annotateRef for commentary
+ SWBuf keyVal = token.getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID");
+
+ // Massage the key into a form that ParseVerseList can accept
prepareSWVerseKey(keyVal);
- lastVerseIDs = currentVerse->ParseVerseList(keyVal, *currentVerse, true);
- // set currentVerse to the first value in the keyVal
- VerseKey *element = SWDYNAMIC_CAST(VerseKey, lastVerseIDs.GetElement(0));
- if (element) {
- *currentVerse = element->LowerBound().getText();
+ // The osisID or annotateRef can be more than a single verse
+ // The first or only one is the currentVerse
+ // Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing.
+ // This should never happen if the references are valid OSIS references
+ ListKey verseKeys = currentVerse.ParseVerseList(keyVal, currentVerse, true);
+ int memberKeyCount = verseKeys.Count();
+ if (memberKeyCount) {
+ currentVerse = verseKeys.getElement(0);
+ // See if this osisID or annotateRef refers to more than one verse.
+ // If it does, save it until all verses have been seen.
+ // At that point we will output links.
+ // This can be done by incrementing, which will produce an error
+ // if there is only one verse.
+ verseKeys.setPosition(TOP);
+ verseKeys.increment(1);
+ if (!verseKeys.Error()) {
+ linkedVerses.push_back(verseKeys);
+ }
}
else {
- *currentVerse = lastVerseIDs.GetElement(0)->getText();
+ cout << "ERROR(REF): Invalid osisID/annotateRef: " << token.getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID") << endl;
}
- strcpy(currentOsisID, currentVerse->getOSISRef());
+ strcpy(currentOsisID, currentVerse.getOSISRef());
#ifdef DEBUG
- cout << "Current verse is " << *currentVerse << endl;
- cout << "osisID/annotateRef is adjusted to" << keyVal << endl;
+ if (debug & DEBUG_OTHER) {
+ cout << "DEBUG(FOUND): New current verse is " << currentVerse.getOSISRef() << endl;
+ cout << "DEBUG(FOUND): osisID/annotateRef is adjusted to: " << keyVal << endl;
+ }
#endif
- verseDepth = tagStack.size();
+ inVerse = true;
+ inPreVerse = false;
+ inBookHeader = false;
+ inChapterHeader = false;
+ verseDepth = tagStack.size();
- return true;
- }
- }
- // Handle stuff between the verses
- // Whitespace producing empty tokens are appended to prior entry
- // Also the quote
- // This is a hack to get ESV to work
- else if (!inTitle && !inVerse && token->isEmpty()) { // && !inBookHeader && !inChapterHeader) {
- if (!strcmp(tokenName, "p") ||
- !strcmp(tokenName, "div") ||
- !strcmp(tokenName, "q") ||
- !strcmp(tokenName, "l") ||
- !strcmp(tokenName, "lb") ||
- !strcmp(tokenName, "lg")
- ) {
+ // Include the token if it is not a verse
+ if (strcmp(tokenName, "verse")) {
+ text.append(token);
+ }
#ifdef DEBUG
- if (token) {
- cout << currentOsisID << ": appending interverse start token " << *token << ":" << text.c_str() << endl;
+ else if (debug & DEBUG_VERSE)
+ {
+ // transform the verse into a milestone
+ XMLTag t = "<milestone resp=\"v\" />";
+ // copy all the attributes of the verse element to the milestone
+ StringList attrNames = token.getAttributeNames();
+ for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
+ const char* attr = (*loop).c_str();
+ t.setAttribute(attr, token.getAttribute(attr));
}
+ text.append(t);
+ }
#endif
- SWBuf tmp = token->toString();
- writeEntry(*currentVerse, tmp);
+
+ if (inWOC) {
+ text.append(wocTag);
+ }
return true;
}
+ } // done with Handle Book, Chapter, and Verse (or commentary equivalent)
+
+ // Now consider everything else.
+
+ // Handle WOC quotes.
+ // Note this requires transformBSP to make them into milestones
+ // Otherwise have to do it here
+ if (!strcmp(tokenName, "q")) {
+ quoteStack.push(token);
#ifdef DEBUG
- else {
- if (token) {
- cout << currentOsisID << ": interverse start token " << *token << ":" << text.c_str() << endl;
- }
+ if (debug & DEBUG_QUOTE) {
+ cout << "DEBUG(QUOTE): " << currentOsisID << ": quote top(" << quoteStack.size() << ") " << token << endl;
}
#endif
+ if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
+ inWOC = true;
+
+ // Output per verse WOC markup.
+ text.append(wocTag);
+
+ // Output the quotation mark if appropriate, inside the WOC.
+ // If there is no marker attribute, let the SWORD engine manufacture one.
+ // If there is a marker attribute and it has content, then output that.
+ // If the marker attribute is present and empty, then there is nothing to do.
+ // And have it within the WOC markup
+ if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
+ token.setAttribute("who", 0); // remove the who="Jesus"
+ text.append(token);
+ }
+ return true;
+ }
+ return false;
+ }
+
+ // Have we found the start of pre-verse material?
+ // Pre-verse material follows the following rules
+ // 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book.
+ // 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter
+ // and the first verse of the chapter.
+ // A <div> with a type other than section will be taken as a chapter introduction.
+ // A <title> of type acrostic, psalm or no type, will be taken as a title for the verse.
+ // A <title> of type main or chapter will be seen as a chapter title.
+ // 3) Between verses, the material is split between the prior verse and the next verse.
+ // Basically, while end and empty tags are found, they belong to the prior verse.
+ // Once a begin tag is found, it belongs to the next verse.
+ // If the title has an attribute type of "main" or "chapter"
+ // it belongs to its <div> or <chapter> and is treated as part of its heading
+ // Otherwise if it a title in a chapter before the first the first verse it
+ // is put into the verse as a preverse title.
+
+ if (!inPreVerse && !inBookHeader) {
+ if (inChapterHeader) {
+ // Determine when we are no longer in a chapter heading, but in pre-verse material:
+ // If we see one of the following:
+ // a section div
+ // a title that is not main or chapter
+ if ((!strcmp(tokenName, "div") && (typeAttr && !strcmp(typeAttr, "section"))) ||
+ (!strcmp(tokenName, "title") && (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter"))))
+ ) {
+ // Since we have found the boundary, we need to write out the chapter heading
+ writeEntry(text);
+ // And we are no longer in the chapter heading
+ inChapterHeader = false;
+ // But rather, we are now in pre-verse material
+ inPreVerse = true;
+ }
+ }
+ else if (!inVerse && inChapter) {
+ inPreVerse = true;
+ }
+
+ if (inPreVerse) {
+ char genBuf[200];
+ sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID++);
+ text.append(genBuf);
+ }
}
- }
-//-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
+#ifdef DEBUG
+ if (debug & DEBUG_INTERVERSE) {
+ if (!inVerse && !inBookHeader && !inChapterHeader) {
+ cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl;
+ }
+ }
+#endif
+ return false;
+ } // Done with procesing start and empty tags
+
+ // Process end tags
else {
if (tagStack.empty()) {
- cout << currentOsisID << ": tag expected" << endl;
- exit(1);
+ cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl;
+ exit(EXIT_BAD_NESTING);
}
- XMLTag* topToken = 0;
- if (!token->isEmpty()) {
- topToken = tagStack.top();
+ // Note: empty end tags have the eID attribute
+ if (!token.isEmpty()) {
+ XMLTag topToken = tagStack.top();
tagDepth = tagStack.size();
-#ifdef DEBUG_STACK
- cout << currentOsisID << ": pop(" << tagDepth << ") " << topToken->getName() << endl;
+#ifdef DEBUG
+ if (debug & DEBUG_STACK) {
+ cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl;
+ }
#endif
tagStack.pop();
- if (strcmp(topToken->getName(), tokenName)) {
- cout << "Error: " << currentOsisID << ": Expected " << topToken->getName() << " found " << tokenName << endl;
-// exit(1); // I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here.
+ if (strcmp(topToken.getName(), tokenName)) {
+ cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl;
+// exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here.
// So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
+ // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting.
}
}
+ // We haven't seen the first div so there is nothing to do.
+ if (!firstDiv) {
+ // Collect the content so it can be used to suggest the module's conf.
+ return false;
+ }
+
// VERSE and COMMENTARY END
if (!strcmp(tokenName, "verse") || (inVerse && !strcmp(tokenName, "div"))) {
- inVerse = false;
if (tagDepth != verseDepth) {
- cout << "Warning verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl;
+ cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl;
}
- if (lastTitle.length()) {
- const char* end = strchr(lastTitle, '>');
+ // If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
+ if (inWOC) {
+ text.append("</q>");
+ }
+
+
+ // Include the token if it is not a verse
+ if (strcmp(tokenName, "verse")) {
+ text.append(token);
+ }
#ifdef DEBUG
- cout << currentOsisID << ":" << endl;
- cout << "\t" << lastTitle << endl;
- cout << "\tlength=" << int(end+1 - lastTitle.c_str()) << ", tag:" << lastTitle.c_str() << endl;
+ else if (debug & DEBUG_VERSE)
+ {
+ // transform the verse into a milestone
+ XMLTag t = "<milestone resp=\"v\" />";
+ // copy all the attributes of the verse element to the milestone
+ StringList attrNames = token.getAttributeNames();
+ for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
+ const char* attr = (*loop).c_str();
+ t.setAttribute(attr, token.getAttribute(attr));
+ }
+ text.append(t);
+ }
#endif
- SWBuf titleTagText;
- titleTagText.append(lastTitle.c_str(), end+1 - lastTitle.c_str());
+ writeEntry(text);
+
+ inVerse = false;
+ inPreVerse = false;
+ verseDepth = 0;
+
+ return true;
+ }
+
+ // Handle WOC quotes.
+ // Note this requires transformBSP to make them into milestones
+ // Otherwise have to manage it here
+ if (!strcmp(tokenName, "q")) {
+ XMLTag topToken = quoteStack.top();
#ifdef DEBUG
- cout << currentOsisID << ": tagText: " << titleTagText.c_str() << endl;;
+ if (debug & DEBUG_QUOTE) {
+ cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
+ }
#endif
+ quoteStack.pop();
- XMLTag titleTag(titleTagText);
- titleTag.setAttribute("type", "section");
- titleTag.setAttribute("subType", "x-preverse");
-
- //we insert the title into the text again - make sure to remove the old title text
- const char* pos = strstr(text, lastTitle);
- if (pos) {
- SWBuf temp;
- temp.append(text, pos-text.c_str());
- temp.append(pos+lastTitle.length());
- text = temp;
+ // If we have found an end tag for a <q who="Jesus"> then we are done with the WOC
+ // and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
+ if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
+#ifdef DEBUG
+ if (debug & DEBUG_QUOTE) {
+ cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
}
-
- //if a title was already inserted at the beginning insert this one after that first title
- int titlePos = 0;
- if (!strncmp(text.c_str(),"<title ",7)) {
- const char* tmp = strstr(text.c_str(), "</title>");
- if (tmp) {
- titlePos = (tmp-text.c_str()) + 8;
- }
+#endif
+ inWOC = false;
+ const char *sID = topToken.getAttribute("sID");
+ const char *eID = token.getAttribute("eID");
+ if (!sID) {
+ sID = "";
}
- text.insert(titlePos, end+1);
- text.insert(titlePos, titleTag);
- }
- // text += token;
- writeEntry(*currentVerse, text);
-
- // If we found an osisID like osisID="Gen.1.1 Gen.1.2 Gen.1.3" we have to link Gen.1.2 and Gen.1.3 to Gen.1.1
- VerseKey dest = *currentVerse;
- VerseKey linkKey;
- linkKey.AutoNormalize(0);
- linkKey.Headings(1); // turn on mod/testmnt/book/chap headings
- linkKey.Persist(1);
- for (lastVerseIDs = TOP; !lastVerseIDs.Error(); lastVerseIDs++) {
- linkKey = lastVerseIDs;
-
- if (linkKey.Verse() != dest.Verse() ||
- linkKey.Chapter() != dest.Chapter() ||
- linkKey.Book() != dest.Book() ||
- linkKey.Testament() != dest.Testament())
- {
- *currentVerse = linkKey;
- linkToEntry(dest);
+ if (!eID) {
+ eID = "";
+ }
+ if (strcmp(sID, eID)) {
+ cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl;
}
- }
- lastTitle = "";
- text = "";
- verseDepth = 0;
- return true;
+
+ // Output the quotation mark if appropriate, inside the WOC.
+ // If there is no marker attribute, let the SWORD engine manufacture one.
+ // If there is a marker attribute and it has content, then output that.
+ // If the marker attribute is present and empty, then there is nothing to do.
+ // And have it within the WOC markup
+ if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
+ token.setAttribute("who", 0); // remove the who="Jesus"
+ text.append(token);
+ }
+
+ // Now close the WOC
+ text.append("</q>");
+ return true;
+ }
+ return false;
}
- else if (!inTitle && !inVerse && !inBookHeader && !inChapterHeader) {
+
+ // Look for the end of document, book and chapter
+ // Also for material that goes with last entry
+ if (!inVerse && !inBookHeader && !inChapterHeader) {
// Is this the end of a chapter.
if (tagDepth == chapterDepth && (!strcmp(tokenName, "div") || !strcmp(tokenName, "chapter"))) {
+ text.append(token);
+ writeEntry(text);
+ inChapter = false;
chapterDepth = 0;
- verseDepth = 0;
- text = "";
+ verseDepth = 0;
return true;
}
- // Or is it the end of a book
- else if (tagDepth == bookDepth && (!strcmp(tokenName, "div"))) {
- bookDepth = 0;
+
+ // Is it the end of a book
+ if (tagDepth == bookDepth && (!strcmp(tokenName, "div"))) {
+ text.append(token);
+ writeEntry(text);
+ bookDepth = 0;
chapterDepth = 0;
- verseDepth = 0;
- text = "";
+ verseDepth = 0;
return true;
}
- // Or is it the end of an osis document
- else if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) {
- bookDepth = 0;
+
+ // Do not include the end of an osis document
+ if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) {
+ bookDepth = 0;
chapterDepth = 0;
- verseDepth = 0;
- text = "";
+ verseDepth = 0;
+ text = "";
return true;
}
- // OTHER MISC END TAGS WHEN !INVERSE
- // Test that is between verses, or after the last is appended to the preceeding verse.
- else if (!strcmp(tokenName, "p") ||
- !strcmp(tokenName, "div") ||
- !strcmp(tokenName, "q") ||
- !strcmp(tokenName, "l") ||
- !strcmp(tokenName, "lb") ||
- !strcmp(tokenName, "lg")
- ) {
- text.append(*token);
- writeEntry(*currentVerse, text);
- text = "";
+
+ // When we are not inPreVerse, the interverse tags get appended to the preceeding verse.
+ if (!inPreVerse) {
+ text.append(token);
+ writeEntry(text);
#ifdef DEBUG
- cout << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
+ if (debug & DEBUG_INTERVERSE) {
+ cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
+ }
#endif
return true;
}
+
#ifdef DEBUG
- cout << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
+ if (debug & DEBUG_INTERVERSE) {
+ cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
+ }
#endif
+ return false;
+
}
- }
+
+ return false;
+ } // done with Processing end tags
+
return false;
}
-XMLTag* transform(XMLTag* t) {
-	static std::stack<XMLTag*> tagStack;
+/**
+ * Support normalizations necessary for a SWORD module.
+ * OSIS allows for document structure (Book, Section, Paragraph or BSP)
+ * to overlap Bible versification (Book, Chapter, Verse).
+ * Most SWORD applications need to display verses in isolation or in HTML table cells,
+ * requiring each stored entry (i.e. verses) to be well-formed xml.
+ * This routine normalizes container elements which could cross verse boundaries into milestones:
+ * a start tag is made empty and given a generated sID, and the matching end tag is replaced
+ * by a copy of that start tag carrying the same value as eID instead.
+ * p has no milestone form, so it is first rewritten as <div type="paragraph" /> and then
+ * given an sID like the other containers.
+ * Empty tags are returned unchanged. Start tags outside the milestoned set are still pushed
+ * on the internal stack so that their end tags pair up, but are otherwise left alone.
+ * An end tag that arrives while no start tag is pending is returned unchanged.
+ * param t the tag to transform
+ * return the transformed tag or the original one
+ */
+XMLTag transformBSP(XMLTag t) {
+	static std::stack<XMLTag> bspTagStack;
 	static int sID = 1;
-	char buf[11];
+	// "gen" + the longest decimal form of a 32-bit int (11 chars) + terminator needs 15 bytes;
+	// the previous 11-byte buffer overflowed once the counter reached eight digits.
+	char buf[16];
 	// Support simplification transformations
-	if (!t->isEmpty()) {
-		if (!t->isEndTag()) {
-			tagStack.push(t);
-#ifdef DEBUG_XFORM
-			cout << currentOsisID << ": xform push (" << tagStack.size() << ") " << t->getName() << endl;
+	if (t.isEmpty()) {
+#ifdef DEBUG
+		if (debug & DEBUG_XFORM) {
+			cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl;
+		}
 #endif
-			// Transform <q> into <q sID=""/> except for <q who="Jesus">
-			if ((!strcmp(t->getName(), "q")) && (!t->getAttribute("who") || strcmp(t->getAttribute("who"), "Jesus"))) {
-				t->setEmpty(true);
-				sprintf(buf, "q%d", sID++);
-				t->setAttribute("sID", buf);
-			}
+		return t;
+	}
-			// Transform <p> into <lb type="x-begin-paragraph"/>
-			else if (!strcmp(t->getName(), "p")) {
-				// note there is no process that should care about type, it is there for reversability
-				t->setText("<lb type=\"x-begin-paragraph\" />");
-			}
+	// Work from a private copy of the name: setText() below rebuilds the tag, so a pointer
+	// obtained from getName() beforehand may no longer be valid when the debug trace reads it.
+	// NOTE(review): confirm against XMLTag::setText.
+	const SWBuf nameCopy = t.getName();
+	const char* tagName = nameCopy.c_str();
+	if (!t.isEndTag()) {
+		// Transform <p> into <div type="paragraph"> and milestone it
+		if (!strcmp(tagName, "p")) {
+			t.setText("<div type=\"paragraph\" />");
+			sprintf(buf, "gen%d", sID++);
+			t.setAttribute("sID", buf);
+		}
+
+		// Transform <tag> into <tag sID="">, where tag is a milestoneable element.
+		// The following containers are milestoneable.
+		// abbr, closer, div, foreign, l, lg, salute, signed, speech
+		// Leaving out:
+		//   abbr       When would this ever cross a boundary?
+		//   seg        as it is used for a divineName hack
+		//   foreign    so that it can be easily italicized
+		else if (!strcmp(tagName, "chapter") ||
+			 !strcmp(tagName, "closer")  ||
+			 !strcmp(tagName, "div")     ||
+			 !strcmp(tagName, "l")       ||
+			 !strcmp(tagName, "lg")      ||
+			 !strcmp(tagName, "q")       ||
+			 !strcmp(tagName, "salute")  ||
+			 !strcmp(tagName, "signed")  ||
+			 !strcmp(tagName, "speech")  ||
+			 !strcmp(tagName, "verse")
+		) {
+			t.setEmpty(true);
+			sprintf(buf, "gen%d", sID++);
+			t.setAttribute("sID", buf);
+		}
+		// Every start tag is remembered, transformed or not, so end tags pair up by position
+		bspTagStack.push(t);
+#ifdef DEBUG
+		if (debug & DEBUG_XFORM) {
+			cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl;
+			XMLTag topToken = bspTagStack.top();
+			cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl;
 		}
-		else {
-			XMLTag *topToken = tagStack.top();
-#ifdef DEBUG_XFORM
-			cout << currentOsisID << ": xform pop(" << tagStack.size() << ") " << topToken->getName() << endl;
 #endif
-			tagStack.pop();
+	}
+	else {
+		// A stray end tag has no start tag to pair with. Calling top() on an empty
+		// std::stack is undefined behaviour, so hand the tag back untouched instead.
+		if (bspTagStack.empty()) {
+			return t;
+		}
+		XMLTag topToken = bspTagStack.top();
+#ifdef DEBUG
+		if (debug & DEBUG_XFORM) {
+			cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << bspTagStack.size() << ") " << topToken << endl;
+		}
+#endif
+		bspTagStack.pop();
+
+		// Look for the milestoneable container tags handled above.
+		if (!strcmp(tagName, "chapter") ||
+		    !strcmp(tagName, "closer")  ||
+		    !strcmp(tagName, "div")     ||
+		    !strcmp(tagName, "l")       ||
+		    !strcmp(tagName, "lg")      ||
+		    !strcmp(tagName, "p")       ||
+		    !strcmp(tagName, "q")       ||
+		    !strcmp(tagName, "salute")  ||
+		    !strcmp(tagName, "signed")  ||
+		    !strcmp(tagName, "speech")  ||
+		    !strcmp(tagName, "verse")
+		) {
+			// make this a clone of the start tag with sID changed to eID
+			// Note: in the case of </p> the topToken is a <div type="paragraph">
+			t = topToken;
+			t.setAttribute("eID", t.getAttribute("sID"));
+			t.setAttribute("sID", 0);
+		}
+	}
-			// If we have found an end tag for a <q> that was transformed then transform this one as well.
-			if ((!strcmp(t->getName(), "q")) && (!strcmp(topToken->getName(), "q")) && (!topToken->getAttribute("who") || strcmp(topToken->getAttribute("who"), "Jesus"))) {
-				// make this a clone of the start tag with sID changed to eID
-				*t = *topToken;
-				t->setAttribute("eID", t->getAttribute("sID"));
-				t->setAttribute("sID", 0);
-			}
+	return t;
+}
-			// Look for paragraph tags.
-			// If we have found an end tag for a <p> that was transformed then transform this as well.
-			else if ((!strcmp(t->getName(), "p")) && (!strcmp(topToken->getName(), "lb"))) {
-				t->setText("<lb type=\"x-end-paragraph\" />");
-			}
+/**
+ * Emit every link that was set aside while the document was being read.
+ * The links are written last because writeEntry might ultimately append
+ * text to a verse, moving its offset in the data file.
+ * Postponing each write until the next verse has been gathered reduces
+ * that risk, but one scenario remains:
+ * A module uses linked verses and also has verses that are not in the
+ * chosen versification. When such an out-of-canon verse directly follows
+ * a linked verse, it is appended to the prior verse. Care has to be taken
+ * that the linked verses all point to the first of the set.
+ */
+void writeLinks()
+{
+	// Key of the verse that holds the text; every alias of a set is linked to it
+	VerseKey target;
+	target.setVersificationSystem(currentVerse.getVersificationSystem());
+	target.AutoNormalize(0);
+	target.Headings(1);
+
+	// Key of the alias that is currently being linked
+	VerseKey alias;
+	alias.setVersificationSystem(currentVerse.getVersificationSystem());
+	alias.AutoNormalize(0);
+	alias.Headings(1);
+
+	for (unsigned int idx = 0; idx < linkedVerses.size(); idx++) {
+		// Each saved list starts with the real verse;
+		// all later members are links to it.
+		ListKey members = linkedVerses[idx];
+		members.setPosition(TOP);
+		target = members.getElement();
+
+		// Walk the remaining members; the walk ends when incrementing reports an error.
+		for (members.increment(1); !members.Error(); members.increment(1)) {
+			alias = members.getElement();
+			linkToEntry(alias, target);
 		}
 	}
-	return t;
 }
void usage(const char *app, const char *error = 0) {
@@ -875,6 +1262,8 @@ void usage(const char *app, const char *error = 0) {
if (error) fprintf(stderr, "\n%s: %s\n", app, error);
fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app);
+ fprintf(stderr, " <output/path>\t\t an existing folder that the module will be written\n");
+ fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to read from standard input\n");
fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n");
fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n");
fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n");
@@ -882,15 +1271,133 @@ void usage(const char *app, const char *error = 0) {
fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n");
fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
- fprintf(stderr, " -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n");
- fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC");
- fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n");
- exit(-1);
+ fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n");
+ fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
+ fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n");
+ fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n");
+ fprintf(stderr, " -s <2|4>\t\t max text size per entry (default is 2).\n");
+ fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large entries\n");
+ fprintf(stderr, "\t\t\t\t in uncompressed modules (default is 65535 bytes)\n");
+ fprintf(stderr, " -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n");
+ fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:\n");
+ VerseMgr *vmgr = VerseMgr::getSystemVerseMgr();
+ StringList av11n = vmgr->getVersificationSystems();
+ for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) {
+ fprintf(stderr, "\t\t\t\t\t%s\n", (*loop).c_str());
+ }
+#ifdef DEBUG
+ fprintf(stderr, " -d <flags>\t\t turn on debugging (default is 0)\n");
+ fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n");
+ fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n");
+ fprintf(stderr, "\t\t\t\t\t0 - no debugging\n");
+ fprintf(stderr, "\t\t\t\t\t1 - writes to module, very verbose\n");
+ fprintf(stderr, "\t\t\t\t\t2 - verse start and end\n");
+ fprintf(stderr, "\t\t\t\t\t4 - quotes, especially Words of Christ (WOC)\n");
+ fprintf(stderr, "\t\t\t\t\t8 - titles\n");
+ fprintf(stderr, "\t\t\t\t\t16 - inter-verse material\n");
+ fprintf(stderr, "\t\t\t\t\t32 - BSP to BCV transformations\n");
+ fprintf(stderr, "\t\t\t\t\t64 - v11n exceptions\n");
+ fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n");
+ fprintf(stderr, "\t\t\t\t\t256 - internal stack\n");
+ fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n");
+ fprintf(stderr, "\t\t\t\t This flag can be used more than once.\n");
+#endif
+ fprintf(stderr, "\n");
+ fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n");
+ fprintf(stderr, "\n");
+ exit(EXIT_BAD_ARG);
+}
+
+/**
+ * Read an OSIS document from a stream, split it into tags and text,
+ * and hand the pieces to the module writer.
+ * Characters between '<' and '>' are gathered into a tag; everything else goes to the
+ * text buffer, with adjacent whitespace merged into a single blank.
+ * Each completed tag whose second or third character is alphabetic is passed through
+ * transformBSP and then handleToken; when handleToken returns false the tag is
+ * appended to the text buffer.
+ * Once the input is exhausted the last entry is forced out and the links are written.
+ * param infile the stream holding the OSIS document
+ */
+void processOSIS(istream& infile) {
+	activeOsisID[0] = '\0';
+
+	strcpy(currentOsisID,"N/A");
+
+	currentVerse.setVersificationSystem(v11n);
+	currentVerse.AutoNormalize(0);
+	currentVerse.Headings(1);	// turn on mod/testmnt/book/chap headings
+	currentVerse.Persist(1);
+
+	module->setKey(currentVerse);
+	module->setPosition(TOP);
+
+	SWBuf token;
+	SWBuf text;
+	bool intoken = false;
+	bool inWhitespace = false;
+	bool seeingSpace = false;
+	char curChar = '\0';
+
+	while (infile.good()) {
+
+		// get() returns an int so that end-of-input can be told apart from the byte 0xFF.
+		// Testing the value after narrowing it to char either dropped real 0xFF bytes
+		// (signed char) or never recognised end-of-input (unsigned char).
+		const int nextByte = infile.get();
+		if (nextByte == EOF) {
+			break;
+		}
+		curChar = static_cast<char>(nextByte);
+
+		if (!intoken && curChar == '<') {
+			intoken = true;
+			token = "<";
+			continue;
+		}
+
+		// Outside of tokens merge adjacent whitespace
+		if (!intoken) {
+			// The <ctype.h> classifiers are only defined for unsigned char values and EOF
+			seeingSpace = isspace(static_cast<unsigned char>(curChar)) != 0;
+			if (seeingSpace) {
+				if (inWhitespace) {
+					continue;
+				}
+				// convert all whitespace to blanks
+				curChar = ' ';
+			}
+			inWhitespace = seeingSpace;
+		}
+
+		if (intoken && curChar == '>') {
+			intoken = false;
+			inWhitespace = false;
+			token.append('>');
+			// take this isalpha if out to check for bugs in text
+			if (isalpha(static_cast<unsigned char>(token[1])) ||
+			    isalpha(static_cast<unsigned char>(token[2]))) {
+				//cout << "Handle:" << token.c_str() << endl;
+				XMLTag t = transformBSP(token.c_str());
+
+				if (!handleToken(text, t)) {
+					text.append(t);
+				}
+			}
+			continue;
+		}
+
+		if (intoken) {
+			token.append(curChar);
+		}
+		else {
+			switch (curChar) {
+				case '>' : text.append("&gt;"); break;
+				case '<' : text.append("&lt;"); break;
+				default  : text.append(curChar); break;
+			}
+		}
+	}
+
+	// Force the last entry from the text buffer.
+	text = "";
+	writeEntry(text, true);
+	writeLinks();
+
+#ifdef _ICU_
+	if (converted)  fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
+	if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
+#endif
 }
int main(int argc, char **argv) {
- fprintf(stderr, "You are running osis2mod: $Rev: 2169 $\n");
+ fprintf(stderr, "You are running osis2mod: $Rev: 2400 $\n");
// Let's test our command line arguments
if (argc < 3) {
@@ -898,14 +1405,15 @@ int main(int argc, char **argv) {
}
// variables for arguments, holding defaults
- const char* program = argv[0];
- const char* path = argv[1];
- const char* osisDoc = argv[2];
- int append = 0;
- int compType = 0;
- int iType = 4;
- string cipherKey = "";
-
+ const char* program = argv[0];
+ const char* path = argv[1];
+ const char* osisDoc = argv[2];
+ int append = 0;
+ SWBuf compType = "";
+ bool isCommentary = false;
+ int iType = 4;
+ int entrySize = 0;
+ SWBuf cipherKey = "";
SWCompress *compressor = 0;
for (int i = 3; i < argc; i++) {
@@ -913,12 +1421,14 @@ int main(int argc, char **argv) {
append = 1;
}
else if (!strcmp(argv[i], "-z")) {
- if (compType) usage(*argv, "Cannot specify both -z and -Z");
- compType = 2;
+ if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+ if (entrySize) usage(*argv, "Cannot specify both -z and -s");
+ compType = "ZIP";
}
else if (!strcmp(argv[i], "-Z")) {
- if (compType) usage(*argv, "Cannot specify both -z and -Z");
- compType = 1;
+ if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
+ if (entrySize) usage(*argv, "Cannot specify both -Z and -s");
+ compType = "LZSS";
}
else if (!strcmp(argv[i], "-b")) {
if (i+1 < argc) {
@@ -934,62 +1444,127 @@ int main(int argc, char **argv) {
if (i+1 < argc) cipherKey = argv[++i];
else usage(*argv, "-c requires <cipher_key>");
}
+ else if (!strcmp(argv[i], "-v")) {
+ if (i+1 < argc) v11n = argv[++i];
+ else usage(*argv, "-v requires <v11n>");
+ }
+ else if (!strcmp(argv[i], "-s")) { // -s <2|4>: stores the requested entry size in entrySize
+ if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z"); // a compression type was already chosen by an earlier -z/-Z
+ if (i+1 < argc) {
+ entrySize = atoi(argv[++i]);
+ if (entrySize == 2 || entrySize == 4) {
+ continue; // valid size given; skip the usage() call below and go on to the next argument
+ }
+ }
+ usage(*argv, "-s requires one of <2|4>"); // reached when the value is missing or is neither 2 nor 4
+ }
+ else if (!strcmp(argv[i], "-C")) {
+ isCommentary = true;
+ }
+#ifdef DEBUG
+ else if (!strcmp(argv[i], "-d")) {
+ if (i+1 < argc) debug |= atoi(argv[++i]);
+ else usage(*argv, "-d requires <flags>");
+ }
+#endif
else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
}
- switch (compType) { // these are deleted by zText
- case 0: break;
- case 1: compressor = new LZSSCompress(); break;
- case 2: compressor = new ZipCompress(); break;
- }
+ if (compType == "ZIP") { // compType is set to "ZIP" by the -z option
+ compressor = new ZipCompress();
+ }
+ else if (compType == "LZSS") { // set by -Z; must compare (==), not assign (=), so an empty compType leaves compressor null
+ compressor = new LZSSCompress();
+ }
#ifndef _ICU_
if (normalize) {
normalize = false;
- cout << program << " is not compiled with support for ICU. Ignoring -n flag." << endl;
+ cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl;
}
#endif
#ifdef DEBUG
- cout << "path: " << path << " osisDoc: " << osisDoc << " create: " << append << " compressType: " << compType << " blockType: " << iType << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n";
- cout << "";
-// exit(-3);
+ if (debug & DEBUG_OTHER) {
+ cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
+ }
#endif
-
if (!append) { // == 0 then create module
// Try to initialize a default set of datafiles and indicies at our
// datapath location passed to us from the user.
- if ( compressor ) {
- if ( zText::createModule(path, iType) ) {
- fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program, path);
- exit(-3);
+ if (compressor) {
+ if (zText::createModule(path, iType, v11n)) {
+ fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
+ exit(EXIT_NO_CREATE);
+ }
+ }
+ else if (entrySize == 4) {
+ if (RawText4::createModule(path, v11n)) {
+ fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
+ exit(EXIT_NO_CREATE);
}
}
- else if (RawText::createModule(path)) {
- fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program, path);
- exit(-3);
+ else {
+ if (RawText::createModule(path, v11n)) {
+ fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
+ exit(EXIT_NO_CREATE);
+ }
}
}
- // Let's see if we can open our input file
- ifstream infile(osisDoc);
- if (infile.fail()) {
- fprintf(stderr, "error: %s: couldn't open input file: %s \n", program, osisDoc);
- exit(-2);
- }
-
// Do some initialization stuff
if (compressor) {
- module = new zText(path, 0, 0, iType, compressor);
+ // Create a compressed text module allowing very large entries
+ // Taking defaults except for first, fourth, fifth and last argument
+ module = new zText(
+ path, // ipath
+ 0, // iname
+ 0, // idesc
+ iType, // iblockType
+ compressor, // icomp
+ 0, // idisp
+ ENC_UNKNOWN, // enc
+ DIRECTION_LTR, // dir
+ FMT_UNKNOWN, // markup
+ 0, // lang
+ v11n // versification
+ );
+ }
+ else if (entrySize == 4) {
+ // Create a raw text module allowing very large entries
+ // Taking defaults except for first and last argument
+ module = new RawText4(
+ path, // ipath
+ 0, // iname
+ 0, // idesc
+ 0, // idisp
+ ENC_UNKNOWN, // encoding
+ DIRECTION_LTR, // dir
+ FMT_UNKNOWN, // markup
+ 0, // ilang
+ v11n // versification
+ );
}
- else{
- module = new RawText(path); // open our datapath with our RawText driver.
+ else {
+ // Create a raw text module allowing reasonable sized entries
+ // Taking defaults except for first and last argument
+ module = new RawText(
+ path, // ipath
+ 0, // iname
+ 0, // idesc
+ 0, // idisp
+ ENC_UNKNOWN, // encoding
+ DIRECTION_LTR, // dir
+ FMT_UNKNOWN, // markup
+ 0, // ilang
+ v11n // versification
+ );
}
SWFilter *cipherFilter = 0;
- if (!cipherKey.empty()) {
+ if (cipherKey.length()) {
fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
cipherFilter = new CipherFilter(cipherKey.c_str());
module->AddRawFilter(cipherFilter);
@@ -997,94 +1572,29 @@ int main(int argc, char **argv) {
if (!module->isWritable()) {
fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
- exit(-1);
+ exit(EXIT_NO_WRITE);
}
- activeOsisID[0] = '\0';
- strcpy(currentOsisID,"N/A");
-
- currentVerse = new VerseKey();
- currentVerse->AutoNormalize(0);
- currentVerse->Headings(1); // turn on mod/testmnt/book/chap headings
- currentVerse->Persist(1);
-
- module->setKey(*currentVerse);
-
- (*module) = TOP;
-
- SWBuf token;
- SWBuf text;
- bool intoken = false;
- bool inWhitespace = false;
- bool seeingSpace = false;
- char curChar = '\0';
-
- while (infile.good()) {
-
- curChar = infile.get();
-
- // skip the character if it is bad. infile.good() will catch the problem
- if (curChar == -1) {
- continue;
- }
-
- if (!intoken && curChar == '<') {
- intoken = true;
- token = "<";
- continue;
- }
-
- // Outside of tokens merge adjacent whitespace
- if (!intoken) {
- seeingSpace = isspace(curChar);
- if (seeingSpace) {
- if (inWhitespace) {
- continue;
- }
- // convert all whitespace to blanks
- curChar = ' ';
- }
- inWhitespace = seeingSpace;
- }
-
- if (intoken && curChar == '>') {
- intoken = false;
- inWhitespace = false;
- token.append('>');
- // take this isalpha if out to check for bugs in text
- if ((isalpha(token[1])) || (isalpha(token[2]))) {
- //cout << "Handle:" << token.c_str() << endl;
- XMLTag *t = new XMLTag(token.c_str());
-
- if (!handleToken(text, transform(t))) {
- text.append(*t);
- }
- }
- continue;
+ // Either read from std::cin (aka stdin), when the argument is a '-'
+ // or from a specified file.
+ if (!strcmp(osisDoc, "-")) {
+ processOSIS(cin);
+ }
+ else {
+ // Let's see if we can open our input file
+ ifstream infile(osisDoc);
+ if (infile.fail()) {
+ fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc);
+ exit(EXIT_NO_READ);
}
-
- if (intoken)
- token.append(curChar);
- else
- switch (curChar) {
- case '>' : text.append("&gt;"); break;
- case '<' : text.append("&lt;"); break;
- default : text.append(curChar); break;
- }
+ processOSIS(infile);
+ infile.close();
}
- // Force the last entry from the text buffer.
- text = "";
- writeEntry(*currentVerse, text, true);
delete module;
- delete currentVerse;
if (cipherFilter)
delete cipherFilter;
- infile.close();
-#ifdef _ICU_
- if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
- if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
-#endif
+ exit(0); // success
}