summary | refs | log | tree | commit | diff
path: root/utilities/osis2mod.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'utilities/osis2mod.cpp')
-rw-r--r-- utilities/osis2mod.cpp 1090
1 files changed, 1090 insertions, 0 deletions
diff --git a/utilities/osis2mod.cpp b/utilities/osis2mod.cpp
new file mode 100644
index 0000000..69d984d
--- /dev/null
+++ b/utilities/osis2mod.cpp
@@ -0,0 +1,1090 @@
+#include <ctype.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+#include <stack>
+#include <iostream>
+#include <fstream>
+
+#include <utilstr.h>
+#include <swmgr.h>
+#include <rawtext.h>
+#include <swbuf.h>
+#include <utilxml.h>
+#include <listkey.h>
+#include <versekey.h>
+
+#include <ztext.h>
+// #include <zld.h>
+// #include <zcom.h>
+#include <lzsscomprs.h>
+#include <zipcomprs.h>
+#include <cipherfil.h>
+
+#ifdef _ICU_
+#include <utf8nfc.h>
+#include <latin1utf8.h>
+#endif
+
+//#define DEBUG
+
+// Debug for simple transformation stack
+//#define DEBUG_XFORM
+
+// Debug for parsing osisRefs
+//#define DEBUG_REF
+
+// Debug for tag stack
+//#define DEBUG_STACK
+
+#ifndef NO_SWORD_NAMESPACE
+using namespace sword;
+#endif
+
+using namespace std;
+
+#ifdef _ICU_
+// Filter used by writeEntry to normalize verse text to NFC, and the number of
+// verses whose text was actually changed by it.
+UTF8NFC normalizer;
+int normalized = 0;
+
+// Filter used by writeEntry to convert non-UTF-8 verse text (assumed there to be
+// Latin1/cp1252) to UTF-8, and the number of verses it was applied to.
+Latin1UTF8 converter;
+int converted = 0;
+#endif
+
+// The module being written; created in main.
+SWText *module = 0;
+// The verse position currently being filled; created in main and moved by handleToken.
+VerseKey *currentVerse = 0;
+// osisID of the verse whose text writeEntry is still accumulating ("" until the first call).
+char activeOsisID[255];
+// osisID of the most recently seen book, chapter or verse; used in diagnostics.
+char currentOsisID[255];
+// OSIS book abbreviations accepted by isOSISAbbrev: 39 Old Testament followed by 27 New Testament.
+const char *osisabbrevs[] = {"Gen", "Exod", "Lev", "Num", "Deut", "Josh", "Judg",
+ "Ruth", "1Sam", "2Sam", "1Kgs", "2Kgs", "1Chr", "2Chr", "Ezra", "Neh",
+ "Esth", "Job", "Ps", "Prov", "Eccl", "Song", "Isa", "Jer", "Lam", "Ezek",
+ "Dan", "Hos", "Joel", "Amos", "Obad", "Jonah", "Mic", "Nah", "Hab",
+ "Zeph", "Hag", "Zech", "Mal",
+
+ "Matt", "Mark", "Luke", "John", "Acts", "Rom", "1Cor", "2Cor", "Gal",
+ "Eph", "Phil", "Col", "1Thess", "2Thess", "1Tim", "2Tim", "Titus",
+ "Phlm", "Heb", "Jas", "1Pet", "2Pet", "1John", "2John", "3John",
+ "Jude", "Rev"};
+
+static bool inCanonicalOSISBook = true; // true while the current book's osisID is listed in osisabbrevs; writeEntry does nothing when false
+static bool normalize = true; // Whether to normalize UTF-8 to NFC
+
+/**
+ * Report whether buf is exactly one of the OSIS book abbreviations
+ * listed in osisabbrevs. The comparison is case-sensitive.
+ *
+ * param buf the candidate abbreviation; may be null
+ * return true if buf equals an entry of osisabbrevs,
+ *        false otherwise (including when buf is null)
+ */
+bool isOSISAbbrev(const char *buf) {
+    // Callers pass the result of an attribute lookup, which this file treats
+    // as possibly null; strcmp on a null pointer would be undefined.
+    if (!buf) {
+        return false;
+    }
+
+    // Take the entry count from the table itself so the two cannot drift apart.
+    const size_t abbrevCount = sizeof(osisabbrevs) / sizeof(osisabbrevs[0]);
+    for (size_t i = 0; i < abbrevCount; i++) {
+        if (!strcmp(buf, osisabbrevs[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
+
+/**
+ * Classify a NUL-terminated byte string as 7-bit ASCII, UTF-8, or neither.
+ *
+ * Every byte with the high bit set must begin a well-formed multi-byte
+ * sequence of one of these shapes:
+ *   Unicode Range              1st       2nd       3rd       4th
+ *   U-00000080 - U-000007FF    110nnnnn  10nnnnnn
+ *   U-00000800 - U-0000FFFF    1110nnnn  10nnnnnn  10nnnnnn
+ *   U-00010000 - U-001FFFFF    11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
+ * The number of leading 1 bits in the first byte is the total length of the
+ * sequence; sequences longer than 4 bytes are rejected.
+ *
+ * Only the byte patterns are checked. The decoded code point is not
+ * examined, so its value is not range-checked.
+ *
+ * param txt the text to check
+ * return  1 if the text contains multi-byte sequences and all are well formed
+ *        -1 if the text contains no high-order bytes at all (pure ASCII)
+ *         0 if some high-order byte is not part of a well-formed sequence
+ * author DM Smith
+ */
+int detectUTF8(const char *txt) {
+    unsigned int multiByteChars = 0;
+
+    // Walk the bytes as unsigned so masks and shifts behave predictably
+    for (const unsigned char *cur = (const unsigned char *) txt; *cur; ++cur) {
+        // Plain 7-bit byte: nothing to validate
+        if (!(*cur & 0x80)) {
+            continue;
+        }
+
+        // Count the leading 1 bits of the lead byte
+        int leadBits = 0;
+        for (unsigned char bits = *cur; bits & 0x80; bits <<= 1) {
+            ++leadBits;
+        }
+
+        // 1 leading bit is a stray continuation byte (10nnnnnn);
+        // 5 or more is not a legal lead byte
+        if (leadBits < 2 || leadBits > 4) {
+            return 0;
+        }
+
+        // The lead byte must be followed by (leadBits - 1) bytes of 10nnnnnn
+        for (int pending = leadBits - 1; pending > 0; --pending) {
+            ++cur;
+            // The text ended in the middle of a sequence
+            if (!*cur) {
+                return 0;
+            }
+            if ((*cur & 0xc0) != 0x80) {
+                return 0;
+            }
+        }
+
+        ++multiByteChars;
+    }
+
+    // Nothing was rejected: the text is either UTF-8 or pure ASCII
+    if (multiByteChars) {
+        return 1;
+    }
+    return -1;
+}
+
+// This routine converts an osisID or osisRef into one that SWORD can parse into a verse list
+// An osisRef is made up of:
+// a single osisID
+// an osisID-osisID
+// or
+// an osisRef osisRef
+//
+// An osisID can have a work prefix which is terminated by a : and may have a grain
+// which is started by a !
+//
+// However, SWORD cannot handle work prefixes or grains and expects ranges to be
+// separated with a single ';'
+//
+// Example: "Bible:Gen.1.1!a-Gen.1.3 Gen.2.1" becomes "Gen.1.1-Gen.1.3;Gen.2.1"
+//
+// param buf the reference to rewrite; it is modified in place and never grows
+void prepareSWVerseKey(SWBuf &buf) {
+ // This routine modifies the buf in place
+ // s is the write position and p the read position; s never passes p,
+ // so characters are only ever moved towards the front of the buffer.
+ char* s = buf.getRawData();
+ char* p = s;
+ bool inRange = false;
+ while (*p) {
+ if (inRange) {
+#ifdef DEBUG_REF
+ cout << "Copy range marker:" << *p << endl;;
+#endif
+ // Range markers are copied as is
+ *s++ = *p++;
+ }
+
+ // Look ahead to see if we are in a work prefix
+ // but don't look past an osisID
+ char *n = p;
+ while (*n && *n != ':' && *n != ' ' && *n != '-') {
+ n++;
+ }
+
+ // We have found a work prefix
+ if (*n == ':') {
+ // set p to skip the work prefix
+ p = n + 1;
+#ifdef DEBUG_REF
+ cout << "Found a work prefix ";
+ for (char *x = s; x <= n; x++) {
+ cout << *x;
+ }
+ cout << endl;
+#endif
+ }
+
+ // Now we are in the meat of an osisID.
+ // Copy it to its end but stop on a grain marker of '!'
+#ifdef DEBUG_REF
+ cout << "Copy osisID:";
+#endif
+ while (*p && *p != '!' && *p != ' ' && *p != '-') {
+#ifdef DEBUG_REF
+ cout << *p;
+#endif
+ *s++ = *p++;
+ }
+#ifdef DEBUG_REF
+ cout << endl;
+#endif
+
+ // The ! and everything following until we hit
+ // the end of the osisID is part of the grain reference
+ if (*p == '!') {
+ n = p;
+ while (*n && *n != ' ' && *n != '-') {
+ n++;
+ }
+#ifdef DEBUG_REF
+ cout << "Found a grain suffix ";
+ for (char *x = p; x < n; x++) {
+ cout << *x;
+ }
+ cout << endl;
+#endif
+ // Drop the grain by moving the read position past it without copying
+ p = n;
+ }
+
+ // At this point we have processed an osisID
+
+ // if we are not in a range and the next character is a -
+ // then we are entering a range
+ // (the '-' itself is copied at the top of the next iteration)
+ inRange = !inRange && *p == '-';
+
+#ifdef DEBUG_REF
+ if (inRange) {
+ cout << "Found a range" << endl;
+ }
+#endif
+
+ // between ranges and stand alone osisIDs we might have whitespace
+ if (!inRange && *p == ' ') {
+ // skip this and subsequent spaces
+ while (*p == ' ') {
+ p++;
+ }
+ // replacing them all with a ';'
+ *s++ = ';';
+#ifdef DEBUG_REF
+ cout << "replacing space with ;. Remaining: " << p << endl;
+#endif
+ }
+ }
+
+ // Determine whether we have modified the buffer
+ // We have modified the buffer if s is not sitting on the null byte of the original
+ if (*s) {
+ // null terminate the reference
+ *s = '\0';
+ // Since we modified the swbuf, we need to tell it what we have done
+ buf.setSize(s - buf.c_str());
+#ifdef DEBUG_REF
+ cout << "shortended keyVal to`" << buf.c_str() << "`"<< endl;
+#endif
+ }
+}
+
+/**
+ * Check a verse reference by parsing it into two keys and comparing them:
+ * one with auto-normalization switched off and headings switched on, the
+ * other left with its default configuration.
+ * NOTE(review): presumably the default key normalizes references that fall
+ * outside the built-in versification, so a mismatch means buf is not a
+ * valid reference there -- confirm against VerseKey.
+ *
+ * param buf the reference to check
+ * return true if both keys compare equal, or if the reference has a zero
+ *        testament, book, chapter or verse (a heading), which is not checked;
+ *        false otherwise
+ */
+bool isKJVRef(const char *buf) {
+    VerseKey vk, test;
+    vk.AutoNormalize(0);
+    vk.Headings(1);    // allow module/testament/book/chapter headings
+    vk.Persist(1);
+
+    // Parse the same reference into both keys
+    vk = buf;
+    test = buf;
+
+    // Any zero component marks a heading; headings are accepted unchecked,
+    // which is probably too lenient.
+    if (!vk.Testament() || !vk.Book() || !vk.Chapter() || !vk.Verse()) {
+        return true;
+    }
+
+#ifdef DEBUG
+    cout << (const char*)vk << " == " << (const char*)test << endl;
+#endif
+    return (vk == test);
+}
+
+
+/**
+ * Clamp a key to the limits recorded in its builtin_books table and report
+ * the change on stdout.
+ * If the chapter exceeds the book's chapmax, the key is moved to the last
+ * chapter and to that chapter's versemax. Otherwise, if the verse exceeds
+ * the chapter's versemax, the key is moved to that versemax.
+ *
+ * param key the key to adjust in place; its testament and book are used as
+ *           indices (minus one), so both must be non-zero
+ */
+void makeKJVRef(VerseKey &key) {
+    cout << "re-versified " << key;
+#ifdef DEBUG
+    cout << "\tC" << (int)(key.builtin_books[key.Testament()-1][key.Book()-1].chapmax) << ":V" << (int)(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]);
+#endif
+
+    // First pull the chapter back into range, if needed
+    const bool chapterTooBig =
+        key.Chapter() > key.builtin_books[key.Testament()-1][key.Book()-1].chapmax;
+    if (chapterTooBig) {
+        key.Chapter(key.builtin_books[key.Testament()-1][key.Book()-1].chapmax);
+    }
+
+    // A clamped chapter always takes its last verse; otherwise the verse
+    // is only changed when it is itself out of range.
+    if (chapterTooBig ||
+        key.Verse() > key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]) {
+        key.Verse(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]);
+    }
+
+    cout << "\tas " << key << endl;
+}
+
+
+// Collect text for a verse and write the previously collected verse to the module
+// once a call arrives for a different osisID.
+//
+// Text passed in successive calls with the same osisID is joined with single blanks
+// in a function-local static buffer. When the osisID of key differs from activeOsisID,
+// the buffered text is written at the key remembered from the previous call, after that
+// key has been clamped by makeKJVRef if isKJVRef rejects it. If the module already has
+// an entry there, the buffered text is appended to it.
+//
+// Nothing at all happens while inCanonicalOSISBook is false.
+//
+// param key   the verse the supplied text belongs to; it is changed temporarily
+//             during the write and restored before returning
+// param text  the text to collect; leading whitespace is trimmed in place
+// param force if true, the osisID is replaced by "-force", which can never equal
+//             a real osisID, so any buffered verse is written out
+void writeEntry(VerseKey &key, SWBuf &text, bool force = false) {
+ // Text collected so far for the verse named by activeOsisID
+ static SWBuf activeVerseText;
+ char keyOsisID[255];
+
+ if (inCanonicalOSISBook) {
+ strcpy(keyOsisID, key.getOSISRef());
+
+ // set keyOsisID to anything that an osisID cannot be.
+ if (force) {
+ strcpy(keyOsisID, "-force");
+ }
+
+ // Key supplied on the previous call, i.e. where the buffered text belongs
+ static VerseKey lastKey;
+ lastKey.AutoNormalize(0);
+ lastKey.Headings(1);
+
+ // Remember the caller's position so it can be restored after the write
+ VerseKey saveKey;
+ saveKey.AutoNormalize(0);
+ saveKey.Headings(1);
+ saveKey = key;
+
+ // If we have seen a verse and the supplied one is different then we output the collected one.
+ if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) {
+
+ // NOTE(review): callers pass *currentVerse, which main registers with the module
+ // as a persistent key, so presumably this assignment also repositions the module -- confirm.
+ key = lastKey;
+
+ if (!isKJVRef(key)) {
+ makeKJVRef(key);
+ }
+
+#ifdef _ICU_
+ int utf8State = detectUTF8(activeVerseText.c_str());
+ if (normalize) {
+ // Don't need to normalize text that is ASCII
+ // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
+ if (!utf8State) {
+ cout << "Warning: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl;
+ converter.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ converted++;
+
+ // Prepare for double check. This probably can be removed.
+ // But for now we are running the check again.
+ // This is to determine whether we need to normalize output of the conversion.
+ utf8State = detectUTF8(activeVerseText.c_str());
+ }
+
+ // Double check. This probably can be removed.
+ if (!utf8State) {
+ cout << "Error: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl;
+ }
+
+ // Only text containing multi-byte UTF-8 is normalized; count it only if it changed
+ if (utf8State > 0) {
+ SWBuf before = activeVerseText;
+ normalizer.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
+ if (before != activeVerseText) {
+ normalized++;
+ }
+ }
+ }
+#endif
+
+ // Keep whatever the module already holds at this position and append to it
+ SWBuf currentText = module->getRawEntry();
+ if (currentText.length()) {
+ cout << "Appending entry: " << key.getOSISRef() << ": " << activeVerseText << endl;
+ activeVerseText = currentText + " " + activeVerseText;
+ }
+
+#ifdef DEBUG
+ cout << "Write: " << activeOsisID << ":" << key.getOSISRef() << ": " << activeVerseText << endl;
+#endif
+
+ module->setEntry(activeVerseText);
+ activeVerseText = "";
+ }
+
+ // eliminate leading whitespace on the beginning of each verse and
+ // before we append to current content, since we just added one
+ text.trimStart();
+ if (activeVerseText.length()) {
+ activeVerseText += " ";
+ activeVerseText += text;
+ }
+ else {
+ activeVerseText = text;
+ }
+
+ // Restore the caller's position and remember it for the next flush
+ key = saveKey;
+ lastKey = key;
+ strcpy(activeOsisID, keyOsisID);
+ }
+}
+
+
+/**
+ * Ask the module to link its current entry to dest, reporting the link on stdout.
+ * Before linking, the global currentVerse is clamped with makeKJVRef if
+ * isKJVRef rejects it; its original value is restored afterwards.
+ *
+ * param dest the key handed to the module's linkEntry
+ */
+void linkToEntry(VerseKey& dest) {
+    VerseKey &source = *currentVerse;
+
+    // Snapshot the position so the clamping below does not leak to the caller
+    VerseKey savedPosition;
+    savedPosition.AutoNormalize(0);
+    savedPosition.Headings(1);
+    savedPosition = source;
+
+    const bool needsClamping = !isKJVRef(source);
+    if (needsClamping) {
+        makeKJVRef(source);
+    }
+
+    cout << "Linking " << module->KeyText() << " to " << dest.getText() << "\n";
+    module->linkEntry(&dest);
+
+    source = savedPosition;
+}
+
+// Process one XML tag of the OSIS input. The book/chapter/verse state is kept in
+// function-local statics; accumulated text is flushed through writeEntry when a
+// book, chapter or verse boundary is crossed.
+//
+// param text  the text accumulated since the last flush; it may be cleared or rewritten here
+// param token the tag to process; non-empty start tags are kept by pointer on an internal
+//             stack, so the tag must stay alive after this call returns
+//
+// Return true if the content was handled or is to be ignored.
+// false if what has been seen is to be accumulated and considered later
+// (main then appends the tag to text).
+bool handleToken(SWBuf &text, XMLTag *token) {
+
+ // Everything between the begin book tag and the first begin chapter tag is inBookHeader
+ static bool inBookHeader = false;
+ // Everything between the begin chapter tag and the first begin verse tag is inChapterHeader
+ static bool inChapterHeader = false;
+
+ // Flags to indicate whether we are in a book, chapter and/or verse
+ //static bool inBook = false;
+ //static bool inChapter = false;
+ static bool inVerse = true;
+
+ static SWBuf header = "";
+
+ // Used to remember titles that need to be handled specially
+ static SWBuf lastTitle = "";
+ static int titleOffset = -1;
+ static bool inTitle = false;
+ static int titleDepth = 0;
+
+ // The verses named by the osisID/annotateRef of the verse currently open
+ static ListKey lastVerseIDs = ListKey();
+
+ // Stack of elements used to validate that books, chapters and verses are well-formed
+ // This goes beyond simple xml well-formed and also considers milestoned div, chapter and verse
+ // to be begin and end tags, too.
+ // It is an error if books and chapters are not well formed (though not required by OSIS)
+ // It is a warning that verses are not well formed (because some clients are not ready)
+ static std::stack<XMLTag*> tagStack;
+ // The following are used to validate well-formedness
+ // Each holds the tagStack size at which the element was opened, or 0 when none is open
+ static int chapterDepth = 0;
+ static int bookDepth = 0;
+ static int verseDepth = 0;
+
+ int tagDepth = tagStack.size();
+ const char *tokenName = token->getName();
+ // A milestone carrying an eID attribute counts as an end tag
+ bool isEndTag = token->isEndTag() || token->getAttribute("eID");
+ const char *typeAttr = token->getAttribute("type");
+
+ //Titles are treated specially.
+ // If the title has an attribute type of "main" or "chapter"
+ // it belongs to its <div> or <chapter> and is treated as part of its heading
+ // Otherwise if it is a title in a chapter before the first verse it
+ // is put into the verse as a preverse title.
+ if (!token->isEmpty() && !isEndTag && titleDepth == 0 && (!strcmp(tokenName, "title")) && (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter")))) {
+ titleOffset = text.length(); //start of the title tag
+ lastTitle = "";
+ inTitle = true;
+ tagStack.push(token);
+#ifdef DEBUG_STACK
+ cout << currentOsisID << ": push (" << tagStack.size() << ") " << token->getName() << endl;
+#endif
+ titleDepth = tagStack.size();
+ return false;
+ }
+ // Check titleDepth since titles can be nested. Don't want to quit too early.
+ else if (isEndTag && tagDepth == titleDepth && (!strcmp(tokenName, "title"))) {
+ lastTitle.append(text.c_str() + titleOffset); //<title ...> up to the end </title>
+ lastTitle.append(*token); //</title>
+
+#ifdef DEBUG
+ cout << currentOsisID << ":" << endl;
+ cout << "\tlastTitle: " << lastTitle.c_str() << endl;
+ cout << "\ttext-lastTitle: " << text.c_str()+titleOffset << endl;
+ cout << "\ttext: " << text.c_str() << endl;
+#endif
+ inTitle = false;
+ titleDepth = 0;
+#ifdef DEBUG_STACK
+ cout << currentOsisID << ": pop(" << tagStack.size() << ") " << tagStack.top()->getName() << endl;
+#endif
+ tagStack.pop();
+ return false; // don't add </title> to the text itself
+ }
+
+
+
+//-- START TAG -------------------------------------------------------------------------
+
+ if (!isEndTag) {
+
+ // Remember non-empty start tags
+ if (!token->isEmpty()) {
+ tagStack.push(token);
+#ifdef DEBUG_STACK
+ cout << currentOsisID << ": push (" << tagStack.size() << ") " << token->getName() << endl;
+#endif
+ }
+
+ //-- WITH OSIS ID -------------------------------------------------------------------------
+ //-- OR ANNOTATE REF -------------------------------------------------------------------------
+ if (token->getAttribute("osisID") || token->getAttribute("annotateRef")) {
+
+ // BOOK START
+ if ((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "book"))) {
+ inVerse = false;
+ if (inBookHeader || inChapterHeader) { // this one should never happen, but just in case
+#ifdef DEBUG
+ cout << currentOsisID << ": HEADING ";
+#endif
+ currentVerse->Testament(0);
+ currentVerse->Book(0);
+ currentVerse->Chapter(0);
+ currentVerse->Verse(0);
+ writeEntry(*currentVerse, text);
+ }
+ // NOTE(review): this branch is also reached when only annotateRef is present;
+ // the osisID lookup then presumably yields null and strcpy would fail -- confirm.
+ strcpy(currentOsisID, token->getAttribute("osisID"));
+ *currentVerse = currentOsisID;
+ currentVerse->Chapter(0);
+ currentVerse->Verse(0);
+ inBookHeader = true;
+ inChapterHeader = false;
+ lastTitle = "";
+ text = "";
+ bookDepth = tagStack.size();
+ chapterDepth = 0;
+ verseDepth = 0;
+
+ inCanonicalOSISBook = isOSISAbbrev(token->getAttribute("osisID"));
+
+ return true;
+ }
+
+ // CHAPTER START
+ else if (((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "chapter")))
+ || (!strcmp(tokenName, "chapter"))
+ ) {
+ inVerse = false;
+ if (inBookHeader) {
+#ifdef DEBUG
+ cout << currentOsisID << ": BOOK HEADING "<< text.c_str() << endl;
+#endif
+ writeEntry(*currentVerse, text);
+ }
+
+ strcpy(currentOsisID, token->getAttribute("osisID"));
+ *currentVerse = currentOsisID;
+ currentVerse->Verse(0);
+ inBookHeader = false;
+ inChapterHeader = true;
+ lastTitle = "";
+ text = "";
+ chapterDepth = tagStack.size();
+ verseDepth = 0;
+
+ return true;
+ }
+
+ // VERSE OR COMMENTARY START
+ else if (!strcmp(tokenName, "verse") ||
+ (!strcmp(tokenName, "div") &&
+ token->getAttribute("annotateType"))) {
+#ifdef DEBUG
+ cout << "Entering verse" << endl;
+#endif
+ inVerse = true;
+ if (inChapterHeader) {
+ SWBuf heading = text;
+
+ //make sure we don't insert the preverse title which belongs to the first verse of this chapter!
+ // Did we have a preverse title?
+ if (lastTitle.length())
+ {
+ //Was the preVerse title in the header (error if not)?
+ const char* header = heading.c_str();
+ const char* preVerse = strstr(header, lastTitle);
+ if (preVerse) {
+ if (preVerse == header) {
+ heading = ""; // do nothing
+ }
+ else {
+ // remove everything before the title from the beginning.
+ text = preVerse;
+ // Remove text from the end of the header.
+ heading.setSize(preVerse - header);
+ }
+ }
+ else {
+ cout << currentOsisID << ": Warning: Bug in code. Could not find title." << endl;
+ }
+ }
+ else {
+ text = "";
+ }
+
+ if (heading.length()) {
+#ifdef DEBUG
+ cout << currentOsisID << ": CHAPTER HEADING "<< heading.c_str() << endl;
+#endif
+ writeEntry(*currentVerse, heading);
+ }
+
+ inChapterHeader = false;
+ }
+
+ // <verse> is keyed by osisID; a commentary <div> is keyed by annotateRef
+ SWBuf keyVal = token->getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID");
+ prepareSWVerseKey(keyVal);
+ lastVerseIDs = currentVerse->ParseVerseList(keyVal, *currentVerse, true);
+
+ // set currentVerse to the first value in the keyVal
+ VerseKey *element = SWDYNAMIC_CAST(VerseKey, lastVerseIDs.GetElement(0));
+ if (element) {
+ *currentVerse = element->LowerBound().getText();
+ }
+ else {
+ // NOTE(review): if the parsed list is empty, GetElement(0) may not return
+ // a usable key here -- confirm against ListKey.
+ *currentVerse = lastVerseIDs.GetElement(0)->getText();
+ }
+
+ strcpy(currentOsisID, currentVerse->getOSISRef());
+#ifdef DEBUG
+ cout << "Current verse is " << *currentVerse << endl;
+ cout << "osisID/annotateRef is adjusted to" << keyVal << endl;
+#endif
+
+ verseDepth = tagStack.size();
+
+ return true;
+ }
+ }
+ // Handle stuff between the verses
+ // Whitespace producing empty tokens are appended to prior entry
+ // Also the quote
+ // This is a hack to get ESV to work
+ else if (!inTitle && !inVerse && token->isEmpty()) { // && !inBookHeader && !inChapterHeader) {
+ if (!strcmp(tokenName, "p") ||
+ !strcmp(tokenName, "div") ||
+ !strcmp(tokenName, "q") ||
+ !strcmp(tokenName, "l") ||
+ !strcmp(tokenName, "lb") ||
+ !strcmp(tokenName, "lg")
+ ) {
+#ifdef DEBUG
+ if (token) {
+ cout << currentOsisID << ": appending interverse start token " << *token << ":" << text.c_str() << endl;
+ }
+#endif
+ SWBuf tmp = token->toString();
+ writeEntry(*currentVerse, tmp);
+ return true;
+ }
+#ifdef DEBUG
+ else {
+ if (token) {
+ cout << currentOsisID << ": interverse start token " << *token << ":" << text.c_str() << endl;
+ }
+ }
+#endif
+ }
+ }
+
+//-- EMPTY and END TAG ---------------------------------------------------------------------------------------------
+
+ else {
+
+ if (tagStack.empty()) {
+ cout << currentOsisID << ": tag expected" << endl;
+ exit(1);
+ }
+
+ // Milestone ends (empty tags with an eID) have no start tag on the stack to pop
+ XMLTag* topToken = 0;
+ if (!token->isEmpty()) {
+ topToken = tagStack.top();
+ tagDepth = tagStack.size();
+#ifdef DEBUG_STACK
+ cout << currentOsisID << ": pop(" << tagDepth << ") " << topToken->getName() << endl;
+#endif
+ tagStack.pop();
+
+ if (strcmp(topToken->getName(), tokenName)) {
+ cout << "Error: " << currentOsisID << ": Expected " << topToken->getName() << " found " << tokenName << endl;
+// exit(1); // I'm sure this validity check is a good idea, but there's a bug somewhere that's killing the converter here.
+ // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
+ }
+ }
+
+ // VERSE and COMMENTARY END
+ if (!strcmp(tokenName, "verse") || (inVerse && !strcmp(tokenName, "div"))) {
+ inVerse = false;
+
+ if (tagDepth != verseDepth) {
+ cout << "Warning verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl;
+ }
+
+ if (lastTitle.length()) {
+ // Split the remembered title into its opening tag and the rest.
+ // NOTE(review): end is used without a null check; this relies on lastTitle
+ // always containing a '>' -- confirm.
+ const char* end = strchr(lastTitle, '>');
+#ifdef DEBUG
+ cout << currentOsisID << ":" << endl;
+ cout << "\t" << lastTitle << endl;
+ cout << "\tlength=" << int(end+1 - lastTitle.c_str()) << ", tag:" << lastTitle.c_str() << endl;
+#endif
+
+ SWBuf titleTagText;
+ titleTagText.append(lastTitle.c_str(), end+1 - lastTitle.c_str());
+#ifdef DEBUG
+ cout << currentOsisID << ": tagText: " << titleTagText.c_str() << endl;;
+#endif
+
+ // Re-issue the opening tag marked as a pre-verse section title
+ XMLTag titleTag(titleTagText);
+ titleTag.setAttribute("type", "section");
+ titleTag.setAttribute("subType", "x-preverse");
+
+ //we insert the title into the text again - make sure to remove the old title text
+ const char* pos = strstr(text, lastTitle);
+ if (pos) {
+ SWBuf temp;
+ temp.append(text, pos-text.c_str());
+ temp.append(pos+lastTitle.length());
+ text = temp;
+ }
+
+ //if a title was already inserted at the beginning insert this one after that first title
+ int titlePos = 0;
+ if (!strncmp(text.c_str(),"<title ",7)) {
+ const char* tmp = strstr(text.c_str(), "</title>");
+ if (tmp) {
+ titlePos = (tmp-text.c_str()) + 8;
+ }
+ }
+ // Both inserts use the same position, so the rewritten opening tag,
+ // inserted second, ends up in front of the title content.
+ text.insert(titlePos, end+1);
+ text.insert(titlePos, titleTag);
+ }
+ // text += token;
+ writeEntry(*currentVerse, text);
+
+ // If we found an osisID like osisID="Gen.1.1 Gen.1.2 Gen.1.3" we have to link Gen.1.2 and Gen.1.3 to Gen.1.1
+ VerseKey dest = *currentVerse;
+ VerseKey linkKey;
+ linkKey.AutoNormalize(0);
+ linkKey.Headings(1); // turn on mod/testmnt/book/chap headings
+ linkKey.Persist(1);
+ for (lastVerseIDs = TOP; !lastVerseIDs.Error(); lastVerseIDs++) {
+ linkKey = lastVerseIDs;
+
+ // Every listed verse other than the first one gets linked to it
+ if (linkKey.Verse() != dest.Verse() ||
+ linkKey.Chapter() != dest.Chapter() ||
+ linkKey.Book() != dest.Book() ||
+ linkKey.Testament() != dest.Testament())
+ {
+ *currentVerse = linkKey;
+ linkToEntry(dest);
+ }
+ }
+
+ lastTitle = "";
+ text = "";
+ verseDepth = 0;
+ return true;
+ }
+ else if (!inTitle && !inVerse && !inBookHeader && !inChapterHeader) {
+ // Is this the end of a chapter.
+ if (tagDepth == chapterDepth && (!strcmp(tokenName, "div") || !strcmp(tokenName, "chapter"))) {
+ chapterDepth = 0;
+ verseDepth = 0;
+ text = "";
+ return true;
+ }
+ // Or is it the end of a book
+ else if (tagDepth == bookDepth && (!strcmp(tokenName, "div"))) {
+ bookDepth = 0;
+ chapterDepth = 0;
+ verseDepth = 0;
+ text = "";
+ return true;
+ }
+ // Or is it the end of an osis document
+ else if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) {
+ bookDepth = 0;
+ chapterDepth = 0;
+ verseDepth = 0;
+ text = "";
+ return true;
+ }
+ // OTHER MISC END TAGS WHEN !INVERSE
+ // Text that is between verses, or after the last, is appended to the preceding verse.
+ else if (!strcmp(tokenName, "p") ||
+ !strcmp(tokenName, "div") ||
+ !strcmp(tokenName, "q") ||
+ !strcmp(tokenName, "l") ||
+ !strcmp(tokenName, "lb") ||
+ !strcmp(tokenName, "lg")
+ ) {
+ text.append(*token);
+ writeEntry(*currentVerse, text);
+ text = "";
+#ifdef DEBUG
+ cout << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
+#endif
+ return true;
+ }
+#ifdef DEBUG
+ cout << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
+#endif
+ }
+ }
+ return false;
+}
+
+/**
+ * Apply simplifying transformations to a tag before it is handled:
+ *   <q> ... </q>   becomes  <q sID="qN"/> ... <q eID="qN"/>, except for <q who="Jesus">
+ *   <p> ... </p>   becomes  <lb type="x-begin-paragraph" /> ... <lb type="x-end-paragraph" />
+ * Empty tags are returned untouched. Non-empty start tags are kept by pointer
+ * on an internal stack so that each end tag can be paired with its start tag;
+ * the tag must therefore stay alive after this call returns.
+ *
+ * param t the tag to transform in place
+ * return t
+ */
+XMLTag* transform(XMLTag* t) {
+    static std::stack<XMLTag*> tagStack;
+    static int sID = 1;
+    // Room for 'q', any int value (up to 11 characters) and the terminating NUL
+    char buf[16];
+
+    // Support simplification transformations
+    if (!t->isEmpty()) {
+        if (!t->isEndTag()) {
+            tagStack.push(t);
+#ifdef DEBUG_XFORM
+            cout << currentOsisID << ": xform push (" << tagStack.size() << ") " << t->getName() << endl;
+#endif
+            // Transform <q> into <q sID=""/> except for <q who="Jesus">
+            if ((!strcmp(t->getName(), "q")) && (!t->getAttribute("who") || strcmp(t->getAttribute("who"), "Jesus"))) {
+                t->setEmpty(true);
+                snprintf(buf, sizeof(buf), "q%d", sID++);
+                t->setAttribute("sID", buf);
+            }
+
+            // Transform <p> into <lb type="x-begin-paragraph"/>
+            else if (!strcmp(t->getName(), "p")) {
+                // note there is no process that should care about type, it is there for reversability
+                t->setText("<lb type=\"x-begin-paragraph\" />");
+            }
+        }
+        else {
+            // An end tag with no open start tag has nothing to be paired with.
+            // Leave it untouched rather than read the top of an empty stack.
+            if (tagStack.empty()) {
+                return t;
+            }
+
+            XMLTag *topToken = tagStack.top();
+#ifdef DEBUG_XFORM
+            cout << currentOsisID << ": xform pop(" << tagStack.size() << ") " << topToken->getName() << endl;
+#endif
+            tagStack.pop();
+
+            // If we have found an end tag for a <q> that was transformed then transform this one as well.
+            if ((!strcmp(t->getName(), "q")) && (!strcmp(topToken->getName(), "q")) && (!topToken->getAttribute("who") || strcmp(topToken->getAttribute("who"), "Jesus"))) {
+                // make this a clone of the start tag with sID changed to eID
+                *t = *topToken;
+                t->setAttribute("eID", t->getAttribute("sID"));
+                t->setAttribute("sID", 0);
+            }
+
+            // Look for paragraph tags.
+            // If we have found an end tag for a <p> that was transformed then transform this as well.
+            // (the start tag's text was replaced above, so its name now reads "lb")
+            else if ((!strcmp(t->getName(), "p")) && (!strcmp(topToken->getName(), "lb"))) {
+                t->setText("<lb type=\"x-end-paragraph\" />");
+            }
+        }
+    }
+    return t;
+}
+
+/**
+ * Print an optional error message followed by the command line help
+ * to stderr, then terminate the program with exit(-1).
+ *
+ * param app   the program name shown in the messages
+ * param error an error message to print first, or 0 for none
+ */
+void usage(const char *app, const char *error = 0) {
+
+    if (error) fprintf(stderr, "\n%s: %s\n", app, error);
+
+    fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app);
+    fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n");
+    fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n");
+    fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n");
+    fprintf(stderr, " -b <2|3|4>\t\t compression block size (default 4):\n");
+    fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
+    fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n");
+    fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
+    fprintf(stderr, " -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n");
+    // This line used to lack its closing parenthesis and newline, so the note below ran into it.
+    fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC)\n");
+    fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n");
+    exit(-1);
+}
+
+/**
+ * Convert an OSIS document into a SWORD text module.
+ *
+ * Command line: <output/path> <osisDoc> [OPTIONS]; see usage() for the options.
+ * The module is created at the output path (unless -a is given), then the
+ * document is read byte by byte, split into tags and text, and fed through
+ * transform() and handleToken(), which write the verses to the module.
+ *
+ * Exit codes used here: -1 for an unwritable module, -2 when the input file
+ * cannot be opened, -3 when the module cannot be created.
+ */
+int main(int argc, char **argv) {
+
+    fprintf(stderr, "You are running osis2mod: $Rev: 2169 $\n");
+
+    // Let's test our command line arguments
+    if (argc < 3) {
+        usage(*argv);
+    }
+
+    // variables for arguments, holding defaults
+    const char* program = argv[0];
+    const char* path = argv[1];
+    const char* osisDoc = argv[2];
+    int append = 0;
+    int compType = 0;
+    int iType = 4;
+    string cipherKey = "";
+
+    SWCompress *compressor = 0;
+
+    for (int i = 3; i < argc; i++) {
+        if (!strcmp(argv[i], "-a")) {
+            append = 1;
+        }
+        else if (!strcmp(argv[i], "-z")) {
+            if (compType) usage(*argv, "Cannot specify both -z and -Z");
+            compType = 2;
+        }
+        else if (!strcmp(argv[i], "-Z")) {
+            if (compType) usage(*argv, "Cannot specify both -z and -Z");
+            compType = 1;
+        }
+        else if (!strcmp(argv[i], "-b")) {
+            if (i+1 < argc) {
+                iType = atoi(argv[++i]);
+                if ((iType >= 2) && (iType <= 4)) continue;
+            }
+            usage(*argv, "-b requires one of <2|3|4>");
+        }
+        else if (!strcmp(argv[i], "-N")) {
+            normalize = false;
+        }
+        else if (!strcmp(argv[i], "-c")) {
+            if (i+1 < argc) cipherKey = argv[++i];
+            else usage(*argv, "-c requires <cipher_key>");
+        }
+        else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
+    }
+
+    switch (compType) { // these are deleted by zText
+        case 0: break;
+        case 1: compressor = new LZSSCompress(); break;
+        case 2: compressor = new ZipCompress(); break;
+    }
+
+#ifndef _ICU_
+    if (normalize) {
+        normalize = false;
+        cout << program << " is not compiled with support for ICU. Ignoring -n flag." << endl;
+    }
+#endif
+
+#ifdef DEBUG
+    cout << "path: " << path << " osisDoc: " << osisDoc << " create: " << append << " compressType: " << compType << " blockType: " << iType << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n";
+    cout << "";
+//  exit(-3);
+#endif
+
+
+    if (!append) { // == 0 then create module
+        // Try to initialize a default set of datafiles and indicies at our
+        // datapath location passed to us from the user.
+        if ( compressor ) {
+            if ( zText::createModule(path, iType) ) {
+                fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program, path);
+                exit(-3);
+            }
+        }
+        else if (RawText::createModule(path)) {
+            fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program, path);
+            exit(-3);
+        }
+    }
+
+    // Let's see if we can open our input file
+    ifstream infile(osisDoc);
+    if (infile.fail()) {
+        fprintf(stderr, "error: %s: couldn't open input file: %s \n", program, osisDoc);
+        exit(-2);
+    }
+
+    // Do some initialization stuff
+    if (compressor) {
+        module = new zText(path, 0, 0, iType, compressor);
+    }
+    else{
+        module = new RawText(path); // open our datapath with our RawText driver.
+    }
+
+    SWFilter *cipherFilter = 0;
+
+    if (!cipherKey.empty()) {
+        fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
+        cipherFilter = new CipherFilter(cipherKey.c_str());
+        module->AddRawFilter(cipherFilter);
+    }
+
+    if (!module->isWritable()) {
+        fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
+        exit(-1);
+    }
+
+    activeOsisID[0] = '\0';
+    strcpy(currentOsisID,"N/A");
+
+    currentVerse = new VerseKey();
+    currentVerse->AutoNormalize(0);
+    currentVerse->Headings(1); // turn on mod/testmnt/book/chap headings
+    currentVerse->Persist(1);
+
+    module->setKey(*currentVerse);
+
+    (*module) = TOP;
+
+    SWBuf token;
+    SWBuf text;
+    bool intoken = false;
+    bool inWhitespace = false;
+    bool seeingSpace = false;
+    char curChar = '\0';
+
+    // get(char&) reports end of file through the stream state rather than
+    // through a sentinel value, so every byte value, 0xFF included, is
+    // treated as data regardless of whether char is signed.
+    while (infile.get(curChar)) {
+
+        if (!intoken && curChar == '<') {
+            intoken = true;
+            token = "<";
+            continue;
+        }
+
+        // Outside of tokens merge adjacent whitespace
+        if (!intoken) {
+            // The <ctype.h> functions require a value in the unsigned char range
+            seeingSpace = isspace((unsigned char)curChar) != 0;
+            if (seeingSpace) {
+                if (inWhitespace) {
+                    continue;
+                }
+                // convert all whitespace to blanks
+                curChar = ' ';
+            }
+            inWhitespace = seeingSpace;
+        }
+
+        if (intoken && curChar == '>') {
+            intoken = false;
+            inWhitespace = false;
+            token.append('>');
+            // take this isalpha if out to check for bugs in text
+            if ((isalpha((unsigned char)token[1])) || (isalpha((unsigned char)token[2]))) {
+                //cout << "Handle:" << token.c_str() << endl;
+                // The tag is not freed here: transform and handleToken keep
+                // pointers to start tags on their stacks.
+                XMLTag *t = new XMLTag(token.c_str());
+
+                if (!handleToken(text, transform(t))) {
+                    text.append(*t);
+                }
+            }
+            continue;
+        }
+
+        if (intoken) {
+            token.append(curChar);
+        }
+        else {
+            switch (curChar) {
+                case '>' : text.append("&gt;"); break;
+                case '<' : text.append("&lt;"); break;
+                default  : text.append(curChar); break;
+            }
+        }
+    }
+
+    // Force the last entry from the text buffer.
+    text = "";
+    writeEntry(*currentVerse, text, true);
+    delete module;
+    delete currentVerse;
+    if (cipherFilter)
+        delete cipherFilter;
+    infile.close();
+
+#ifdef _ICU_
+    if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
+    if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
+#endif
+}
+