diff options
author | Roberto C. Sanchez <roberto@connexer.com> | 2014-03-29 10:54:01 -0400 |
---|---|---|
committer | Roberto C. Sanchez <roberto@connexer.com> | 2014-03-29 10:54:01 -0400 |
commit | 71a39f4652cd51df814c930dd268f3c9ad2aee86 (patch) | |
tree | 5994350a603908c4e4d660bc9d72c4ec43dd648e /utilities/osis2mod.cpp | |
parent | 03134fa5f6f25d92724ce4c183f9bbe12a9e37dc (diff) |
Imported Upstream version 1.6.0+dfsg
Diffstat (limited to 'utilities/osis2mod.cpp')
-rw-r--r-- | utilities/osis2mod.cpp | 1798 |
1 files changed, 1154 insertions, 644 deletions
diff --git a/utilities/osis2mod.cpp b/utilities/osis2mod.cpp index 69d984d..473a90f 100644 --- a/utilities/osis2mod.cpp +++ b/utilities/osis2mod.cpp @@ -1,24 +1,40 @@ +/* + * Copyright 2009 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + #include <ctype.h> #include <stdio.h> #include <fcntl.h> #include <errno.h> #include <stdlib.h> -#include <string> #include <stack> +#include <vector> #include <iostream> #include <fstream> #include <utilstr.h> #include <swmgr.h> #include <rawtext.h> +#include <rawtext4.h> #include <swbuf.h> #include <utilxml.h> #include <listkey.h> #include <versekey.h> #include <ztext.h> -// #include <zld.h> -// #include <zcom.h> #include <lzsscomprs.h> #include <zipcomprs.h> #include <cipherfil.h> @@ -28,61 +44,61 @@ #include <latin1utf8.h> #endif -//#define DEBUG - -// Debug for simple transformation stack -//#define DEBUG_XFORM - -// Debug for parsing osisRefs -//#define DEBUG_REF - -// Debug for tag stack -//#define DEBUG_STACK - #ifndef NO_SWORD_NAMESPACE using namespace sword; #endif using namespace std; -#ifdef _ICU_ -UTF8NFC normalizer; -int normalized = 0; +// Turn debugging on and off +//#define DEBUG +int debug = 0; +const int DEBUG_WRITE = 1; // writing to module +const int DEBUG_VERSE = 2; // verse start and end +const int DEBUG_QUOTE = 4; // quotes, especially Words of Christ (WOC) +const int DEBUG_TITLE = 8; // titles +const int DEBUG_INTERVERSE = 16; // inter-verse maerial +const int DEBUG_XFORM = 32; // 
transformations +const int DEBUG_REV11N = 64; // versification +const int DEBUG_REF = 128; // parsing of osisID and osisRef +const int DEBUG_STACK = 256; // cleanup of references +const int DEBUG_OTHER = 512; // ins and outs of books, chapters and verses + +// Exit codes +const int EXIT_BAD_ARG = 1; // Bad parameter given for program +const int EXIT_NO_WRITE = 2; // Could not open the module for writing +const int EXIT_NO_CREATE = 3; // Could not create the module +const int EXIT_NO_READ = 4; // Could not open the input file for reading. +const int EXIT_BAD_NESTING = 5; // BSP or BCV nesting is bad +#ifdef _ICU_ +UTF8NFC normalizer; Latin1UTF8 converter; -int converted = 0; #endif +int normalized = 0; +int converted = 0; SWText *module = 0; -VerseKey *currentVerse = 0; +VerseKey currentVerse; +SWBuf v11n = "KJV"; char activeOsisID[255]; char currentOsisID[255]; -const char *osisabbrevs[] = {"Gen", "Exod", "Lev", "Num", "Deut", "Josh", "Judg", - "Ruth", "1Sam", "2Sam", "1Kgs", "2Kgs", "1Chr", "2Chr", "Ezra", "Neh", - "Esth", "Job", "Ps", "Prov", "Eccl", "Song", "Isa", "Jer", "Lam", "Ezek", - "Dan", "Hos", "Joel", "Amos", "Obad", "Jonah", "Mic", "Nah", "Hab", - "Zeph", "Hag", "Zech", "Mal", - "Matt", "Mark", "Luke", "John", "Acts", "Rom", "1Cor", "2Cor", "Gal", - "Eph", "Phil", "Col", "1Thess", "2Thess", "1Tim", "2Tim", "Titus", - "Phlm", "Heb", "Jas", "1Pet", "2Pet", "1John", "2John", "3John", - "Jude", "Rev"}; +SWBuf activeVerseText; + +ListKey currentKeyIDs = ListKey(); + +std::vector<ListKey> linkedVerses; static bool inCanonicalOSISBook = true; // osisID is for a book that is not in Sword's canon -static bool normalize = true; // Whether to normalize UTF-8 to NFC +static bool normalize = true; // Whether to normalize UTF-8 to NFC bool isOSISAbbrev(const char *buf) { - bool match = false; - for (int i = 0; i < 66; i++) { - if (!strcmp(buf, osisabbrevs[i])) { - match = true; - break; - } - } - return match; + VerseMgr *vmgr = VerseMgr::getSystemVerseMgr(); + const 
VerseMgr::System *av11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem()); + return av11n->getBookNumberByOSISName(buf) >= 0; } - /** * Determine whether the string contains a valid unicode sequence. * The following table give the pattern of a valid UTF-8 character. @@ -109,51 +125,92 @@ bool isOSISAbbrev(const char *buf) { * author DM Smith */ int detectUTF8(const char *txt) { - unsigned int countUTF8 = 0; - int count = 0; - - // Cast it to make masking and shifting easier - const unsigned char *p = (const unsigned char*) txt; - while (*p) { - // Is the high order bit set? - if (*p & 0x80) { - // Then count the number of high order bits that are set. - // This determines the number of following bytes - // that are a part of the unicode character - unsigned char i = *p; - for (count = 0; i & 0x80; count++) { - i <<= 1; - } - - // Validate count: - // Count 0: bug in code that would cause core walking - // Count 1: is a pattern of 10nnnnnn, - // which does not signal the start of a unicode character - // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111 - // are not legal starts, either - if (count < 2 || count > 4) return 0; - - // At this point we expect (count - 1) following characters - // of the pattern 10nnnnnn - while (--count && *++p) { - // The pattern of each following character must be: 10nnnnnn - // So, compare the top 2 bits. - if ((0xc0 & *p) != 0x80) return 0; - } - - // Oops, we've run out of bytes too soon: Cannot be UTF-8 - if (count) return 0; - - // We have a valid UTF-8 character, so count it - countUTF8++; - } + unsigned int countUTF8 = 0; + int count = 0; + + // Cast it to make masking and shifting easier + const unsigned char *p = (const unsigned char*) txt; + while (*p) { + // Is the high order bit set? + if (*p & 0x80) { + // Then count the number of high order bits that are set. 
+ // This determines the number of following bytes + // that are a part of the unicode character + unsigned char i = *p; + for (count = 0; i & 0x80; count++) { + i <<= 1; + } + + // Validate count: + // Count 0: bug in code that would cause core walking + // Count 1: is a pattern of 10nnnnnn, + // which does not signal the start of a unicode character + // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111 + // are not legal starts, either + if (count < 2 || count > 4) return 0; + + // At this point we expect (count - 1) following characters + // of the pattern 10nnnnnn + while (--count && *++p) { + // The pattern of each following character must be: 10nnnnnn + // So, compare the top 2 bits. + if ((0xc0 & *p) != 0x80) return 0; + } + + // Oops, we've run out of bytes too soon: Cannot be UTF-8 + if (count) return 0; + + // We have a valid UTF-8 character, so count it + countUTF8++; + } - // Advance to the next character to examine. - p++; - } - - // At this point it is either UTF-8 or 7-bit ascii - return countUTF8 ? 1 : -1; + // Advance to the next character to examine. + p++; + } + + // At this point it is either UTF-8 or 7-bit ascii + return countUTF8 ? 1 : -1; +} + +void prepareSWText(const char *osisID, SWBuf &text) +{ + // Always check on UTF8 and report on non-UTF8 entries + int utf8State = detectUTF8(text.c_str()); + + // Trust, but verify. + if (!normalize && !utf8State) { + cout << "WARNING(UTF8): " << osisID << ": Should be converted to UTF-8 (" << text << ")" << endl; + } + +#ifdef _ICU_ + if (normalize) { + // Don't need to normalize text that is ASCII + // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8 + if (!utf8State) { + cout << "INFO(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl; + converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks + converted++; + + // Prepare for double check. This probably can be removed. 
+ // But for now we are running the check again. + // This is to determine whether we need to normalize output of the conversion. + utf8State = detectUTF8(text.c_str()); + } + + // Double check. This probably can be removed. + if (!utf8State) { + cout << "ERROR(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl; + } + + if (utf8State > 0) { + SWBuf before = text; + normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks + if (before != text) { + normalized++; + } + } + } +#endif } // This routine converts an osisID or osisRef into one that SWORD can parse into a verse list @@ -175,8 +232,10 @@ void prepareSWVerseKey(SWBuf &buf) { bool inRange = false; while (*p) { if (inRange) { -#ifdef DEBUG_REF - cout << "Copy range marker:" << *p << endl;; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << "DEBUG(REF): Copy range marker:" << *p << endl;; + } #endif // Range markers are copied as is *s++ = *p++; @@ -193,28 +252,36 @@ void prepareSWVerseKey(SWBuf &buf) { if (*n == ':') { // set p to skip the work prefix p = n + 1; -#ifdef DEBUG_REF - cout << "Found a work prefix "; - for (char *x = s; x <= n; x++) { - cout << *x; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << "DEBUG(REF): Found a work prefix "; + for (char *x = s; x <= n; x++) { + cout << *x; + } + cout << endl; } - cout << endl; #endif } // Now we are in the meat of an osisID. // Copy it to its end but stop on a grain marker of '!' -#ifdef DEBUG_REF - cout << "Copy osisID:"; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << "DEBUG(REF): Copy osisID:"; + } #endif while (*p && *p != '!' && *p != ' ' && *p != '-') { -#ifdef DEBUG_REF - cout << *p; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << *p; + } #endif *s++ = *p++; } -#ifdef DEBUG_REF - cout << endl; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << endl; + } #endif // The ! 
and everything following until we hit @@ -224,12 +291,14 @@ void prepareSWVerseKey(SWBuf &buf) { while (*n && *n != ' ' && *n != '-') { n++; } -#ifdef DEBUG_REF - cout << "Found a grain suffix "; - for (char *x = p; x < n; x++) { - cout << *x; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << "DEBUG(REF): Found a grain suffix "; + for (char *x = p; x < n; x++) { + cout << *x; + } + cout << endl; } - cout << endl; #endif p = n; } @@ -240,9 +309,11 @@ void prepareSWVerseKey(SWBuf &buf) { // then we are entering a range inRange = !inRange && *p == '-'; -#ifdef DEBUG_REF - if (inRange) { - cout << "Found a range" << endl; +#ifdef DEBUG + if (debug & DEBUG_REF) { + if (inRange) { + cout << "DEBUG(REF): Found a range" << endl; + } } #endif @@ -254,8 +325,10 @@ void prepareSWVerseKey(SWBuf &buf) { } // replacing them all with a ';' *s++ = ';'; -#ifdef DEBUG_REF - cout << "replacing space with ;. Remaining: " << p << endl; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << "DEBUG(REF): replacing space with ;. Remaining: " << p << endl; + } #endif } } @@ -267,607 +340,921 @@ void prepareSWVerseKey(SWBuf &buf) { *s = '\0'; // Since we modified the swbuf, we need to tell it what we have done buf.setSize(s - buf.c_str()); -#ifdef DEBUG_REF - cout << "shortended keyVal to`" << buf.c_str() << "`"<< endl; +#ifdef DEBUG + if (debug & DEBUG_REF) { + cout << "DEBUG(REF): shortended keyVal to`" << buf.c_str() << "`"<< endl; + } #endif } } -bool isKJVRef(const char *buf) { - VerseKey vk, test; - vk.AutoNormalize(0); - vk.Headings(1); // turn on mod/testmnt/book/chap headings - vk.Persist(1); - // lets do some tests on the verse -------------- - vk = buf; - test = buf; +/** + * Determine whether a verse as given is valid for the versification. + * This is done by comparing the before and after of normalization. 
+ */ +bool isValidRef(const char *buf) { + // Create a VerseKey that does not do auto normalization + // Note: need to turn on headings so that a heading does not get normalized anyway + // And set it to the reference under question + VerseKey before; + before.setVersificationSystem(currentVerse.getVersificationSystem()); + before.AutoNormalize(0); + before.Headings(1); + before.setText(buf); + + // If we are a heading we must bail + // These will autonormalize to the last verse of the prior chapter + if (!before.Testament() || !before.Book() || !before.Chapter() || !before.Verse()) { + return true; + } + + // Create a VerseKey that does do auto normalization + // And set it to the reference under question + VerseKey after; + after.setVersificationSystem(currentVerse.getVersificationSystem()); + after.AutoNormalize(1); + after.setText(buf); + + if (before == after) + { + return true; + } + + // If we have gotten here the reference is not in the selected versification. + cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl; - if (vk.Testament() && vk.Book() && vk.Chapter() && vk.Verse()) { // if we're not a heading #ifdef DEBUG - cout << (const char*)vk << " == " << (const char*)test << endl; -#endif - return (vk == test); + if (debug & DEBUG_REV11N) { + cout << "DEBUG(V11N): " << before << " normalizes to " << after << endl; } - else return true; // no check if we're a heading... Probably bad. +#endif + + return false; } +/** + * This routine is used to ensure that all the text in the input is saved to the module. + * Assumption: The input orders all the verses for a chapter in numerical order. Thus, any + * verses that are not in the chosen versification (v11n) follow those that are. + * + * The prior implementation of this adjusted the verse to the last one that is in the chosen v11n. 
+ * If it the chapter were extra, then it is appended to the last verse of the last + * chapter in the chosen v11n for that book. If it is just extra verses for a chapter, then it is + * appended to the last verse of the chapter. + * + * The problem with this is when a OSIS verse refers to more than one verse, e.g. + * osisID="Gen.1.29 Gen.1.30 Gen.1.31" (Gen.1.31 is the last verse of the chapter in the chosen v11n) + * and then it is followed by Gen.1.32. + * + * This routine assumes that linking is postponed to the end so that in the example Gen.1.30-31 + * are not linked but rather empty. This routine will then find the last verse in the computed + * chapter that has content. + * + * Alternative, we could have done linking as we went, but this routine would have needed + * to find the first entry in the link set and elsewhere in the code when appending to a + * verse, it would need to be checked for adjacent links and those would have needed to be adjusted. + * + * param key the key that may need to be adjusted + */ +void makeValidRef(VerseKey &key) { + + int chapterMax = key.getChapterMax(); + int verseMax = key.getVerseMax(); -void makeKJVRef(VerseKey &key) { - cout << "re-versified " << key; #ifdef DEBUG - cout << "\tC" << (int)(key.builtin_books[key.Testament()-1][key.Book()-1].chapmax) << ":V" << (int)(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]); + if (debug & DEBUG_REV11N) { + cout << "DEBUG(V11N) Chapter max:" << chapterMax << ", Verse Max:" << verseMax << endl; + } #endif - if (key.Chapter() > key.builtin_books[key.Testament()-1][key.Book()-1].chapmax) { - key.Chapter(key.builtin_books[key.Testament()-1][key.Book()-1].chapmax); - key.Verse(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]); + + cout << "INFO(V11N): " << key.getOSISRef() << " is not in the " << key.getVersificationSystem() << " versification."; + // Since isValidRef returned false constrain the key to the nearest prior 
reference. + // If we are past the last chapter set the reference to the last chapter + if (key.Chapter() > chapterMax) { + key.Chapter(chapterMax); } - else if (key.Verse() > key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]) { - key.Verse(key.builtin_books[key.Testament()-1][key.Book()-1].versemax[key.Chapter()-1]); + + // Either we set the chapter to the last chapter and now need to set to the last verse in the chapter + // Or the verse is beyond the end of the chapter. + // In any case we need to constrain the verse to it's chapter. + key.Verse(verseMax); + + // There are three cases we want to handle: + // In the examples we are using the KJV versification where the last verse of Matt.7 is Matt.7.29. + // In each of these cases the out-of-versification, extra verse is Matt.7.30. + // 1) The "extra" verse follows the last verse in the chapter. + // <verse osisID="Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse> + // In this case re-versify Matt.7.30 as Matt.7.29. + // + // 2) The "extra" verse follows a range (a set of linked verses). + // <verse osisID="Matt.7.28-Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse> + // In this case, re-versify Matt.7.30 as Matt.7.28, the first verse in the linked set. + // Since we are post-poning linking, we want to re-reversify to the last entry in the module. + // + // 3) The last verse in the chapter is not in the input. There may be other verses missing as well. + // <verse osisID="Matt.7.8">...</verse><verse osisID="Matt.7.30">...</verse> + // In this case we should re-versify Matt.7.30 as Matt.7.29. + // However, since this and 2) are ambiguous, we'll re-reversify to the last entry in the module. 
+ + while (!key.Error() && !module->hasEntry(&key)) { + key.decrement(1); } - cout << "\tas " << key << endl; -} + cout << " Appending content to " << key.getOSISRef() << endl; +} -void writeEntry(VerseKey &key, SWBuf &text, bool force = false) { - static SWBuf activeVerseText; +void writeEntry(SWBuf &text, bool force = false) { char keyOsisID[255]; - if (inCanonicalOSISBook) { - strcpy(keyOsisID, key.getOSISRef()); + static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 2400 $\"/>"; + static bool firstOT = true; + static bool firstNT = true; - // set keyOsisID to anything that an osisID cannot be. - if (force) { - strcpy(keyOsisID, "-force"); - } - - static VerseKey lastKey; - lastKey.AutoNormalize(0); - lastKey.Headings(1); + if (!inCanonicalOSISBook) { + return; + } - VerseKey saveKey; - saveKey.AutoNormalize(0); - saveKey.Headings(1); - saveKey = key; + strcpy(keyOsisID, currentVerse.getOSISRef()); - // If we have seen a verse and the supplied one is different then we output the collected one. - if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) { + // set keyOsisID to anything that an osisID cannot be. 
+ if (force) { + strcpy(keyOsisID, "-force"); + } - key = lastKey; + static VerseKey lastKey; + lastKey.setVersificationSystem(currentVerse.getVersificationSystem()); + lastKey.AutoNormalize(0); + lastKey.Headings(1); - if (!isKJVRef(key)) { - makeKJVRef(key); - } + VerseKey saveKey; + saveKey.setVersificationSystem(currentVerse.getVersificationSystem()); + saveKey.AutoNormalize(0); + saveKey.Headings(1); + saveKey = currentVerse; -#ifdef _ICU_ - int utf8State = detectUTF8(activeVerseText.c_str()); - if (normalize) { - // Don't need to normalize text that is ASCII - // But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8 - if (!utf8State) { - cout << "Warning: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl; - converter.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks - converted++; - - // Prepare for double check. This probably can be removed. - // But for now we are running the check again. - // This is to determine whether we need to normalize output of the conversion. - utf8State = detectUTF8(activeVerseText.c_str()); - } + // If we have seen a verse and the supplied one is different then we output the collected one. + if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) { - // Double check. This probably can be removed. - if (!utf8State) { - cout << "Error: " << activeOsisID << ": Converting to UTF-8 (" << activeVerseText << ")" << endl; - } + if (!isValidRef(lastKey)) { + makeValidRef(lastKey); + } - if (utf8State > 0) { - SWBuf before = activeVerseText; - normalizer.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. 
TODO: remove all hacks - if (before != activeVerseText) { - normalized++; - } - } + currentVerse = lastKey; + + prepareSWText(activeOsisID, activeVerseText); + + // Put the revision into the module + int testmt = currentVerse.Testament(); + if ((testmt == 1 && firstOT) || (testmt == 2 && firstNT)) { + VerseKey t; + t.setVersificationSystem(currentVerse.getVersificationSystem()); + t.AutoNormalize(0); + t.Headings(1); + t = currentVerse; + currentVerse.Book(0); + currentVerse.Chapter(0); + currentVerse.Verse(0); + module->setEntry(revision); + currentVerse = t; + switch (testmt) { + case 1: + firstOT = false; + break; + case 2: + firstNT = false; + break; } -#endif + } - SWBuf currentText = module->getRawEntry(); - if (currentText.length()) { - cout << "Appending entry: " << key.getOSISRef() << ": " << activeVerseText << endl; - activeVerseText = currentText + " " + activeVerseText; - } + // If the entry already exists, then append this entry to the text. + // This is for verses that are outside the chosen versification. They are appended to the prior verse. + // The space should not be needed if we retained verse tags. 
+ SWBuf currentText = module->getRawEntry(); + if (currentText.length()) { + cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl; + activeVerseText = currentText + " " + activeVerseText; + } #ifdef DEBUG - cout << "Write: " << activeOsisID << ":" << key.getOSISRef() << ": " << activeVerseText << endl; -#endif - - module->setEntry(activeVerseText); - activeVerseText = ""; + if (debug & DEBUG_WRITE) { + cout << "DEBUG(WRITE): " << activeOsisID << ":" << currentVerse.getOSISRef() << ": " << activeVerseText << endl; } +#endif - // eliminate leading whitespace on the beginning of each verse and - // before we append to current content, since we just added one - text.trimStart(); - if (activeVerseText.length()) { - activeVerseText += " "; - activeVerseText += text; - } - else { - activeVerseText = text; - } + module->setEntry(activeVerseText); + activeVerseText = ""; + } - key = saveKey; - lastKey = key; - strcpy(activeOsisID, keyOsisID); + // The following is for initial verse content and for appending interverse content. + // Eliminate leading whitespace on the beginning of each verse and + // before we append to current content, since we just added one + text.trimStart(); + if (activeVerseText.length()) { + activeVerseText += " "; + activeVerseText += text; + } + else { + activeVerseText = text; } + // text has been consumed so clear it out. + text = ""; + + currentVerse = saveKey; + lastKey = currentVerse; + strcpy(activeOsisID, keyOsisID); } +void linkToEntry(VerseKey &linkKey, VerseKey &dest) { + + // Only link verses that are in the versification. 
+ if (!isValidRef(linkKey)) { + return; + } -void linkToEntry(VerseKey& dest) { - //cout << "Verse: " << key << "\n"; - //cout << "TEXT: " << text << "\n\n"; - //SWBuf currentText = module->getRawEntry(); - //if (currentText.length()) - // text = currentText + " " + text; VerseKey saveKey; + saveKey.setVersificationSystem(currentVerse.getVersificationSystem()); saveKey.AutoNormalize(0); saveKey.Headings(1); - saveKey = *currentVerse; + saveKey = currentVerse; + currentVerse = linkKey; - if (!isKJVRef(*currentVerse)) { - makeKJVRef(*currentVerse); - } - - cout << "Linking " << module->KeyText() << " to " << dest.getText() << "\n"; + cout << "INFO(LINK): Linking " << currentVerse.getOSISRef() << " to " << dest.getOSISRef() << "\n"; module->linkEntry(&dest); - *currentVerse = saveKey; + currentVerse = saveKey; } // Return true if the content was handled or is to be ignored. // false if the what has been seen is to be accumulated and considered later. -bool handleToken(SWBuf &text, XMLTag *token) { +bool handleToken(SWBuf &text, XMLTag token) { // Everything between the begin book tag and the first begin chapter tag is inBookHeader - static bool inBookHeader = false; + static bool inBookHeader = false; + // Everything between the begin chapter tag and the first begin verse tag is inChapterHeader - static bool inChapterHeader = false; + static bool inChapterHeader = false; - // Flags to indicate whether we are in a book, chapter and/or verse - //static bool inBook = false; - //static bool inChapter = false; - static bool inVerse = true; + // Flags indicating whether we are processing the content of a chapter + static bool inChapter = false; - static SWBuf header = ""; + // Flags indicating whether we are processing the content of a verse + static bool inVerse = false; - // Used to remember titles that need to be handle specially - static SWBuf lastTitle = ""; - static int titleOffset = -1; - static bool inTitle = false; - static int titleDepth = 0; + // Flags indicating 
whether we are processing the content of to be prepended to a verse + static bool inPreVerse = false; + static int genID = 1; - static ListKey lastVerseIDs = ListKey(); + // Flag indicating whether we are in "Words of Christ" + static bool inWOC = false; + // Tag for WOC quotes within a verse + static XMLTag wocTag = "<q who=\"Jesus\" marker=\"\">"; + + // Flag used to indicate where useful text begins + static bool firstDiv = false; + + // Stack of quote elements used to handle Words of Christ + static std::stack<XMLTag> quoteStack; // Stack of elements used to validate that books, chapters and verses are well-formed // This goes beyond simple xml well-formed and also considers milestoned div, chapter and verse // to be begin and end tags, too. // It is an error if books and chapters are not well formed (though not required by OSIS) // It is a warning that verses are not well formed (because some clients are not ready) - static std::stack<XMLTag*> tagStack; - // The following are used to validate well-formedness - static int chapterDepth = 0; - static int bookDepth = 0; - static int verseDepth = 0; - - int tagDepth = tagStack.size(); - const char *tokenName = token->getName(); - bool isEndTag = token->isEndTag() || token->getAttribute("eID"); - const char *typeAttr = token->getAttribute("type"); - - //Titles are treated specially. - // If the title has an attribute type of "main" or "chapter" - // it belongs to its <div> or <chapter> and is treated as part of its heading - // Otherwise if it a title in a chapter before the first the first verse it - // is put into the verse as a preverse title. 
- if (!token->isEmpty() && !isEndTag && titleDepth == 0 && (!strcmp(tokenName, "title")) && (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter")))) { - titleOffset = text.length(); //start of the title tag - lastTitle = ""; - inTitle = true; - tagStack.push(token); -#ifdef DEBUG_STACK - cout << currentOsisID << ": push (" << tagStack.size() << ") " << token->getName() << endl; -#endif - titleDepth = tagStack.size(); - return false; - } - // Check titleDepth since titles can be nested. Don't want to quit too early. - else if (isEndTag && tagDepth == titleDepth && (!strcmp(tokenName, "title"))) { - lastTitle.append(text.c_str() + titleOffset); //<title ...> up to the end </title> - lastTitle.append(*token); //</title> - -#ifdef DEBUG - cout << currentOsisID << ":" << endl; - cout << "\tlastTitle: " << lastTitle.c_str() << endl; - cout << "\ttext-lastTitle: " << text.c_str()+titleOffset << endl; - cout << "\ttext: " << text.c_str() << endl; -#endif - inTitle = false; - titleDepth = 0; -#ifdef DEBUG_STACK - cout << currentOsisID << ": pop(" << tagStack.size() << ") " << tagStack.top()->getName() << endl; -#endif - tagStack.pop(); - return false; // don't add </title> to the text itself - } - + static std::stack<XMLTag> tagStack; + // The following are used to validate well-formedness + static int chapterDepth = 0; + static int bookDepth = 0; + static int verseDepth = 0; -//-- START TAG ------------------------------------------------------------------------- + int tagDepth = tagStack.size(); + const char *tokenName = token.getName(); + bool isEndTag = token.isEndTag() || token.getAttribute("eID"); + const char *typeAttr = token.getAttribute("type"); + // process start tags if (!isEndTag) { // Remember non-empty start tags - if (!token->isEmpty()) { + if (!token.isEmpty()) { tagStack.push(token); -#ifdef DEBUG_STACK - cout << currentOsisID << ": push (" << tagStack.size() << ") " << token->getName() << endl; +#ifdef DEBUG + if (debug & DEBUG_STACK) { + 
cout << "DEBUG(STACK): " << currentOsisID << ": push (" << tagStack.size() << ") " << token.getName() << endl; + } #endif } - //-- WITH OSIS ID ------------------------------------------------------------------------- - //-- OR ANNOTATE REF ------------------------------------------------------------------------- - if (token->getAttribute("osisID") || token->getAttribute("annotateRef")) { + // throw away everything up to the first div + if (!firstDiv) { + if (!strcmp(tokenName, "div")) { +#ifdef DEBUG + if (debug & DEBUG_OTHER) { + cout << "DEBUG(FOUND): Found first div and pitching prior material: " << text << endl; + } +#endif + // TODO: Save off the content to use it to suggest the module's conf. + firstDiv = true; + text = ""; + } + else { + // Collect the content so it can be used to suggest the module's conf. + return false; + } + } + + //-- WITH osisID OR annotateRef ------------------------------------------------------------------------- + // Handle Book, Chapter, and Verse (or commentary equivalent) + if (token.getAttribute("osisID") || token.getAttribute("annotateRef")) { - // BOOK START + // BOOK START, <div type="book" ...> if ((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "book"))) { - inVerse = false; if (inBookHeader || inChapterHeader) { // this one should never happen, but just in case #ifdef DEBUG - cout << currentOsisID << ": HEADING "; + if (debug & DEBUG_TITLE) { + cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS HEADING " << endl; + cout << "\tinChapterHeader = " << inChapterHeader << endl; + cout << "\tinBookHeader = " << inBookHeader << endl; + } #endif - currentVerse->Testament(0); - currentVerse->Book(0); - currentVerse->Chapter(0); - currentVerse->Verse(0); - writeEntry(*currentVerse, text); + currentVerse.Testament(0); + currentVerse.Book(0); + currentVerse.Chapter(0); + currentVerse.Verse(0); + writeEntry(text); } - strcpy(currentOsisID, token->getAttribute("osisID")); - *currentVerse = currentOsisID; - 
currentVerse->Chapter(0); - currentVerse->Verse(0); - inBookHeader = true; + currentVerse = token.getAttribute("osisID"); + currentVerse.Chapter(0); + currentVerse.Verse(0); + strcpy(currentOsisID, currentVerse.getOSISRef()); + + inChapter = false; + inVerse = false; + inPreVerse = false; + inBookHeader = true; inChapterHeader = false; - lastTitle = ""; - text = ""; - bookDepth = tagStack.size(); - chapterDepth = 0; - verseDepth = 0; - inCanonicalOSISBook = isOSISAbbrev(token->getAttribute("osisID")); + bookDepth = tagStack.size(); + chapterDepth = 0; + verseDepth = 0; - return true; + inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID")); + if (!inCanonicalOSISBook) { + cout << "WARNING(V11N): New book is " << token.getAttribute("osisID") << " and is not in " << v11n << " versification, ignoring" << endl; + } +#ifdef DEBUG + else if (debug & DEBUG_OTHER) { + cout << "DEBUG(FOUND): New book is " << currentVerse.getOSISRef() << endl; + } +#endif + + return false; } - // CHAPTER START - else if (((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "chapter"))) - || (!strcmp(tokenName, "chapter")) - ) { - inVerse = false; + // CHAPTER START, <div type="chapter" ...> or <chapter ...> + if (((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "chapter"))) || + (!strcmp(tokenName, "chapter")) + ) { if (inBookHeader) { #ifdef DEBUG - cout << currentOsisID << ": BOOK HEADING "<< text.c_str() << endl; + if (debug & DEBUG_TITLE) { + cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK HEADING "<< text.c_str() << endl; + } #endif - writeEntry(*currentVerse, text); + writeEntry(text); } - strcpy(currentOsisID, token->getAttribute("osisID")); - *currentVerse = currentOsisID; - currentVerse->Verse(0); - inBookHeader = false; + currentVerse = token.getAttribute("osisID"); + currentVerse.Verse(0); +#ifdef DEBUG + if (debug & DEBUG_OTHER) { + cout << "DEBUG(FOUND): Current chapter is " << currentVerse.getOSISRef() << " (" << 
token.getAttribute("osisID") << ")" << endl; + } +#endif + strcpy(currentOsisID, currentVerse.getOSISRef()); + + inChapter = true; + inVerse = false; + inPreVerse = false; + inBookHeader = false; inChapterHeader = true; - lastTitle = ""; - text = ""; - chapterDepth = tagStack.size(); - verseDepth = 0; - return true; + chapterDepth = tagStack.size(); + verseDepth = 0; + + return false; } - // VERSE OR COMMENTARY START - else if (!strcmp(tokenName, "verse") || - (!strcmp(tokenName, "div") && - token->getAttribute("annotateType"))) { + // VERSE, <verse ...> OR COMMENTARY START, <div annotateType="xxx" ...> + if (!strcmp(tokenName, "verse") || + (!strcmp(tokenName, "div") && token.getAttribute("annotateType"))) { #ifdef DEBUG - cout << "Entering verse" << endl; + if (debug & DEBUG_OTHER) { + cout << "DEBUG(FOUND): Entering verse" << endl; + } #endif - inVerse = true; if (inChapterHeader) { SWBuf heading = text; - - //make sure we don't insert the preverse title which belongs to the first verse of this chapter! - // Did we have a preverse title? - if (lastTitle.length()) - { - //Was the preVerse title in the header (error if not)? - const char* header = heading.c_str(); - const char* preVerse = strstr(header, lastTitle); - if (preVerse) { - if (preVerse == header) { - heading = ""; // do nothing - } - else { - // remove everything before the title from the beginning. - text = preVerse; - // Remove text from the end of the header. - heading.setSize(preVerse - header); - } - } - else { - cout << currentOsisID << ": Warning: Bug in code. Could not find title." 
<< endl; - } - } - else { - text = ""; - } + text = ""; if (heading.length()) { #ifdef DEBUG - cout << currentOsisID << ": CHAPTER HEADING "<< heading.c_str() << endl; + if (debug & DEBUG_TITLE) { + cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER HEADING "<< heading.c_str() << endl; + } #endif - writeEntry(*currentVerse, heading); + writeEntry(heading); } inChapterHeader = false; } - SWBuf keyVal = token->getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID"); + // Did we have pre-verse material that needs to be marked? + if (inPreVerse) { + char genBuf[200]; + sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" eID=\"pv%d\"/>", genID++); + text.append(genBuf); + } + + // Get osisID for verse or annotateRef for commentary + SWBuf keyVal = token.getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID"); + + // Massage the key into a form that ParseVerseList can accept prepareSWVerseKey(keyVal); - lastVerseIDs = currentVerse->ParseVerseList(keyVal, *currentVerse, true); - // set currentVerse to the first value in the keyVal - VerseKey *element = SWDYNAMIC_CAST(VerseKey, lastVerseIDs.GetElement(0)); - if (element) { - *currentVerse = element->LowerBound().getText(); + // The osisID or annotateRef can be more than a single verse + // The first or only one is the currentVerse + // Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing. + // This should never happen if the references are valid OSIS references + ListKey verseKeys = currentVerse.ParseVerseList(keyVal, currentVerse, true); + int memberKeyCount = verseKeys.Count(); + if (memberKeyCount) { + currentVerse = verseKeys.getElement(0); + // See if this osisID or annotateRef refers to more than one verse. + // If it does, save it until all verses have been seen. + // At that point we will output links. + // This can be done by incrementing, which will produce an error + // if there is only one verse. 
+ verseKeys.setPosition(TOP); + verseKeys.increment(1); + if (!verseKeys.Error()) { + linkedVerses.push_back(verseKeys); + } } else { - *currentVerse = lastVerseIDs.GetElement(0)->getText(); + cout << "ERROR(REF): Invalid osisID/annotateRef: " << token.getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID") << endl; } - strcpy(currentOsisID, currentVerse->getOSISRef()); + strcpy(currentOsisID, currentVerse.getOSISRef()); #ifdef DEBUG - cout << "Current verse is " << *currentVerse << endl; - cout << "osisID/annotateRef is adjusted to" << keyVal << endl; + if (debug & DEBUG_OTHER) { + cout << "DEBUG(FOUND): New current verse is " << currentVerse.getOSISRef() << endl; + cout << "DEBUG(FOUND): osisID/annotateRef is adjusted to: " << keyVal << endl; + } #endif - verseDepth = tagStack.size(); + inVerse = true; + inPreVerse = false; + inBookHeader = false; + inChapterHeader = false; + verseDepth = tagStack.size(); - return true; - } - } - // Handle stuff between the verses - // Whitespace producing empty tokens are appended to prior entry - // Also the quote - // This is a hack to get ESV to work - else if (!inTitle && !inVerse && token->isEmpty()) { // && !inBookHeader && !inChapterHeader) { - if (!strcmp(tokenName, "p") || - !strcmp(tokenName, "div") || - !strcmp(tokenName, "q") || - !strcmp(tokenName, "l") || - !strcmp(tokenName, "lb") || - !strcmp(tokenName, "lg") - ) { + // Include the token if it is not a verse + if (strcmp(tokenName, "verse")) { + text.append(token); + } #ifdef DEBUG - if (token) { - cout << currentOsisID << ": appending interverse start token " << *token << ":" << text.c_str() << endl; + else if (debug & DEBUG_VERSE) + { + // transform the verse into a milestone + XMLTag t = "<milestone resp=\"v\" />"; + // copy all the attributes of the verse element to the milestone + StringList attrNames = token.getAttributeNames(); + for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) { + const char* attr = 
(*loop).c_str(); + t.setAttribute(attr, token.getAttribute(attr)); } + text.append(t); + } #endif - SWBuf tmp = token->toString(); - writeEntry(*currentVerse, tmp); + + if (inWOC) { + text.append(wocTag); + } return true; } + } // done with Handle Book, Chapter, and Verse (or commentary equivalent) + + // Now consider everything else. + + // Handle WOC quotes. + // Note this requires transformBSP to make them into milestones + // Otherwise have to do it here + if (!strcmp(tokenName, "q")) { + quoteStack.push(token); #ifdef DEBUG - else { - if (token) { - cout << currentOsisID << ": interverse start token " << *token << ":" << text.c_str() << endl; - } + if (debug & DEBUG_QUOTE) { + cout << "DEBUG(QUOTE): " << currentOsisID << ": quote top(" << quoteStack.size() << ") " << token << endl; } #endif + if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) { + inWOC = true; + + // Output per verse WOC markup. + text.append(wocTag); + + // Output the quotation mark if appropriate, inside the WOC. + // If there is no marker attribute, let the SWORD engine manufacture one. + // If there is a marker attribute and it has content, then output that. + // If the marker attribute is present and empty, then there is nothing to do. + // And have it within the WOC markup + if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) { + token.setAttribute("who", 0); // remove the who="Jesus" + text.append(token); + } + return true; + } + return false; + } + + // Have we found the start of pre-verse material? + // Pre-verse material follows the following rules + // 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book. + // 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter + // and the first verse of the chapter. + // A <div> with a type other than section will be taken as a chapter introduction. 
+ // A <title> of type acrostic, psalm or no type, will be taken as a title for the verse. + // A <title> of type main or chapter will be seen as a chapter title. + // 3) Between verses, the material is split between the prior verse and the next verse. + // Basically, while end and empty tags are found, they belong to the prior verse. + // Once a begin tag is found, it belongs to the next verse. + // If the title has an attribute type of "main" or "chapter" + // it belongs to its <div> or <chapter> and is treated as part of its heading + // Otherwise if it a title in a chapter before the first the first verse it + // is put into the verse as a preverse title. + + if (!inPreVerse && !inBookHeader) { + if (inChapterHeader) { + // Determine when we are no longer in a chapter heading, but in pre-verse material: + // If we see one of the following: + // a section div + // a title that is not main or chapter + if ((!strcmp(tokenName, "div") && (typeAttr && !strcmp(typeAttr, "section"))) || + (!strcmp(tokenName, "title") && (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter")))) + ) { + // Since we have found the boundary, we need to write out the chapter heading + writeEntry(text); + // And we are no longer in the chapter heading + inChapterHeader = false; + // But rather, we are now in pre-verse material + inPreVerse = true; + } + } + else if (!inVerse && inChapter) { + inPreVerse = true; + } + + if (inPreVerse) { + char genBuf[200]; + sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID++); + text.append(genBuf); + } } - } -//-- EMPTY and END TAG --------------------------------------------------------------------------------------------- +#ifdef DEBUG + if (debug & DEBUG_INTERVERSE) { + if (!inVerse && !inBookHeader && !inChapterHeader) { + cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl; + } + } +#endif + return false; + } // Done with procesing 
start and empty tags + + // Process end tags else { if (tagStack.empty()) { - cout << currentOsisID << ": tag expected" << endl; - exit(1); + cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl; + exit(EXIT_BAD_NESTING); } - XMLTag* topToken = 0; - if (!token->isEmpty()) { - topToken = tagStack.top(); + // Note: empty end tags have the eID attribute + if (!token.isEmpty()) { + XMLTag topToken = tagStack.top(); tagDepth = tagStack.size(); -#ifdef DEBUG_STACK - cout << currentOsisID << ": pop(" << tagDepth << ") " << topToken->getName() << endl; +#ifdef DEBUG + if (debug & DEBUG_STACK) { + cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl; + } #endif tagStack.pop(); - if (strcmp(topToken->getName(), tokenName)) { - cout << "Error: " << currentOsisID << ": Expected " << topToken->getName() << " found " << tokenName << endl; -// exit(1); // I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here. + if (strcmp(topToken.getName(), tokenName)) { + cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl; +// exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a but somewhere that's killing the converter here. // So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway. + // (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting. } } + // We haven't seen the first div so there is nothing to do. + if (!firstDiv) { + // Collect the content so it can be used to suggest the module's conf. 
+ return false; + } + // VERSE and COMMENTARY END if (!strcmp(tokenName, "verse") || (inVerse && !strcmp(tokenName, "div"))) { - inVerse = false; if (tagDepth != verseDepth) { - cout << "Warning verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl; + cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl; } - if (lastTitle.length()) { - const char* end = strchr(lastTitle, '>'); + // If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. + if (inWOC) { + text.append("</q>"); + } + + + // Include the token if it is not a verse + if (strcmp(tokenName, "verse")) { + text.append(token); + } #ifdef DEBUG - cout << currentOsisID << ":" << endl; - cout << "\t" << lastTitle << endl; - cout << "\tlength=" << int(end+1 - lastTitle.c_str()) << ", tag:" << lastTitle.c_str() << endl; + else if (debug & DEBUG_VERSE) + { + // transform the verse into a milestone + XMLTag t = "<milestone resp=\"v\" />"; + // copy all the attributes of the verse element to the milestone + StringList attrNames = token.getAttributeNames(); + for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) { + const char* attr = (*loop).c_str(); + t.setAttribute(attr, token.getAttribute(attr)); + } + text.append(t); + } #endif - SWBuf titleTagText; - titleTagText.append(lastTitle.c_str(), end+1 - lastTitle.c_str()); + writeEntry(text); + + inVerse = false; + inPreVerse = false; + verseDepth = 0; + + return true; + } + + // Handle WOC quotes. 
+ // Note this requires transformBSP to make them into milestones + // Otherwise have to manage it here + if (!strcmp(tokenName, "q")) { + XMLTag topToken = quoteStack.top(); #ifdef DEBUG - cout << currentOsisID << ": tagText: " << titleTagText.c_str() << endl;; + if (debug & DEBUG_QUOTE) { + cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl; + } #endif + quoteStack.pop(); - XMLTag titleTag(titleTagText); - titleTag.setAttribute("type", "section"); - titleTag.setAttribute("subType", "x-preverse"); - - //we insert the title into the text again - make sure to remove the old title text - const char* pos = strstr(text, lastTitle); - if (pos) { - SWBuf temp; - temp.append(text, pos-text.c_str()); - temp.append(pos+lastTitle.length()); - text = temp; + // If we have found an end tag for a <q who="Jesus"> then we are done with the WOC + // and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse. 
+ if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) { +#ifdef DEBUG + if (debug & DEBUG_QUOTE) { + cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl; } - - //if a title was already inserted at the beginning insert this one after that first title - int titlePos = 0; - if (!strncmp(text.c_str(),"<title ",7)) { - const char* tmp = strstr(text.c_str(), "</title>"); - if (tmp) { - titlePos = (tmp-text.c_str()) + 8; - } +#endif + inWOC = false; + const char *sID = topToken.getAttribute("sID"); + const char *eID = token.getAttribute("eID"); + if (!sID) { + sID = ""; } - text.insert(titlePos, end+1); - text.insert(titlePos, titleTag); - } - // text += token; - writeEntry(*currentVerse, text); - - // If we found an osisID like osisID="Gen.1.1 Gen.1.2 Gen.1.3" we have to link Gen.1.2 and Gen.1.3 to Gen.1.1 - VerseKey dest = *currentVerse; - VerseKey linkKey; - linkKey.AutoNormalize(0); - linkKey.Headings(1); // turn on mod/testmnt/book/chap headings - linkKey.Persist(1); - for (lastVerseIDs = TOP; !lastVerseIDs.Error(); lastVerseIDs++) { - linkKey = lastVerseIDs; - - if (linkKey.Verse() != dest.Verse() || - linkKey.Chapter() != dest.Chapter() || - linkKey.Book() != dest.Book() || - linkKey.Testament() != dest.Testament()) - { - *currentVerse = linkKey; - linkToEntry(dest); + if (!eID) { + eID = ""; + } + if (strcmp(sID, eID)) { + cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl; } - } - lastTitle = ""; - text = ""; - verseDepth = 0; - return true; + + // Output the quotation mark if appropriate, inside the WOC. + // If there is no marker attribute, let the SWORD engine manufacture one. + // If there is a marker attribute and it has content, then output that. + // If the marker attribute is present and empty, then there is nothing to do. 
+ // And have it within the WOC markup + if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) { + token.setAttribute("who", 0); // remove the who="Jesus" + text.append(token); + } + + // Now close the WOC + text.append("</q>"); + return true; + } + return false; } - else if (!inTitle && !inVerse && !inBookHeader && !inChapterHeader) { + + // Look for the end of document, book and chapter + // Also for material that goes with last entry + if (!inVerse && !inBookHeader && !inChapterHeader) { // Is this the end of a chapter. if (tagDepth == chapterDepth && (!strcmp(tokenName, "div") || !strcmp(tokenName, "chapter"))) { + text.append(token); + writeEntry(text); + inChapter = false; chapterDepth = 0; - verseDepth = 0; - text = ""; + verseDepth = 0; return true; } - // Or is it the end of a book - else if (tagDepth == bookDepth && (!strcmp(tokenName, "div"))) { - bookDepth = 0; + + // Is it the end of a book + if (tagDepth == bookDepth && (!strcmp(tokenName, "div"))) { + text.append(token); + writeEntry(text); + bookDepth = 0; chapterDepth = 0; - verseDepth = 0; - text = ""; + verseDepth = 0; return true; } - // Or is it the end of an osis document - else if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) { - bookDepth = 0; + + // Do not include the end of an osis document + if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) { + bookDepth = 0; chapterDepth = 0; - verseDepth = 0; - text = ""; + verseDepth = 0; + text = ""; return true; } - // OTHER MISC END TAGS WHEN !INVERSE - // Test that is between verses, or after the last is appended to the preceeding verse. - else if (!strcmp(tokenName, "p") || - !strcmp(tokenName, "div") || - !strcmp(tokenName, "q") || - !strcmp(tokenName, "l") || - !strcmp(tokenName, "lb") || - !strcmp(tokenName, "lg") - ) { - text.append(*token); - writeEntry(*currentVerse, text); - text = ""; + + // When we are not inPreVerse, the interverse tags get appended to the preceeding verse. 
+ if (!inPreVerse) { + text.append(token); + writeEntry(text); #ifdef DEBUG - cout << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; + if (debug & DEBUG_INTERVERSE) { + cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; + } #endif return true; } + #ifdef DEBUG - cout << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; + if (debug & DEBUG_INTERVERSE) { + cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl; + } #endif + return false; + } - } + + return false; + } // done with Processing end tags + return false; } -XMLTag* transform(XMLTag* t) { - static std::stack<XMLTag*> tagStack; +/** + * Support normalizations necessary for a SWORD module. + * OSIS allows for document structure (Book, Section, Paragraph or BSP) + * to overlap Bible versification (Book, Chapter, Verse). + * Most SWORD applications need to display verses in isolation or in HTML table cells, + * requiring each stored entry (i.e. verses) to be well-formed xml. + * This routine normalizes container elements which could cross verse boundaries into milestones. + * For most of these OSIS elements, there is a milestone form. However, p is not milestoneable. + * For this reason, p is transformed into lb elements. 
+ * param t the tag to transform + * return the transformed tag or the original one + */ +XMLTag transformBSP(XMLTag t) { + static std::stack<XMLTag> bspTagStack; static int sID = 1; char buf[11]; // Support simplification transformations - if (!t->isEmpty()) { - if (!t->isEndTag()) { - tagStack.push(t); -#ifdef DEBUG_XFORM - cout << currentOsisID << ": xform push (" << tagStack.size() << ") " << t->getName() << endl; + if (t.isEmpty()) { +#ifdef DEBUG + if (debug & DEBUG_XFORM) { + cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl; + } #endif - // Transform <q> into <q sID=""/> except for <q who="Jesus"> - if ((!strcmp(t->getName(), "q")) && (!t->getAttribute("who") || strcmp(t->getAttribute("who"), "Jesus"))) { - t->setEmpty(true); - sprintf(buf, "q%d", sID++); - t->setAttribute("sID", buf); - } + return t; + } - // Transform <p> into <lb type="x-begin-paragraph"/> - else if (!strcmp(t->getName(), "p")) { - // note there is no process that should care about type, it is there for reversability - t->setText("<lb type=\"x-begin-paragraph\" />"); - } + const char* tagName = t.getName(); + if (!t.isEndTag()) { + // Transform <p> into <div type="paragraph"> and milestone it + if (!strcmp(tagName, "p")) { + t.setText("<div type=\"paragraph\" />"); + sprintf(buf, "gen%d", sID++); + t.setAttribute("sID", buf); + } + + // Transform <tag> into <tag sID="">, where tag is a milestoneable element. + // The following containers are milestoneable. + // abbr, closer, div, foreign, l, lg, salute, signed, speech + // Leaving out: + // abbr When would this ever cross a boundary? 
+ // seg as it is used for a divineName hack + // foreign so that it can be easily italicized + else if (!strcmp(tagName, "chapter") || + !strcmp(tagName, "closer") || + !strcmp(tagName, "div") || + !strcmp(tagName, "l") || + !strcmp(tagName, "lg") || + !strcmp(tagName, "q") || + !strcmp(tagName, "salute") || + !strcmp(tagName, "signed") || + !strcmp(tagName, "speech") || + !strcmp(tagName, "verse") + ) { + t.setEmpty(true); + sprintf(buf, "gen%d", sID++); + t.setAttribute("sID", buf); + } + bspTagStack.push(t); +#ifdef DEBUG + if (debug & DEBUG_XFORM) { + cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl; + XMLTag topToken = bspTagStack.top(); + cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl; } - else { - XMLTag *topToken = tagStack.top(); -#ifdef DEBUG_XFORM - cout << currentOsisID << ": xform pop(" << tagStack.size() << ") " << topToken->getName() << endl; #endif - tagStack.pop(); + } + else { + XMLTag topToken = bspTagStack.top(); +#ifdef DEBUG + if (debug & DEBUG_XFORM) { + cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << bspTagStack.size() << ") " << topToken << endl; + } +#endif + bspTagStack.pop(); + + // Look for the milestoneable container tags handled above. 
+ if (!strcmp(tagName, "chapter") || + !strcmp(tagName, "closer") || + !strcmp(tagName, "div") || + !strcmp(tagName, "l") || + !strcmp(tagName, "lg") || + !strcmp(tagName, "p") || + !strcmp(tagName, "q") || + !strcmp(tagName, "salute") || + !strcmp(tagName, "signed") || + !strcmp(tagName, "speech") || + !strcmp(tagName, "verse") + ) { + // make this a clone of the start tag with sID changed to eID + // Note: in the case of </p> the topToken is a <div type="paragraph"> + t = topToken; + t.setAttribute("eID", t.getAttribute("sID")); + t.setAttribute("sID", 0); + } + } - // If we have found an end tag for a <q> that was transformed then transform this one as well. - if ((!strcmp(t->getName(), "q")) && (!strcmp(topToken->getName(), "q")) && (!topToken->getAttribute("who") || strcmp(topToken->getAttribute("who"), "Jesus"))) { - // make this a clone of the start tag with sID changed to eID - *t = *topToken; - t->setAttribute("eID", t->getAttribute("sID")); - t->setAttribute("sID", 0); - } + return t; +} - // Look for paragraph tags. - // If we have found an end tag for a <p> that was transformed then transform this as well. - else if ((!strcmp(t->getName(), "p")) && (!strcmp(topToken->getName(), "lb"))) { - t->setText("<lb type=\"x-end-paragraph\" />"); - } +/** + * Write out all links in the module. + * Waiting is necessary because writeEntry might ultimately append + * text to a verse moving it's offset in the data file. + * While we are minimizing it by postponing the write until we have + * gathered the next verse, the following scenario is happening: + * A module is using linked verses and has some verses that are not + * in the chosen versification. If the out-of-canon verse happens following + * a linked verse, the out-of-canon verse is appended to the prior + * verse. Care has to be taken that the linked verses all point to + * the first of the set. 
+ */ +void writeLinks() +{ + // Link all the verses + VerseKey destKey; + destKey.setVersificationSystem(currentVerse.getVersificationSystem()); + destKey.AutoNormalize(0); + destKey.Headings(1); + + VerseKey linkKey; + linkKey.setVersificationSystem(currentVerse.getVersificationSystem()); + linkKey.AutoNormalize(0); + linkKey.Headings(1); + for (unsigned int i = 0; i < linkedVerses.size(); i++) { + // The verseKeys is a list of verses + // where the first is the real verse + // and the others link to it. + ListKey verseKeys = linkedVerses[i]; + verseKeys.setPosition(TOP); + destKey = verseKeys.getElement(); + verseKeys.increment(1); + + while (!verseKeys.Error()) { + linkKey = verseKeys.getElement(); + verseKeys.increment(1); + linkToEntry(linkKey, destKey); } } - return t; } void usage(const char *app, const char *error = 0) { @@ -875,6 +1262,8 @@ void usage(const char *app, const char *error = 0) { if (error) fprintf(stderr, "\n%s: %s\n", app, error); fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app); + fprintf(stderr, " <output/path>\t\t an existing folder that the module will be written\n"); + fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to read from standard input\n"); fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n"); fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n"); fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n"); @@ -882,15 +1271,133 @@ void usage(const char *app, const char *error = 0) { fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n"); fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n"); fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); - fprintf(stderr, " -N\t\t\t Do not convert UTF-8 or normalize UTF-8 to NFC\n"); - fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed, and then normalize to NFC"); - fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should 
be normalized to NFC\n"); - exit(-1); + fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n"); + fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n"); + fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n"); + fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n"); + fprintf(stderr, " -s <2|4>\t\t max text size per entry (default is 2).\n"); + fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large entries\n"); + fprintf(stderr, "\t\t\t\t in uncompressed modules (default is 65535 bytes)\n"); + fprintf(stderr, " -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n"); + fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:\n"); + VerseMgr *vmgr = VerseMgr::getSystemVerseMgr(); + StringList av11n = vmgr->getVersificationSystems(); + for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) { + fprintf(stderr, "\t\t\t\t\t%s\n", (*loop).c_str()); + } +#ifdef DEBUG + fprintf(stderr, " -d <flags>\t\t turn on debugging (default is 0)\n"); + fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n"); + fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n"); + fprintf(stderr, "\t\t\t\t\t0 - no debugging\n"); + fprintf(stderr, "\t\t\t\t\t1 - writes to module, very verbose\n"); + fprintf(stderr, "\t\t\t\t\t2 - verse start and end\n"); + fprintf(stderr, "\t\t\t\t\t4 - quotes, especially Words of Christ (WOC)\n"); + fprintf(stderr, "\t\t\t\t\t8 - titles\n"); + fprintf(stderr, "\t\t\t\t\t16 - inter-verse material\n"); + fprintf(stderr, "\t\t\t\t\t32 - BSP to BCV transformations\n"); + fprintf(stderr, "\t\t\t\t\t64 - v11n exceptions\n"); + fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n"); + fprintf(stderr, "\t\t\t\t\t256 - internal stack\n"); + fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n"); + fprintf(stderr, "\t\t\t\t This flag can be used more than once.\n"); +#endif + fprintf(stderr, "\n"); 
+ fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n"); + fprintf(stderr, "\n"); + exit(EXIT_BAD_ARG); +} + +void processOSIS(istream& infile) { + activeOsisID[0] = '\0'; + + strcpy(currentOsisID,"N/A"); + + currentVerse.setVersificationSystem(v11n); + currentVerse.AutoNormalize(0); + currentVerse.Headings(1); // turn on mod/testmnt/book/chap headings + currentVerse.Persist(1); + + module->setKey(currentVerse); + module->setPosition(TOP); + + SWBuf token; + SWBuf text; + bool intoken = false; + bool inWhitespace = false; + bool seeingSpace = false; + char curChar = '\0'; + + while (infile.good()) { + + curChar = infile.get(); + + // skip the character if it is bad. infile.good() will catch the problem + if (curChar == -1) { + continue; + } + + if (!intoken && curChar == '<') { + intoken = true; + token = "<"; + continue; + } + + // Outside of tokens merge adjacent whitespace + if (!intoken) { + seeingSpace = isspace(curChar); + if (seeingSpace) { + if (inWhitespace) { + continue; + } + // convert all whitespace to blanks + curChar = ' '; + } + inWhitespace = seeingSpace; + } + + if (intoken && curChar == '>') { + intoken = false; + inWhitespace = false; + token.append('>'); + // take this isalpha if out to check for bugs in text + if ((isalpha(token[1])) || (isalpha(token[2]))) { + //cout << "Handle:" << token.c_str() << endl; + XMLTag t = transformBSP(token.c_str()); + + if (!handleToken(text, t)) { + text.append(t); + } + } + continue; + } + + if (intoken) { + token.append(curChar); + } + else { + switch (curChar) { + case '>' : text.append(">"); break; + case '<' : text.append("<"); break; + default : text.append(curChar); break; + } + } + } + + // Force the last entry from the text buffer. 
+ text = ""; + writeEntry(text, true); + writeLinks(); + +#ifdef _ICU_ + if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted); + if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized); +#endif } int main(int argc, char **argv) { - fprintf(stderr, "You are running osis2mod: $Rev: 2169 $\n"); + fprintf(stderr, "You are running osis2mod: $Rev: 2400 $\n"); // Let's test our command line arguments if (argc < 3) { @@ -898,14 +1405,15 @@ int main(int argc, char **argv) { } // variables for arguments, holding defaults - const char* program = argv[0]; - const char* path = argv[1]; - const char* osisDoc = argv[2]; - int append = 0; - int compType = 0; - int iType = 4; - string cipherKey = ""; - + const char* program = argv[0]; + const char* path = argv[1]; + const char* osisDoc = argv[2]; + int append = 0; + SWBuf compType = ""; + bool isCommentary = false; + int iType = 4; + int entrySize = 0; + SWBuf cipherKey = ""; SWCompress *compressor = 0; for (int i = 3; i < argc; i++) { @@ -913,12 +1421,14 @@ int main(int argc, char **argv) { append = 1; } else if (!strcmp(argv[i], "-z")) { - if (compType) usage(*argv, "Cannot specify both -z and -Z"); - compType = 2; + if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); + if (entrySize) usage(*argv, "Cannot specify both -z and -s"); + compType = "ZIP"; } else if (!strcmp(argv[i], "-Z")) { - if (compType) usage(*argv, "Cannot specify both -z and -Z"); - compType = 1; + if (compType.size()) usage(*argv, "Cannot specify both -z and -Z"); + if (entrySize) usage(*argv, "Cannot specify both -Z and -s"); + compType = "LZSS"; } else if (!strcmp(argv[i], "-b")) { if (i+1 < argc) { @@ -934,62 +1444,127 @@ int main(int argc, char **argv) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires <cipher_key>"); } + else if (!strcmp(argv[i], "-v")) { + if (i+1 < argc) v11n = argv[++i]; + else usage(*argv, "-v requires <v11n>"); + } + else if 
(!strcmp(argv[i], "-s")) { + if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z"); + if (i+1 < argc) { + entrySize = atoi(argv[++i]); + if (entrySize == 2 || entrySize == 4) { + continue; + } + } + usage(*argv, "-s requires one of <2|4>"); + } + else if (!strcmp(argv[i], "-C")) { + isCommentary = true; + } +#ifdef DEBUG + else if (!strcmp(argv[i], "-d")) { + if (i+1 < argc) debug |= atoi(argv[++i]); + else usage(*argv, "-d requires <flags>"); + } +#endif else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str()); } - switch (compType) { // these are deleted by zText - case 0: break; - case 1: compressor = new LZSSCompress(); break; - case 2: compressor = new ZipCompress(); break; - } + if (compType == "ZIP") { + compressor = new ZipCompress(); + } + else if (compType == "LZSS") { + compressor = new LZSSCompress(); + } #ifndef _ICU_ if (normalize) { normalize = false; - cout << program << " is not compiled with support for ICU. Ignoring -n flag." << endl; + cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl; } #endif #ifdef DEBUG - cout << "path: " << path << " osisDoc: " << osisDoc << " create: " << append << " compressType: " << compType << " blockType: " << iType << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n"; - cout << ""; -// exit(-3); + if (debug & DEBUG_OTHER) { + cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl; + } #endif - if (!append) { // == 0 then create module // Try to initialize a default set of datafiles and indicies at our // datapath location passed to us from the user. 
- if ( compressor ) { - if ( zText::createModule(path, iType) ) { - fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program, path); - exit(-3); + if (compressor) { + if (zText::createModule(path, iType, v11n)) { + fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); + exit(EXIT_NO_CREATE); + } + } + else if (entrySize == 4) { + if (RawText4::createModule(path, v11n)) { + fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); + exit(EXIT_NO_CREATE); } } - else if (RawText::createModule(path)) { - fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program, path); - exit(-3); + else { + if (RawText::createModule(path, v11n)) { + fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path); + exit(EXIT_NO_CREATE); + } } } - // Let's see if we can open our input file - ifstream infile(osisDoc); - if (infile.fail()) { - fprintf(stderr, "error: %s: couldn't open input file: %s \n", program, osisDoc); - exit(-2); - } - // Do some initialization stuff if (compressor) { - module = new zText(path, 0, 0, iType, compressor); + // Create a compressed text module allowing very large entries + // Taking defaults except for first, fourth, fifth and last argument + module = new zText( + path, // ipath + 0, // iname + 0, // idesc + iType, // iblockType + compressor, // icomp + 0, // idisp + ENC_UNKNOWN, // enc + DIRECTION_LTR, // dir + FMT_UNKNOWN, // markup + 0, // lang + v11n // versification + ); + } + else if (entrySize == 4) { + // Create a raw text module allowing very large entries + // Taking defaults except for first and last argument + module = new RawText4( + path, // ipath + 0, // iname + 0, // idesc + 0, // idisp + ENC_UNKNOWN, // encoding + DIRECTION_LTR, // dir + FMT_UNKNOWN, // markup + 0, // ilang + v11n // versification + ); } - else{ - module = new RawText(path); // open our datapath with our RawText driver. 
+ else { + // Create a raw text module allowing reasonable sized entries + // Taking defaults except for first and last argument + module = new RawText( + path, // ipath + 0, // iname + 0, // idesc + 0, // idisp + ENC_UNKNOWN, // encoding + DIRECTION_LTR, // dir + FMT_UNKNOWN, // markup + 0, // ilang + v11n // versification + ); } SWFilter *cipherFilter = 0; - if (!cipherKey.empty()) { + if (cipherKey.length()) { fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); cipherFilter = new CipherFilter(cipherKey.c_str()); module->AddRawFilter(cipherFilter); @@ -997,94 +1572,29 @@ int main(int argc, char **argv) { if (!module->isWritable()) { fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" ); - exit(-1); + exit(EXIT_NO_WRITE); } - activeOsisID[0] = '\0'; - strcpy(currentOsisID,"N/A"); - - currentVerse = new VerseKey(); - currentVerse->AutoNormalize(0); - currentVerse->Headings(1); // turn on mod/testmnt/book/chap headings - currentVerse->Persist(1); - - module->setKey(*currentVerse); - - (*module) = TOP; - - SWBuf token; - SWBuf text; - bool intoken = false; - bool inWhitespace = false; - bool seeingSpace = false; - char curChar = '\0'; - - while (infile.good()) { - - curChar = infile.get(); - - // skip the character if it is bad. 
infile.good() will catch the problem - if (curChar == -1) { - continue; - } - - if (!intoken && curChar == '<') { - intoken = true; - token = "<"; - continue; - } - - // Outside of tokens merge adjacent whitespace - if (!intoken) { - seeingSpace = isspace(curChar); - if (seeingSpace) { - if (inWhitespace) { - continue; - } - // convert all whitespace to blanks - curChar = ' '; - } - inWhitespace = seeingSpace; - } - - if (intoken && curChar == '>') { - intoken = false; - inWhitespace = false; - token.append('>'); - // take this isalpha if out to check for bugs in text - if ((isalpha(token[1])) || (isalpha(token[2]))) { - //cout << "Handle:" << token.c_str() << endl; - XMLTag *t = new XMLTag(token.c_str()); - - if (!handleToken(text, transform(t))) { - text.append(*t); - } - } - continue; + // Either read from std::cin (aka stdin), when the argument is a '-' + // or from a specified file. + if (!strcmp(osisDoc, "-")) { + processOSIS(cin); + } + else { + // Let's see if we can open our input file + ifstream infile(osisDoc); + if (infile.fail()) { + fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc); + exit(EXIT_NO_READ); } - - if (intoken) - token.append(curChar); - else - switch (curChar) { - case '>' : text.append("&gt;"); break; - case '<' : text.append("&lt;"); break; - default : text.append(curChar); break; - } + processOSIS(infile); + infile.close(); } - // Force the last entry from the text buffer. - text = ""; - writeEntry(*currentVerse, text, true); delete module; - delete currentVerse; if (cipherFilter) delete cipherFilter; - infile.close(); -#ifdef _ICU_ - if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted); - if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized); -#endif + exit(0); // success } |