/****************************************************************************** * * swmodule.cpp - code for base class 'SWModule'. SWModule is the basis * for all types of modules (e.g. texts, commentaries, * maps, lexicons, etc.) * * $Id: swmodule.cpp 3515 2017-11-01 11:38:09Z scribe $ * * Copyright 1999-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society * P. O. Box 2528 * Tempe, AZ 85280-2528 * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation version 2. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * */ #include #include #include #include #include #include #include // KLUDGE for Search #include // KLUDGE for Search #include #include #include #ifndef _MSC_VER #include #endif #if defined(USECXX11REGEX) #include #ifndef REG_ICASE #define REG_ICASE std::regex::icase #endif #elif defined(USEICUREGEX) #include #ifndef REG_ICASE #define REG_ICASE UREGEX_CASE_INSENSITIVE #endif #else #include // GNU #endif #if defined USEXAPIAN #include #elif defined USELUCENE #include //Lucence includes //#include "CLucene.h" //#include "CLucene/util/Reader.h" //#include "CLucene/util/Misc.h" //#include "CLucene/util/dirent.h" using namespace lucene::index; using namespace lucene::analysis; using namespace lucene::util; using namespace lucene::store; using namespace lucene::document; using namespace lucene::queryParser; using namespace lucene::search; #endif using std::vector; SWORD_NAMESPACE_START SWModule::StdOutDisplay SWModule::rawdisp; typedef std::list StringList; /****************************************************************************** * SWModule Constructor - Initializes data for instance of SWModule * * ENT: imodname - Internal name for module * imoddesc - Name to display to user for module * idisp - Display object to use for displaying * imodtype - Type of Module (All modules will be displayed with * others of same type under their modtype heading * unicode - if this module is unicode */ SWModule::SWModule(const char *imodname, const char *imoddesc, SWDisplay *idisp, const char *imodtype, SWTextEncoding encoding, SWTextDirection direction, SWTextMarkup markup, const char *imodlang) { key = createKey(); entryBuf = ""; config = &ownConfig; modname = 0; error = 0; moddesc = 0; modtype = 0; modlang = 0; this->encoding = encoding; this->direction = direction; this->markup = markup; entrySize= -1; disp = (idisp) ? idisp : &rawdisp; stdstr(&modname, imodname); stdstr(&moddesc, imoddesc); stdstr(&modtype, imodtype); stdstr(&modlang, imodlang); stripFilters = new FilterList(); rawFilters = new FilterList(); renderFilters = new FilterList(); optionFilters = new OptionFilterList(); encodingFilters = new FilterList(); skipConsecutiveLinks = true; procEntAttr = true; } /****************************************************************************** * SWModule Destructor - Cleans up instance of SWModule */ SWModule::~SWModule() { if (modname) delete [] modname; if (moddesc) delete [] moddesc; if (modtype) delete [] modtype; if (modlang) delete [] modlang; if (key) { if (!key->isPersist()) delete key; } stripFilters->clear(); rawFilters->clear(); renderFilters->clear(); optionFilters->clear(); encodingFilters->clear(); entryAttributes.clear(); delete stripFilters; delete rawFilters; delete renderFilters; delete optionFilters; delete encodingFilters; } /****************************************************************************** * SWModule::createKey - Allocates a key of specific type for module * * RET: pointer to allocated key */ SWKey *SWModule::createKey() const { return new SWKey(); } /****************************************************************************** * SWModule::popError - Gets and clears error status * * RET: error status */ char SWModule::popError() { char retval = error; error = 0; if (!retval) retval = key->popError(); return retval; } /****************************************************************************** * SWModule::Name - Sets/gets module name * * ENT: imodname - value which to set modname * [0] - only get * * RET: pointer to modname */ const char *SWModule::getName() const { return modname; } /****************************************************************************** * SWModule::Description - Sets/gets module description * * ENT: imoddesc - value which to set moddesc * [0] - only get * * RET: pointer to moddesc */ const char *SWModule::getDescription() const { return moddesc; } /****************************************************************************** * SWModule::Type - Sets/gets module type * * ENT: imodtype - value which to set modtype * [0] - only get * * RET: pointer to modtype */ const char *SWModule::getType() const { return modtype; } /****************************************************************************** * SWModule::getDirection - Sets/gets module direction * * ENT: newdir - value which to set direction * [-1] - only get * * RET: char direction */ char SWModule::getDirection() const { return direction; } /****************************************************************************** * SWModule::Disp - Sets/gets display driver * * ENT: idisp - value which to set disp * [0] - only get * * RET: pointer to disp */ SWDisplay *SWModule::getDisplay() const { return disp; } void SWModule::setDisplay(SWDisplay *idisp) { disp = idisp; } /****************************************************************************** * * SWModule::Display - Calls this modules display object and passes itself * * * * RET: error status * */ char SWModule::display() { disp->display(*this); return 0; } /****************************************************************************** * SWModule::getKey - Gets the key from this module that points to the position * record * * RET: key object */ SWKey *SWModule::getKey() const { return key; } /****************************************************************************** * SWModule::setKey - Sets a key to this module for position to a particular * record * * ENT: ikey - key with which to set this module * * RET: error status */ char SWModule::setKey(const SWKey *ikey) { SWKey *oldKey = 0; if (key) { if (!key->isPersist()) // if we have our own copy oldKey = key; } if (!ikey->isPersist()) { // if we are to keep our own copy key = createKey(); *key = *ikey; } else key = (SWKey *)ikey; // if we are to just point to an external key if (oldKey) delete oldKey; return error = key->getError(); } /****************************************************************************** * SWModule::setPosition(SW_POSITION) - Positions this modules to an entry * * ENT: p - position (e.g. TOP, BOTTOM) * * RET: *this */ void SWModule::setPosition(SW_POSITION p) { *key = p; char saveError = key->popError(); switch (p) { case POS_TOP: this->increment(); this->decrement(); break; case POS_BOTTOM: this->decrement(); this->increment(); break; } error = saveError; } /****************************************************************************** * SWModule::increment - Increments module key a number of entries * * ENT: increment - Number of entries to jump forward * * RET: *this */ void SWModule::increment(int steps) { (*key) += steps; error = key->popError(); } /****************************************************************************** * SWModule::decrement - Decrements module key a number of entries * * ENT: decrement - Number of entries to jump backward * * RET: *this */ void SWModule::decrement(int steps) { (*key) -= steps; error = key->popError(); } /****************************************************************************** * SWModule::Search - Searches a module for a string * * ENT: istr - string for which to search * searchType - type of search to perform * >=0 - regex; (for backward compat, if > 0 then used as additional REGEX FLAGS) * -1 - phrase * -2 - multiword * -3 - entryAttrib (eg. Word//Lemma./G1234/) (Lemma with dot means check components (Lemma.[1-9]) also) * -4 - clucene * -5 - multilemma window; flags = window size * flags - options flags for search * justCheckIfSupported - if set, don't search, only tell if this * function supports requested search. * * RET: ListKey set to verses that contain istr */ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *scope, bool *justCheckIfSupported, void (*percent)(char, void *), void *percentUserData) { listKey.clear(); SWBuf term = istr; bool includeComponents = false; // for entryAttrib e.g., /Lemma.1/ SWBuf target = getConfigEntry("AbsoluteDataPath"); if (!target.endsWith("/") && !target.endsWith("\\")) { target.append('/'); } #if defined USEXAPIAN target.append("xapian"); #elif defined USELUCENE target.append("lucene"); #endif if (justCheckIfSupported) { *justCheckIfSupported = (searchType >= -3); #if defined USEXAPIAN if ((searchType == -4) && (FileMgr::existsDir(target))) { *justCheckIfSupported = true; } #elif defined USELUCENE if ((searchType == -4) && (IndexReader::indexExists(target.c_str()))) { *justCheckIfSupported = true; } #endif return listKey; } SWKey *saveKey = 0; SWKey *searchKey = 0; SWKey *resultKey = createKey(); SWKey *lastKey = createKey(); SWBuf lastBuf = ""; #ifdef USECXX11REGEX std::locale oldLocale; std::locale::global(std::locale("en_US.UTF-8")); std::regex preg; #elif defined(USEICUREGEX) RegexMatcher *matcher = 0; #else regex_t preg; #endif vector words; vector window; const char *sres; terminateSearch = false; char perc = 1; bool savePEA = isProcessEntryAttributes(); // determine if we might be doing special strip searches. useful for knowing if we can use shortcuts bool specialStrips = (getConfigEntry("LocalStripFilter") || (getConfig().has("GlobalOptionFilter", "UTF8GreekAccents")) || (getConfig().has("GlobalOptionFilter", "UTF8HebrewPoints")) || (getConfig().has("GlobalOptionFilter", "UTF8ArabicPoints")) || (strchr(istr, '<'))); setProcessEntryAttributes(searchType == -3); if (!key->isPersist()) { saveKey = createKey(); *saveKey = *key; } else saveKey = key; searchKey = (scope)?scope->clone():(key->isPersist())?key->clone():0; if (searchKey) { searchKey->setPersist(true); setKey(*searchKey); } (*percent)(perc, percentUserData); *this = BOTTOM; long highIndex = key->getIndex(); if (!highIndex) highIndex = 1; // avoid division by zero errors. *this = TOP; if (searchType >= 0) { #ifdef USECXX11REGEX preg = std::regex((SWBuf(".*")+istr+".*").c_str(), std::regex_constants::extended | searchType | flags); #elif defined(USEICUREGEX) UErrorCode status = U_ZERO_ERROR; matcher = new RegexMatcher(istr, searchType | flags, status); if (U_FAILURE(status)) { SWLog::getSystemLog()->logError("Error compiling Regex: %d", status); return listKey; } #else flags |=searchType|REG_NOSUB|REG_EXTENDED; int err = regcomp(&preg, istr, flags); if (err) { SWLog::getSystemLog()->logError("Error compiling Regex: %d", err); return listKey; } #endif } (*percent)(++perc, percentUserData); #if defined USEXAPIAN || defined USELUCENE (*percent)(10, percentUserData); if (searchType == -4) { // indexed search #if defined USEXAPIAN SWTRY { Xapian::Database database(target.c_str()); Xapian::QueryParser queryParser; queryParser.set_default_op(Xapian::Query::OP_AND); SWTRY { queryParser.set_stemmer(Xapian::Stem(getLanguage())); } SWCATCH(...) {} queryParser.set_stemming_strategy(queryParser.STEM_SOME); queryParser.add_prefix("content", "C"); queryParser.add_prefix("lemma", "L"); queryParser.add_prefix("morph", "M"); queryParser.add_prefix("prox", "P"); queryParser.add_prefix("proxlem", "PL"); queryParser.add_prefix("proxmorph", "PM"); #elif defined USELUCENE lucene::index::IndexReader *ir = 0; lucene::search::IndexSearcher *is = 0; Query *q = 0; Hits *h = 0; SWTRY { ir = IndexReader::open(target); is = new IndexSearcher(ir); const TCHAR *stopWords[] = { 0 }; standard::StandardAnalyzer analyzer(stopWords); #endif // parse the query #if defined USEXAPIAN Xapian::Query q = queryParser.parse_query(istr); Xapian::Enquire enquire = Xapian::Enquire(database); #elif defined USELUCENE q = QueryParser::parse((wchar_t *)utf8ToWChar(istr).getRawData(), _T("content"), &analyzer); #endif (*percent)(20, percentUserData); // perform the search #if defined USEXAPIAN enquire.set_query(q); Xapian::MSet h = enquire.get_mset(0, 99999); #elif defined USELUCENE h = is->search(q); #endif (*percent)(80, percentUserData); // iterate thru each good module position that meets the search bool checkBounds = getKey()->isBoundSet(); #if defined USEXAPIAN Xapian::MSetIterator i; for (i = h.begin(); i != h.end(); ++i) { // cout << "Document ID " << *i << "\t"; __u64 score = i.get_percent(); Xapian::Document doc = i.get_document(); *resultKey = doc.get_data().c_str(); #elif defined USELUCENE for (unsigned long i = 0; i < (unsigned long)h->length(); i++) { Document &doc = h->doc(i); // set a temporary verse key to this module position *resultKey = wcharToUTF8(doc.get(_T("key"))); //TODO Does a key always accept utf8? __u64 score = (__u64)((__u32)(h->score(i)*100)); #endif // check to see if it sets ok (within our bounds) and if not, skip if (checkBounds) { *getKey() = *resultKey; if (*getKey() != *resultKey) { continue; } } listKey << *resultKey; listKey.getElement()->userData = score; } (*percent)(98, percentUserData); } SWCATCH (...) { #if defined USEXAPIAN #elif defined USELUCENE q = 0; #endif // invalid clucene query } #if defined USEXAPIAN #elif defined USELUCENE delete h; delete q; delete is; if (ir) { ir->close(); } #endif } #endif // some pre-loop processing switch (searchType) { // phrase case -1: // let's see if we're told to ignore case. If so, then we'll touppstr our term if ((flags & REG_ICASE) == REG_ICASE) toupperstr(term); break; // multi-word case -2: case -5: // let's break the term down into our words vector while (1) { const char *word = term.stripPrefix(' '); if (!word) { words.push_back(term); break; } words.push_back(word); } if ((flags & REG_ICASE) == REG_ICASE) { for (unsigned int i = 0; i < words.size(); i++) { toupperstr(words[i]); } } break; // entry attributes case -3: // let's break the attribute segs down. We'll reuse our words vector for each segment while (1) { const char *word = term.stripPrefix('/'); if (!word) { words.push_back(term); break; } words.push_back(word); } if ((words.size()>2) && words[2].endsWith(".")) { includeComponents = true; words[2]--; } break; } // our main loop to iterate the module and find the stuff perc = 5; (*percent)(perc, percentUserData); while ((searchType != -4) && !popError() && !terminateSearch) { long mindex = key->getIndex(); float per = (float)mindex / highIndex; per *= 93; per += 5; char newperc = (char)per; if (newperc > perc) { perc = newperc; (*percent)(perc, percentUserData); } else if (newperc < perc) { #ifndef _MSC_VER std::cerr << "Serious error: new percentage complete is less than previous value\n"; std::cerr << "index: " << (key->getIndex()) << "\n"; std::cerr << "highIndex: " << highIndex << "\n"; std::cerr << "newperc ==" << (int)newperc << "%" << "is smaller than\n"; std::cerr << "perc == " << (int )perc << "% \n"; #endif } if (searchType >= 0) { SWBuf textBuf = stripText(); #ifdef USECXX11REGEX if (std::regex_match(std::string(textBuf.c_str()), preg)) { #elif defined(USEICUREGEX) UnicodeString stringToTest = textBuf.c_str(); matcher->reset(stringToTest); if (matcher->find()) { #else if (!regexec(&preg, textBuf, 0, 0, 0)) { #endif *resultKey = *getKey(); resultKey->clearBound(); listKey << *resultKey; lastBuf = ""; } #ifdef USECXX11REGEX else if (std::regex_match(std::string((lastBuf + ' ' + textBuf).c_str()), preg)) { #elif defined(USEICUREGEX) else { stringToTest = (lastBuf + ' ' + textBuf).c_str(); matcher->reset(stringToTest); if (matcher->find()) { #else else if (!regexec(&preg, lastBuf + ' ' + textBuf, 0, 0, 0)) { #endif lastKey->clearBound(); listKey << *lastKey; lastBuf = textBuf; } else { lastBuf = textBuf; } #if defined(USEICUREGEX) } #endif } // phrase else { SWBuf textBuf; switch (searchType) { // phrase case -1: textBuf = stripText(); if ((flags & REG_ICASE) == REG_ICASE) toupperstr(textBuf); sres = strstr(textBuf.c_str(), term.c_str()); if (sres) { //it's also in the stripText(), so we have a valid search result item now *resultKey = *getKey(); resultKey->clearBound(); listKey << *resultKey; } break; // multiword case -2: { // enclose our allocations int loopCount = 0; unsigned int foundWords = 0; do { textBuf = ((loopCount == 0)&&(!specialStrips)) ? getRawEntry() : stripText(); foundWords = 0; for (unsigned int i = 0; i < words.size(); i++) { if ((flags & REG_ICASE) == REG_ICASE) toupperstr(textBuf); sres = strstr(textBuf.c_str(), words[i].c_str()); if (!sres) { break; //for loop } foundWords++; } loopCount++; } while ( (loopCount < 2) && (foundWords == words.size())); if ((loopCount == 2) && (foundWords == words.size())) { //we found the right words in both raw and stripped text, which means it's a valid result item *resultKey = *getKey(); resultKey->clearBound(); listKey << *resultKey; } } break; // entry attributes case -3: { renderText(); // force parse AttributeTypeList &entryAttribs = getEntryAttributes(); AttributeTypeList::iterator i1Start, i1End; AttributeList::iterator i2Start, i2End; AttributeValue::iterator i3Start, i3End; if ((words.size()) && (words[0].length())) { // cout << "Word: " << words[0] << endl; for (i1Start = entryAttribs.begin(); i1Start != entryAttribs.end(); ++i1Start) { // cout << "stuff: " << i1Start->first.c_str() << endl; } i1Start = entryAttribs.find(words[0]); i1End = i1Start; if (i1End != entryAttribs.end()) { i1End++; } } else { i1Start = entryAttribs.begin(); i1End = entryAttribs.end(); } for (;i1Start != i1End; i1Start++) { if ((words.size()>1) && (words[1].length())) { i2Start = i1Start->second.find(words[1]); i2End = i2Start; if (i2End != i1Start->second.end()) i2End++; } else { i2Start = i1Start->second.begin(); i2End = i1Start->second.end(); } for (;i2Start != i2End; i2Start++) { if ((words.size()>2) && (words[2].length()) && (!includeComponents)) { i3Start = i2Start->second.find(words[2]); i3End = i3Start; if (i3End != i2Start->second.end()) i3End++; } else { i3Start = i2Start->second.begin(); i3End = i2Start->second.end(); } for (;i3Start != i3End; i3Start++) { if ((words.size()>3) && (words[3].length())) { if (includeComponents) { SWBuf key = i3Start->first.c_str(); key = key.stripPrefix('.', true); // we're iterating all 3 level keys, so be sure we match our // prefix (e.g., Lemma, Lemma.1, Lemma.2, etc.) if (key != words[2]) continue; } if (flags & SEARCHFLAG_MATCHWHOLEENTRY) { bool found = !(((flags & REG_ICASE) == REG_ICASE) ? sword::stricmp(i3Start->second.c_str(), words[3]) : strcmp(i3Start->second.c_str(), words[3])); sres = (found) ? i3Start->second.c_str() : 0; } else { sres = ((flags & REG_ICASE) == REG_ICASE) ? stristr(i3Start->second.c_str(), words[3]) : strstr(i3Start->second.c_str(), words[3]); } if (sres) { *resultKey = *getKey(); resultKey->clearBound(); listKey << *resultKey; break; } } } if (i3Start != i3End) break; } if (i2Start != i2End) break; } break; } // NOT DONE case -5: AttributeList &words = getEntryAttributes()["Word"]; SWBuf kjvWord = ""; SWBuf bibWord = ""; for (AttributeList::iterator it = words.begin(); it != words.end(); it++) { int parts = atoi(it->second["PartCount"]); SWBuf lemma = ""; SWBuf morph = ""; for (int i = 1; i <= parts; i++) { SWBuf key = ""; key = (parts == 1) ? "Lemma" : SWBuf().setFormatted("Lemma.%d", i).c_str(); AttributeValue::iterator li = it->second.find(key); if (li != it->second.end()) { if (i > 1) lemma += " "; key = (parts == 1) ? "LemmaClass" : SWBuf().setFormatted("LemmaClass.%d", i).c_str(); AttributeValue::iterator lci = it->second.find(key); if (lci != it->second.end()) { lemma += lci->second + ":"; } lemma += li->second; } key = (parts == 1) ? "Morph" : SWBuf().setFormatted("Morph.%d", i).c_str(); li = it->second.find(key); // silly. sometimes morph counts don't equal lemma counts if (i == 1 && parts != 1 && li == it->second.end()) { li = it->second.find("Morph"); } if (li != it->second.end()) { if (i > 1) morph += " "; key = (parts == 1) ? "MorphClass" : SWBuf().setFormatted("MorphClass.%d", i).c_str(); AttributeValue::iterator lci = it->second.find(key); // silly. sometimes morph counts don't equal lemma counts if (i == 1 && parts != 1 && lci == it->second.end()) { lci = it->second.find("MorphClass"); } if (lci != it->second.end()) { morph += lci->second + ":"; } morph += li->second; } // TODO: add src tags and maybe other attributes } while (window.size() < (unsigned)flags) { } } break; } // end switch } *lastKey = *getKey(); (*this)++; } // cleaup work if (searchType >= 0) { #ifdef USECXX11REGEX std::locale::global(oldLocale); #elif defined(USEICUREGEX) delete matcher; #else regfree(&preg); #endif } setKey(*saveKey); if (!saveKey->isPersist()) delete saveKey; if (searchKey) delete searchKey; delete resultKey; delete lastKey; listKey = TOP; setProcessEntryAttributes(savePEA); (*percent)(100, percentUserData); return listKey; } /****************************************************************************** * SWModule::stripText() - calls all stripfilters on current text * * ENT: buf - buf to massage instead of this modules current text * len - max len of buf * * RET: this module's text at current key location massaged by Strip filters */ const char *SWModule::stripText(const char *buf, int len) { static SWBuf local; local = renderText(buf, len, false); return local.c_str(); } /** SWModule::getRenderHeader() - Produces any header data which might be * useful which associated with the processing done with this filter. * A typical example is a suggested CSS style block for classed * containers. */ const char *SWModule::getRenderHeader() const { FilterList::const_iterator first = getRenderFilters().begin(); if (first != getRenderFilters().end()) { return (*first)->getHeader(); } return ""; } /****************************************************************************** * SWModule::renderText - calls all renderfilters on current module * position * * RET: this module's text at current key location massaged by renderText filters */ SWBuf SWModule::renderText() { return renderText((const char *)0); } /****************************************************************************** * SWModule::renderText - calls all renderfilters on provided text * or current module position provided text null * * ENT: buf - buffer to render * * RET: this module's text at current key location massaged by renderText filters * * NOTES: This method is only truly const if called with a provided text; using * module's current position may produce a new entry attributes map which * logically violates the const semantic, which is why the above method * which takes no params is not const, i.e., don't call this method with * null as text param, but instead use non-const method above. The public * interface for this method expects a value for the text param. We use it * internally sometimes calling with null to save duplication of code. */ SWBuf SWModule::renderText(const char *buf, int len, bool render) const { bool savePEA = isProcessEntryAttributes(); if (!buf) { entryAttributes.clear(); } else { setProcessEntryAttributes(false); } SWBuf local; if (buf) local = buf; SWBuf &tmpbuf = (buf) ? local : getRawEntryBuf(); SWKey *key = 0; static const char *null = ""; if (tmpbuf) { unsigned long size = (len < 0) ? ((getEntrySize()<0) ? strlen(tmpbuf) : getEntrySize()) : len; if (size > 0) { key = this->getKey(); optionFilter(tmpbuf, key); if (render) { renderFilter(tmpbuf, key); encodingFilter(tmpbuf, key); } else stripFilter(tmpbuf, key); } } else { tmpbuf = null; } setProcessEntryAttributes(savePEA); return tmpbuf; } /****************************************************************************** * SWModule::renderText - calls all renderfilters on current text * * ENT: tmpKey - key to use to grab text * * RET: this module's text at current key location massaged by RenderFilers */ SWBuf SWModule::renderText(const SWKey *tmpKey) { SWKey *saveKey; const char *retVal; if (!key->isPersist()) { saveKey = createKey(); *saveKey = *key; } else saveKey = key; setKey(*tmpKey); retVal = renderText(); setKey(*saveKey); if (!saveKey->isPersist()) delete saveKey; return retVal; } /****************************************************************************** * SWModule::stripText - calls all StripTextFilters on current text * * ENT: tmpKey - key to use to grab text * * RET: this module's text at specified key location massaged by Strip filters */ const char *SWModule::stripText(const SWKey *tmpKey) { SWKey *saveKey; const char *retVal; if (!key->isPersist()) { saveKey = createKey(); *saveKey = *key; } else saveKey = key; setKey(*tmpKey); retVal = stripText(); setKey(*saveKey); if (!saveKey->isPersist()) delete saveKey; return retVal; } /****************************************************************************** * SWModule::getBibliography -Returns bibliographic data for a module in the * requested format * * ENT: bibFormat format of the bibliographic data * * RET: bibliographic data in the requested format as a string (BibTeX by default) */ SWBuf SWModule::getBibliography(unsigned char bibFormat) const { SWBuf s; switch (bibFormat) { case BIB_BIBTEX: s.append("@Book {").append(modname).append(", Title = \"").append(moddesc).append("\", Publisher = \"CrossWire Bible Society\"}"); break; } return s; } const char *SWModule::getConfigEntry(const char *key) const { ConfigEntMap::iterator it = config->find(key); return (it != config->end()) ? it->second.c_str() : 0; } void SWModule::setConfig(ConfigEntMap *config) { this->config = config; } bool SWModule::hasSearchFramework() { #ifdef USELUCENE return true; #else return SWSearchable::hasSearchFramework(); #endif } void SWModule::deleteSearchFramework() { #ifdef USELUCENE SWBuf target = getConfigEntry("AbsoluteDataPath"); if (!target.endsWith("/") && !target.endsWith("\\")) { target.append('/'); } target.append("lucene"); FileMgr::removeDir(target.c_str()); #else SWSearchable::deleteSearchFramework(); #endif } signed char SWModule::createSearchFramework(void (*percent)(char, void *), void *percentUserData) { #if defined USELUCENE || defined USEXAPIAN SWBuf target = getConfigEntry("AbsoluteDataPath"); if (!target.endsWith("/") && !target.endsWith("\\")) { target.append('/'); } #if defined USEXAPIAN target.append("xapian"); #elif defined USELUCENE const int MAX_CONV_SIZE = 1024 * 1024; target.append("lucene"); #endif int status = FileMgr::createParent(target+"/dummy"); if (status) return -1; SWKey *saveKey = 0; SWKey *searchKey = 0; SWKey textkey; SWBuf c; // turn all filters to default values StringList filterSettings; for (OptionFilterList::iterator filter = optionFilters->begin(); filter != optionFilters->end(); filter++) { filterSettings.push_back((*filter)->getOptionValue()); (*filter)->setOptionValue(*((*filter)->getOptionValues().begin())); if ( (!strcmp("Greek Accents", (*filter)->getOptionName())) || (!strcmp("Hebrew Vowel Points", (*filter)->getOptionName())) || (!strcmp("Arabic Vowel Points", (*filter)->getOptionName())) ) { (*filter)->setOptionValue("Off"); } } // be sure we give CLucene enough file handles FileMgr::getSystemFileMgr()->flush(); // save key information so as not to disrupt original // module position if (!key->isPersist()) { saveKey = createKey(); *saveKey = *key; } else saveKey = key; searchKey = (key->isPersist())?key->clone():0; if (searchKey) { searchKey->setPersist(1); setKey(*searchKey); } bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch"); // lets create or open our search index #if defined USEXAPIAN Xapian::WritableDatabase database(target.c_str(), Xapian::DB_CREATE_OR_OPEN); Xapian::TermGenerator termGenerator; SWTRY { termGenerator.set_stemmer(Xapian::Stem(getLanguage())); } SWCATCH(...) {} #elif defined USELUCENE RAMDirectory *ramDir = 0; IndexWriter *coreWriter = 0; IndexWriter *fsWriter = 0; Directory *d = 0; const TCHAR *stopWords[] = { 0 }; standard::StandardAnalyzer *an = new standard::StandardAnalyzer(stopWords); ramDir = new RAMDirectory(); coreWriter = new IndexWriter(ramDir, an, true); coreWriter->setMaxFieldLength(MAX_CONV_SIZE); #endif char perc = 1; VerseKey *vkcheck = 0; vkcheck = SWDYNAMIC_CAST(VerseKey, key); VerseKey *chapMax = 0; if (vkcheck) chapMax = (VerseKey *)vkcheck->clone(); TreeKeyIdx *tkcheck = 0; tkcheck = SWDYNAMIC_CAST(TreeKeyIdx, key); *this = BOTTOM; long highIndex = key->getIndex(); if (!highIndex) highIndex = 1; // avoid division by zero errors. bool savePEA = isProcessEntryAttributes(); setProcessEntryAttributes(true); // prox chapter blocks // position module at the beginning *this = TOP; SWBuf proxBuf; SWBuf proxLem; SWBuf proxMorph; SWBuf strong; SWBuf morph; char err = popError(); while (!err) { long mindex = key->getIndex(); proxBuf = ""; proxLem = ""; proxMorph = ""; // computer percent complete so we can report to our progress callback float per = (float)mindex / highIndex; // between 5%-98% per *= 93; per += 5; char newperc = (char)per; if (newperc > perc) { perc = newperc; (*percent)(perc, percentUserData); } // get "content" field const char *content = stripText(); bool good = false; // start out entry #if defined USEXAPIAN Xapian::Document doc; termGenerator.set_document(doc); #elif defined USELUCENE Document *doc = new Document(); #endif // get "key" field SWBuf keyText = (vkcheck) ? vkcheck->getOSISRef() : getKeyText(); if (content && *content) { good = true; // build "strong" field AttributeTypeList::iterator words; AttributeList::iterator word; AttributeValue::iterator strongVal; AttributeValue::iterator morphVal; strong=""; morph=""; words = getEntryAttributes().find("Word"); if (words != getEntryAttributes().end()) { for (word = words->second.begin();word != words->second.end(); word++) { int partCount = atoi(word->second["PartCount"]); if (!partCount) partCount = 1; for (int i = 0; i < partCount; i++) { SWBuf tmp = "Lemma"; if (partCount > 1) tmp.appendFormatted(".%d", i+1); strongVal = word->second.find(tmp); if (strongVal != word->second.end()) { // cheeze. skip empty article tags that weren't assigned to any text if (strongVal->second == "G3588") { if (word->second.find("Text") == word->second.end()) continue; // no text? let's skip } strong.append(strongVal->second); morph.append(strongVal->second); morph.append('@'); SWBuf tmp = "Morph"; if (partCount > 1) tmp.appendFormatted(".%d", i+1); morphVal = word->second.find(tmp); if (morphVal != word->second.end()) { morph.append(morphVal->second); } strong.append(' '); morph.append(' '); } } } } #if defined USEXAPIAN doc.set_data(keyText.c_str()); #elif defined USELUCENE doc->add(*_CLNEW Field(_T("key"), (wchar_t *)utf8ToWChar(keyText).getRawData(), Field::STORE_YES | Field::INDEX_UNTOKENIZED)); #endif if (includeKeyInSearch) { c = keyText; c += " "; c += content; content = c.c_str(); } #if defined USEXAPIAN termGenerator.index_text(content); termGenerator.index_text(content, 1, "C"); #elif defined USELUCENE doc->add(*_CLNEW Field(_T("content"), (wchar_t *)utf8ToWChar(content).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); #endif if (strong.length() > 0) { #if defined USEXAPIAN termGenerator.index_text(strong.c_str(), 1, "L"); termGenerator.index_text(morph.c_str(), 1, "M"); #elif defined USELUCENE doc->add(*_CLNEW Field(_T("lemma"), (wchar_t *)utf8ToWChar(strong).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); doc->add(*_CLNEW Field(_T("morph"), (wchar_t *)utf8ToWChar(morph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); #endif //printf("setting fields (%s).\ncontent: %s\nlemma: %s\n", (const char *)*key, content, strong.c_str()); } //printf("setting fields (%s).\n", (const char *)*key); //fflush(stdout); } // don't write yet, cuz we have to see if we're the first of a prox block (5:1 or chapter5/verse1 // for VerseKeys use chapter if (vkcheck) { *chapMax = *vkcheck; // we're the first verse in a chapter if (vkcheck->getVerse() == 1) { *chapMax = MAXVERSE; VerseKey saveKey = *vkcheck; while ((!err) && (*vkcheck <= *chapMax)) { //printf("building proxBuf from (%s).\nproxBuf.c_str(): %s\n", (const char *)*key, proxBuf.c_str()); //printf("building proxBuf from (%s).\n", (const char *)*key); content = stripText(); if (content && *content) { // build "strong" field strong = ""; morph = ""; AttributeTypeList::iterator words; AttributeList::iterator word; AttributeValue::iterator strongVal; AttributeValue::iterator morphVal; words = getEntryAttributes().find("Word"); if (words != getEntryAttributes().end()) { for (word = words->second.begin();word != words->second.end(); word++) { int partCount = atoi(word->second["PartCount"]); if (!partCount) partCount = 1; for (int i = 0; i < partCount; i++) { SWBuf tmp = "Lemma"; if (partCount > 1) tmp.appendFormatted(".%d", i+1); strongVal = word->second.find(tmp); if (strongVal != word->second.end()) { // cheeze. skip empty article tags that weren't assigned to any text if (strongVal->second == "G3588") { if (word->second.find("Text") == word->second.end()) continue; // no text? let's skip } strong.append(strongVal->second); morph.append(strongVal->second); morph.append('@'); SWBuf tmp = "Morph"; if (partCount > 1) tmp.appendFormatted(".%d", i+1); morphVal = word->second.find(tmp); if (morphVal != word->second.end()) { morph.append(morphVal->second); } strong.append(' '); morph.append(' '); } } } } proxBuf += content; proxBuf.append(' '); proxLem += strong; proxMorph += morph; if (proxLem.length()) { proxLem.append("\n"); proxMorph.append("\n"); } } (*this)++; err = popError(); } err = 0; *vkcheck = saveKey; } } // for TreeKeys use siblings if we have no children else if (tkcheck) { if (!tkcheck->hasChildren()) { if (!tkcheck->previousSibling()) { do { //printf("building proxBuf from (%s).\n", (const char *)*key); //fflush(stdout); content = stripText(); if (content && *content) { // build "strong" field strong = ""; morph = ""; AttributeTypeList::iterator words; AttributeList::iterator word; AttributeValue::iterator strongVal; AttributeValue::iterator morphVal; words = getEntryAttributes().find("Word"); if (words != getEntryAttributes().end()) { for (word = words->second.begin();word != words->second.end(); word++) { int partCount = atoi(word->second["PartCount"]); if (!partCount) partCount = 1; for (int i = 0; i < partCount; i++) { SWBuf tmp = "Lemma"; if (partCount > 1) tmp.appendFormatted(".%d", i+1); strongVal = word->second.find(tmp); if (strongVal != word->second.end()) { // cheeze. skip empty article tags that weren't assigned to any text if (strongVal->second == "G3588") { if (word->second.find("Text") == word->second.end()) continue; // no text? let's skip } strong.append(strongVal->second); morph.append(strongVal->second); morph.append('@'); SWBuf tmp = "Morph"; if (partCount > 1) tmp.appendFormatted(".%d", i+1); morphVal = word->second.find(tmp); if (morphVal != word->second.end()) { morph.append(morphVal->second); } strong.append(' '); morph.append(' '); } } } } proxBuf += content; proxBuf.append(' '); proxLem += strong; proxMorph += morph; if (proxLem.length()) { proxLem.append("\n"); proxMorph.append("\n"); } } } while (tkcheck->nextSibling()); tkcheck->parent(); tkcheck->firstChild(); } else tkcheck->nextSibling(); // reposition from our previousSibling test } } if (proxBuf.length() > 0) { #if defined USEXAPIAN termGenerator.index_text(proxBuf.c_str(), 1, "P"); #elif defined USELUCENE doc->add(*_CLNEW Field(_T("prox"), (wchar_t *)utf8ToWChar(proxBuf).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); #endif good = true; } if (proxLem.length() > 0) { #if defined USEXAPIAN termGenerator.index_text(proxLem.c_str(), 1, "PL"); termGenerator.index_text(proxMorph.c_str(), 1, "PM"); #elif defined USELUCENE doc->add(*_CLNEW Field(_T("proxlem"), (wchar_t *)utf8ToWChar(proxLem).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) ); doc->add(*_CLNEW Field(_T("proxmorph"), (wchar_t *)utf8ToWChar(proxMorph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) ); #endif good = true; } if (good) { //printf("writing (%s).\n", (const char *)*key); //fflush(stdout); #if defined USEXAPIAN SWBuf idTerm; idTerm.setFormatted("Q%ld", key->getIndex()); doc.add_boolean_term(idTerm.c_str()); database.replace_document(idTerm.c_str(), doc); #elif defined USELUCENE coreWriter->addDocument(doc); #endif } #if defined USEXAPIAN #elif defined USELUCENE delete doc; #endif (*this)++; err = popError(); } // Optimizing automatically happens with the call to addIndexes //coreWriter->optimize(); #if defined USEXAPIAN #elif defined USELUCENE coreWriter->close(); #ifdef CLUCENE2 d = FSDirectory::getDirectory(target.c_str()); #endif if (IndexReader::indexExists(target.c_str())) { #ifndef CLUCENE2 d = FSDirectory::getDirectory(target.c_str(), false); #endif if (IndexReader::isLocked(d)) { IndexReader::unlock(d); } fsWriter = new IndexWriter( d, an, false); } else { #ifndef CLUCENE2 d = FSDirectory::getDirectory(target.c_str(), true); #endif fsWriter = new IndexWriter(d, an, true); } Directory *dirs[] = { ramDir, 0 }; #ifdef CLUCENE2 lucene::util::ConstValueArray< lucene::store::Directory *>dirsa(dirs, 1); fsWriter->addIndexes(dirsa); #else fsWriter->addIndexes(dirs); #endif fsWriter->close(); delete ramDir; delete coreWriter; delete fsWriter; delete an; #endif // reposition module back to where it was before we were called setKey(*saveKey); if (!saveKey->isPersist()) delete saveKey; if (searchKey) delete searchKey; delete chapMax; setProcessEntryAttributes(savePEA); // reset option filters back to original values StringList::iterator origVal = filterSettings.begin(); for (OptionFilterList::iterator filter = optionFilters->begin(); filter != optionFilters->end(); filter++) { (*filter)->setOptionValue(*origVal++); } return 0; #else return SWSearchable::createSearchFramework(percent, percentUserData); #endif } /** OptionFilterBuffer a text buffer * @param filters the FilterList of filters to iterate * @param buf the buffer to filter * @param key key location from where this buffer was extracted */ void SWModule::filterBuffer(OptionFilterList *filters, SWBuf &buf, const SWKey *key) const { OptionFilterList::iterator it; for (it = filters->begin(); it != filters->end(); it++) { (*it)->processText(buf, key, this); } } /** FilterBuffer a text buffer * @param filters the FilterList of filters to iterate * @param buf the buffer to filter * @param key key location from where this buffer was extracted */ void SWModule::filterBuffer(FilterList *filters, SWBuf &buf, const SWKey *key) const { FilterList::iterator it; for (it = filters->begin(); it != filters->end(); it++) { (*it)->processText(buf, key, this); } } signed char SWModule::createModule(const char*) { return -1; } void SWModule::setEntry(const char*, long) { } void SWModule::linkEntry(const SWKey*) { } /****************************************************************************** * SWModule::prepText - Prepares the text before returning it to external * objects * * ENT: buf - buffer where text is stored and where to store the prep'd * text. */ void SWModule::prepText(SWBuf &buf) { unsigned int to, from; char space = 0, cr = 0, realdata = 0, nlcnt = 0; char *rawBuf = buf.getRawData(); for (to = from = 0; rawBuf[from]; from++) { switch (rawBuf[from]) { case 10: if (!realdata) continue; space = (cr) ? 0 : 1; cr = 0; nlcnt++; if (nlcnt > 1) { // *to++ = nl; rawBuf[to++] = 10; // *to++ = nl[1]; // nlcnt = 0; } continue; case 13: if (!realdata) continue; // *to++ = nl[0]; rawBuf[to++] = 10; space = 0; cr = 1; continue; } realdata = 1; nlcnt = 0; if (space) { space = 0; if (rawBuf[from] != ' ') { rawBuf[to++] = ' '; from--; continue; } } rawBuf[to++] = rawBuf[from]; } buf.setSize(to); while (to > 1) { // remove trailing excess to--; if ((rawBuf[to] == 10) || (rawBuf[to] == ' ')) buf.setSize(to); else break; } } SWORD_NAMESPACE_END