diff options
Diffstat (limited to 'src/modules/swmodule.cpp')
-rw-r--r-- | src/modules/swmodule.cpp | 218 |
1 files changed, 192 insertions, 26 deletions
diff --git a/src/modules/swmodule.cpp b/src/modules/swmodule.cpp index a2d7873..e6ceeda 100644 --- a/src/modules/swmodule.cpp +++ b/src/modules/swmodule.cpp @@ -4,7 +4,7 @@ * for all types of modules (e.g. texts, commentaries, * maps, lexicons, etc.) * - * $Id: swmodule.cpp 3282 2014-12-03 06:09:06Z greg.hellings $ + * $Id: swmodule.cpp 3515 2017-11-01 11:38:09Z scribe $ * * Copyright 1999-2013 CrossWire Bible Society (http://www.crosswire.org) * CrossWire Bible Society @@ -39,16 +39,23 @@ #include <iostream> #endif -#ifdef USECXX11REGEX +#if defined(USECXX11REGEX) #include <regex> #ifndef REG_ICASE #define REG_ICASE std::regex::icase #endif +#elif defined(USEICUREGEX) +#include <unicode/regex.h> +#ifndef REG_ICASE +#define REG_ICASE UREGEX_CASE_INSENSITIVE +#endif #else #include <regex.h> // GNU #endif -#ifdef USELUCENE +#if defined USEXAPIAN +#include <xapian.h> +#elif defined USELUCENE #include <CLucene.h> //Lucence includes @@ -171,6 +178,7 @@ char SWModule::popError() char retval = error; error = 0; + if (!retval) retval = key->popError(); return retval; } @@ -295,7 +303,7 @@ char SWModule::setKey(const SWKey *ikey) { if (oldKey) delete oldKey; - return error = key->popError(); + return error = key->getError(); } @@ -313,13 +321,13 @@ void SWModule::setPosition(SW_POSITION p) { switch (p) { case POS_TOP: - (*this)++; - (*this)--; + this->increment(); + this->decrement(); break; case POS_BOTTOM: - (*this)--; - (*this)++; + this->decrement(); + this->increment(); break; } @@ -360,7 +368,7 @@ void SWModule::decrement(int steps) { * * ENT: istr - string for which to search * searchType - type of search to perform - * >=0 - regex + * >=0 - regex; (for backward compat, if > 0 then used as additional REGEX FLAGS) * -1 - phrase * -2 - multiword * -3 - entryAttrib (eg. Word//Lemma./G1234/) (Lemma with dot means check components (Lemma.[1-9]) also) @@ -379,16 +387,22 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc SWBuf term = istr; bool includeComponents = false; // for entryAttrib e.g., /Lemma.1/ -#ifdef USELUCENE SWBuf target = getConfigEntry("AbsoluteDataPath"); if (!target.endsWith("/") && !target.endsWith("\\")) { target.append('/'); } +#if defined USEXAPIAN + target.append("xapian"); +#elif defined USELUCENE target.append("lucene"); #endif if (justCheckIfSupported) { *justCheckIfSupported = (searchType >= -3); -#ifdef USELUCENE +#if defined USEXAPIAN + if ((searchType == -4) && (FileMgr::existsDir(target))) { + *justCheckIfSupported = true; + } +#elif defined USELUCENE if ((searchType == -4) && (IndexReader::indexExists(target.c_str()))) { *justCheckIfSupported = true; } @@ -407,6 +421,8 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc std::locale::global(std::locale("en_US.UTF-8")); std::regex preg; +#elif defined(USEICUREGEX) + RegexMatcher *matcher = 0; #else regex_t preg; #endif @@ -449,18 +465,48 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc *this = TOP; if (searchType >= 0) { #ifdef USECXX11REGEX - preg = std::regex((SWBuf(".*")+istr+".*").c_str(), std::regex_constants::extended & flags); + preg = std::regex((SWBuf(".*")+istr+".*").c_str(), std::regex_constants::extended | searchType | flags); +#elif defined(USEICUREGEX) + UErrorCode status = U_ZERO_ERROR; + matcher = new RegexMatcher(istr, searchType | flags, status); + if (U_FAILURE(status)) { + SWLog::getSystemLog()->logError("Error compiling Regex: %d", status); + return listKey; + } + #else flags |=searchType|REG_NOSUB|REG_EXTENDED; - regcomp(&preg, istr, flags); + int err = regcomp(&preg, istr, flags); + if (err) { + SWLog::getSystemLog()->logError("Error compiling Regex: %d", err); + return listKey; + } #endif } (*percent)(++perc, percentUserData); -#ifdef USELUCENE - if (searchType == -4) { // lucene +#if defined USEXAPIAN || defined USELUCENE + (*percent)(10, percentUserData); + if (searchType == -4) { // indexed search +#if defined USEXAPIAN + SWTRY { + Xapian::Database database(target.c_str()); + Xapian::QueryParser queryParser; + queryParser.set_default_op(Xapian::Query::OP_AND); + SWTRY { + queryParser.set_stemmer(Xapian::Stem(getLanguage())); + } SWCATCH(...) {} + queryParser.set_stemming_strategy(queryParser.STEM_SOME); + queryParser.add_prefix("content", "C"); + queryParser.add_prefix("lemma", "L"); + queryParser.add_prefix("morph", "M"); + queryParser.add_prefix("prox", "P"); + queryParser.add_prefix("proxlem", "PL"); + queryParser.add_prefix("proxmorph", "PM"); + +#elif defined USELUCENE lucene::index::IndexReader *ir = 0; lucene::search::IndexSearcher *is = 0; @@ -469,22 +515,44 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc SWTRY { ir = IndexReader::open(target); is = new IndexSearcher(ir); - (*percent)(10, percentUserData); - const TCHAR *stopWords[] = { 0 }; standard::StandardAnalyzer analyzer(stopWords); +#endif + + // parse the query +#if defined USEXAPIAN + Xapian::Query q = queryParser.parse_query(istr); + Xapian::Enquire enquire = Xapian::Enquire(database); +#elif defined USELUCENE q = QueryParser::parse((wchar_t *)utf8ToWChar(istr).getRawData(), _T("content"), &analyzer); +#endif (*percent)(20, percentUserData); + + // perform the search +#if defined USEXAPIAN + enquire.set_query(q); + Xapian::MSet h = enquire.get_mset(0, 99999); +#elif defined USELUCENE h = is->search(q); +#endif (*percent)(80, percentUserData); // iterate thru each good module position that meets the search bool checkBounds = getKey()->isBoundSet(); +#if defined USEXAPIAN + Xapian::MSetIterator i; + for (i = h.begin(); i != h.end(); ++i) { +// cout << "Document ID " << *i << "\t"; + __u64 score = i.get_percent(); + Xapian::Document doc = i.get_document(); + *resultKey = doc.get_data().c_str(); +#elif defined USELUCENE for (unsigned long i = 0; i < (unsigned long)h->length(); i++) { Document &doc = h->doc(i); - // set a temporary verse key to this module position *resultKey = wcharToUTF8(doc.get(_T("key"))); //TODO Does a key always accept utf8? + __u64 score = (__u64)((__u32)(h->score(i)*100)); +#endif // check to see if it sets ok (within our bounds) and if not, skip if (checkBounds) { @@ -494,14 +562,19 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc } } listKey << *resultKey; - listKey.getElement()->userData = (__u64)((__u32)(h->score(i)*100)); + listKey.getElement()->userData = score; } (*percent)(98, percentUserData); } SWCATCH (...) { +#if defined USEXAPIAN +#elif defined USELUCENE q = 0; +#endif // invalid clucene query } +#if defined USEXAPIAN +#elif defined USELUCENE delete h; delete q; @@ -509,6 +582,7 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc if (ir) { ir->close(); } +#endif } #endif @@ -587,6 +661,11 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc SWBuf textBuf = stripText(); #ifdef USECXX11REGEX if (std::regex_match(std::string(textBuf.c_str()), preg)) { +#elif defined(USEICUREGEX) + UnicodeString stringToTest = textBuf.c_str(); + matcher->reset(stringToTest); + + if (matcher->find()) { #else if (!regexec(&preg, textBuf, 0, 0, 0)) { #endif @@ -597,6 +676,12 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc } #ifdef USECXX11REGEX else if (std::regex_match(std::string((lastBuf + ' ' + textBuf).c_str()), preg)) { +#elif defined(USEICUREGEX) + else { + stringToTest = (lastBuf + ' ' + textBuf).c_str(); + matcher->reset(stringToTest); + + if (matcher->find()) { #else else if (!regexec(&preg, lastBuf + ' ' + textBuf, 0, 0, 0)) { #endif @@ -607,6 +692,9 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc else { lastBuf = textBuf; } +#if defined(USEICUREGEX) + } +#endif } // phrase @@ -731,6 +819,7 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc } break; } + // NOT DONE case -5: AttributeList &words = getEntryAttributes()["Word"]; SWBuf kjvWord = ""; @@ -789,6 +878,8 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc if (searchType >= 0) { #ifdef USECXX11REGEX std::locale::global(oldLocale); +#elif defined(USEICUREGEX) + delete matcher; #else regfree(&preg); #endif @@ -846,14 +937,33 @@ const char *SWModule::getRenderHeader() const { /****************************************************************************** - * SWModule::renderText - calls all renderfilters on current text + * SWModule::renderText - calls all renderfilters on current module + * position + * + * RET: this module's text at current key location massaged by renderText filters + */ +SWBuf SWModule::renderText() { + return renderText((const char *)0); +} + +/****************************************************************************** + * SWModule::renderText - calls all renderfilters on provided text + * or current module position provided text null * - * ENT: buf - buffer to Render instead of current module position + * ENT: buf - buffer to render * * RET: this module's text at current key location massaged by renderText filters + * + * NOTES: This method is only truly const if called with a provided text; using + * module's current position may produce a new entry attributes map which + * logically violates the const semantic, which is why the above method + * which takes no params is not const, i.e., don't call this method with + * null as text param, but instead use non-const method above. The public + * interface for this method expects a value for the text param. We use it + * internally sometimes calling with null to save duplication of code. */ - SWBuf SWModule::renderText(const char *buf, int len, bool render) { +SWBuf SWModule::renderText(const char *buf, int len, bool render) const { bool savePEA = isProcessEntryAttributes(); if (!buf) { entryAttributes.clear(); @@ -873,7 +983,7 @@ const char *SWModule::getRenderHeader() const { if (tmpbuf) { unsigned long size = (len < 0) ? ((getEntrySize()<0) ? strlen(tmpbuf) : getEntrySize()) : len; if (size > 0) { - key = (SWKey *)*this; + key = this->getKey(); optionFilter(tmpbuf, key); @@ -1010,12 +1120,17 @@ void SWModule::deleteSearchFramework() { signed char SWModule::createSearchFramework(void (*percent)(char, void *), void *percentUserData) { -#ifdef USELUCENE +#if defined USELUCENE || defined USEXAPIAN SWBuf target = getConfigEntry("AbsoluteDataPath"); if (!target.endsWith("/") && !target.endsWith("\\")) { target.append('/'); } +#if defined USEXAPIAN + target.append("xapian"); +#elif defined USELUCENE + const int MAX_CONV_SIZE = 1024 * 1024; target.append("lucene"); +#endif int status = FileMgr::createParent(target+"/dummy"); if (status) return -1; @@ -1024,7 +1139,6 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void SWKey textkey; SWBuf c; - const int MAX_CONV_SIZE = 1024 * 1024; // turn all filters to default values StringList filterSettings; @@ -1058,6 +1172,17 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void setKey(*searchKey); } + bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch"); + + // lets create or open our search index +#if defined USEXAPIAN + Xapian::WritableDatabase database(target.c_str(), Xapian::DB_CREATE_OR_OPEN); + Xapian::TermGenerator termGenerator; + SWTRY { + termGenerator.set_stemmer(Xapian::Stem(getLanguage())); + } SWCATCH(...) {} + +#elif defined USELUCENE RAMDirectory *ramDir = 0; IndexWriter *coreWriter = 0; IndexWriter *fsWriter = 0; @@ -1065,11 +1190,11 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void const TCHAR *stopWords[] = { 0 }; standard::StandardAnalyzer *an = new standard::StandardAnalyzer(stopWords); - bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch"); ramDir = new RAMDirectory(); coreWriter = new IndexWriter(ramDir, an, true); coreWriter->setMaxFieldLength(MAX_CONV_SIZE); +#endif @@ -1126,7 +1251,12 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void bool good = false; // start out entry +#if defined USEXAPIAN + Xapian::Document doc; + termGenerator.set_document(doc); +#elif defined USELUCENE Document *doc = new Document(); +#endif // get "key" field SWBuf keyText = (vkcheck) ? vkcheck->getOSISRef() : getKeyText(); if (content && *content) { @@ -1172,7 +1302,11 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void } } +#if defined USEXAPIAN + doc.set_data(keyText.c_str()); +#elif defined USELUCENE doc->add(*_CLNEW Field(_T("key"), (wchar_t *)utf8ToWChar(keyText).getRawData(), Field::STORE_YES | Field::INDEX_UNTOKENIZED)); +#endif if (includeKeyInSearch) { c = keyText; @@ -1181,11 +1315,21 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void content = c.c_str(); } +#if defined USEXAPIAN + termGenerator.index_text(content); + termGenerator.index_text(content, 1, "C"); +#elif defined USELUCENE doc->add(*_CLNEW Field(_T("content"), (wchar_t *)utf8ToWChar(content).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); +#endif if (strong.length() > 0) { +#if defined USEXAPIAN + termGenerator.index_text(strong.c_str(), 1, "L"); + termGenerator.index_text(morph.c_str(), 1, "M"); +#elif defined USELUCENE doc->add(*_CLNEW Field(_T("lemma"), (wchar_t *)utf8ToWChar(strong).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); doc->add(*_CLNEW Field(_T("morph"), (wchar_t *)utf8ToWChar(morph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); +#endif //printf("setting fields (%s).\ncontent: %s\nlemma: %s\n", (const char *)*key, content, strong.c_str()); } @@ -1330,20 +1474,39 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void if (proxBuf.length() > 0) { +#if defined USEXAPIAN + termGenerator.index_text(proxBuf.c_str(), 1, "P"); +#elif defined USELUCENE doc->add(*_CLNEW Field(_T("prox"), (wchar_t *)utf8ToWChar(proxBuf).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED)); +#endif good = true; } if (proxLem.length() > 0) { +#if defined USEXAPIAN + termGenerator.index_text(proxLem.c_str(), 1, "PL"); + termGenerator.index_text(proxMorph.c_str(), 1, "PM"); +#elif defined USELUCENE doc->add(*_CLNEW Field(_T("proxlem"), (wchar_t *)utf8ToWChar(proxLem).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) ); doc->add(*_CLNEW Field(_T("proxmorph"), (wchar_t *)utf8ToWChar(proxMorph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) ); +#endif good = true; } if (good) { //printf("writing (%s).\n", (const char *)*key); //fflush(stdout); +#if defined USEXAPIAN + SWBuf idTerm; + idTerm.setFormatted("Q%ld", key->getIndex()); + doc.add_boolean_term(idTerm.c_str()); + database.replace_document(idTerm.c_str(), doc); +#elif defined USELUCENE coreWriter->addDocument(doc); +#endif } +#if defined USEXAPIAN +#elif defined USELUCENE delete doc; +#endif (*this)++; err = popError(); @@ -1351,6 +1514,8 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void // Optimizing automatically happens with the call to addIndexes //coreWriter->optimize(); +#if defined USEXAPIAN +#elif defined USELUCENE coreWriter->close(); #ifdef CLUCENE2 @@ -1385,6 +1550,7 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void delete coreWriter; delete fsWriter; delete an; +#endif // reposition module back to where it was before we were called setKey(*saveKey); |