summaryrefslogtreecommitdiff
path: root/src/modules/swmodule.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/modules/swmodule.cpp')
-rw-r--r--src/modules/swmodule.cpp218
1 files changed, 192 insertions, 26 deletions
diff --git a/src/modules/swmodule.cpp b/src/modules/swmodule.cpp
index a2d7873..e6ceeda 100644
--- a/src/modules/swmodule.cpp
+++ b/src/modules/swmodule.cpp
@@ -4,7 +4,7 @@
* for all types of modules (e.g. texts, commentaries,
* maps, lexicons, etc.)
*
- * $Id: swmodule.cpp 3282 2014-12-03 06:09:06Z greg.hellings $
+ * $Id: swmodule.cpp 3515 2017-11-01 11:38:09Z scribe $
*
* Copyright 1999-2013 CrossWire Bible Society (http://www.crosswire.org)
* CrossWire Bible Society
@@ -39,16 +39,23 @@
#include <iostream>
#endif
-#ifdef USECXX11REGEX
+#if defined(USECXX11REGEX)
#include <regex>
#ifndef REG_ICASE
#define REG_ICASE std::regex::icase
#endif
+#elif defined(USEICUREGEX)
+#include <unicode/regex.h>
+#ifndef REG_ICASE
+#define REG_ICASE UREGEX_CASE_INSENSITIVE
+#endif
#else
#include <regex.h> // GNU
#endif
-#ifdef USELUCENE
+#if defined USEXAPIAN
+#include <xapian.h>
+#elif defined USELUCENE
#include <CLucene.h>
//Lucence includes
@@ -171,6 +178,7 @@ char SWModule::popError()
char retval = error;
error = 0;
+ if (!retval) retval = key->popError();
return retval;
}
@@ -295,7 +303,7 @@ char SWModule::setKey(const SWKey *ikey) {
if (oldKey)
delete oldKey;
- return error = key->popError();
+ return error = key->getError();
}
@@ -313,13 +321,13 @@ void SWModule::setPosition(SW_POSITION p) {
switch (p) {
case POS_TOP:
- (*this)++;
- (*this)--;
+ this->increment();
+ this->decrement();
break;
case POS_BOTTOM:
- (*this)--;
- (*this)++;
+ this->decrement();
+ this->increment();
break;
}
@@ -360,7 +368,7 @@ void SWModule::decrement(int steps) {
*
* ENT: istr - string for which to search
* searchType - type of search to perform
- * >=0 - regex
+ * >=0 - regex; (for backward compat, if > 0 then used as additional REGEX FLAGS)
* -1 - phrase
* -2 - multiword
* -3 - entryAttrib (eg. Word//Lemma./G1234/) (Lemma with dot means check components (Lemma.[1-9]) also)
@@ -379,16 +387,22 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
SWBuf term = istr;
bool includeComponents = false; // for entryAttrib e.g., /Lemma.1/
-#ifdef USELUCENE
SWBuf target = getConfigEntry("AbsoluteDataPath");
if (!target.endsWith("/") && !target.endsWith("\\")) {
target.append('/');
}
+#if defined USEXAPIAN
+ target.append("xapian");
+#elif defined USELUCENE
target.append("lucene");
#endif
if (justCheckIfSupported) {
*justCheckIfSupported = (searchType >= -3);
-#ifdef USELUCENE
+#if defined USEXAPIAN
+ if ((searchType == -4) && (FileMgr::existsDir(target))) {
+ *justCheckIfSupported = true;
+ }
+#elif defined USELUCENE
if ((searchType == -4) && (IndexReader::indexExists(target.c_str()))) {
*justCheckIfSupported = true;
}
@@ -407,6 +421,8 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
std::locale::global(std::locale("en_US.UTF-8"));
std::regex preg;
+#elif defined(USEICUREGEX)
+ RegexMatcher *matcher = 0;
#else
regex_t preg;
#endif
@@ -449,18 +465,48 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
*this = TOP;
if (searchType >= 0) {
#ifdef USECXX11REGEX
- preg = std::regex((SWBuf(".*")+istr+".*").c_str(), std::regex_constants::extended & flags);
+ preg = std::regex((SWBuf(".*")+istr+".*").c_str(), std::regex_constants::extended | searchType | flags);
+#elif defined(USEICUREGEX)
+ UErrorCode status = U_ZERO_ERROR;
+ matcher = new RegexMatcher(istr, searchType | flags, status);
+ if (U_FAILURE(status)) {
+ SWLog::getSystemLog()->logError("Error compiling Regex: %d", status);
+ return listKey;
+ }
+
#else
flags |=searchType|REG_NOSUB|REG_EXTENDED;
- regcomp(&preg, istr, flags);
+ int err = regcomp(&preg, istr, flags);
+ if (err) {
+ SWLog::getSystemLog()->logError("Error compiling Regex: %d", err);
+ return listKey;
+ }
#endif
}
(*percent)(++perc, percentUserData);
-#ifdef USELUCENE
- if (searchType == -4) { // lucene
+#if defined USEXAPIAN || defined USELUCENE
+ (*percent)(10, percentUserData);
+ if (searchType == -4) { // indexed search
+#if defined USEXAPIAN
+ SWTRY {
+ Xapian::Database database(target.c_str());
+ Xapian::QueryParser queryParser;
+ queryParser.set_default_op(Xapian::Query::OP_AND);
+ SWTRY {
+ queryParser.set_stemmer(Xapian::Stem(getLanguage()));
+ } SWCATCH(...) {}
+ queryParser.set_stemming_strategy(queryParser.STEM_SOME);
+ queryParser.add_prefix("content", "C");
+ queryParser.add_prefix("lemma", "L");
+ queryParser.add_prefix("morph", "M");
+ queryParser.add_prefix("prox", "P");
+ queryParser.add_prefix("proxlem", "PL");
+ queryParser.add_prefix("proxmorph", "PM");
+
+#elif defined USELUCENE
lucene::index::IndexReader *ir = 0;
lucene::search::IndexSearcher *is = 0;
@@ -469,22 +515,44 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
SWTRY {
ir = IndexReader::open(target);
is = new IndexSearcher(ir);
- (*percent)(10, percentUserData);
-
const TCHAR *stopWords[] = { 0 };
standard::StandardAnalyzer analyzer(stopWords);
+#endif
+
+ // parse the query
+#if defined USEXAPIAN
+ Xapian::Query q = queryParser.parse_query(istr);
+ Xapian::Enquire enquire = Xapian::Enquire(database);
+#elif defined USELUCENE
q = QueryParser::parse((wchar_t *)utf8ToWChar(istr).getRawData(), _T("content"), &analyzer);
+#endif
(*percent)(20, percentUserData);
+
+ // perform the search
+#if defined USEXAPIAN
+ enquire.set_query(q);
+ Xapian::MSet h = enquire.get_mset(0, 99999);
+#elif defined USELUCENE
h = is->search(q);
+#endif
(*percent)(80, percentUserData);
// iterate thru each good module position that meets the search
bool checkBounds = getKey()->isBoundSet();
+#if defined USEXAPIAN
+ Xapian::MSetIterator i;
+ for (i = h.begin(); i != h.end(); ++i) {
+// cout << "Document ID " << *i << "\t";
+ __u64 score = i.get_percent();
+ Xapian::Document doc = i.get_document();
+ *resultKey = doc.get_data().c_str();
+#elif defined USELUCENE
for (unsigned long i = 0; i < (unsigned long)h->length(); i++) {
Document &doc = h->doc(i);
-
// set a temporary verse key to this module position
*resultKey = wcharToUTF8(doc.get(_T("key"))); //TODO Does a key always accept utf8?
+ __u64 score = (__u64)((__u32)(h->score(i)*100));
+#endif
// check to see if it sets ok (within our bounds) and if not, skip
if (checkBounds) {
@@ -494,14 +562,19 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
}
}
listKey << *resultKey;
- listKey.getElement()->userData = (__u64)((__u32)(h->score(i)*100));
+ listKey.getElement()->userData = score;
}
(*percent)(98, percentUserData);
}
SWCATCH (...) {
+#if defined USEXAPIAN
+#elif defined USELUCENE
q = 0;
+#endif
// invalid clucene query
}
+#if defined USEXAPIAN
+#elif defined USELUCENE
delete h;
delete q;
@@ -509,6 +582,7 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
if (ir) {
ir->close();
}
+#endif
}
#endif
@@ -587,6 +661,11 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
SWBuf textBuf = stripText();
#ifdef USECXX11REGEX
if (std::regex_match(std::string(textBuf.c_str()), preg)) {
+#elif defined(USEICUREGEX)
+ UnicodeString stringToTest = textBuf.c_str();
+ matcher->reset(stringToTest);
+
+ if (matcher->find()) {
#else
if (!regexec(&preg, textBuf, 0, 0, 0)) {
#endif
@@ -597,6 +676,12 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
}
#ifdef USECXX11REGEX
else if (std::regex_match(std::string((lastBuf + ' ' + textBuf).c_str()), preg)) {
+#elif defined(USEICUREGEX)
+ else {
+ stringToTest = (lastBuf + ' ' + textBuf).c_str();
+ matcher->reset(stringToTest);
+
+ if (matcher->find()) {
#else
else if (!regexec(&preg, lastBuf + ' ' + textBuf, 0, 0, 0)) {
#endif
@@ -607,6 +692,9 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
else {
lastBuf = textBuf;
}
+#if defined(USEICUREGEX)
+ }
+#endif
}
// phrase
@@ -731,6 +819,7 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
}
break;
}
+ // NOT DONE
case -5:
AttributeList &words = getEntryAttributes()["Word"];
SWBuf kjvWord = "";
@@ -789,6 +878,8 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
if (searchType >= 0) {
#ifdef USECXX11REGEX
std::locale::global(oldLocale);
+#elif defined(USEICUREGEX)
+ delete matcher;
#else
regfree(&preg);
#endif
@@ -846,14 +937,33 @@ const char *SWModule::getRenderHeader() const {
/******************************************************************************
- * SWModule::renderText - calls all renderfilters on current text
+ * SWModule::renderText - calls all renderfilters on current module
+ * position
+ *
+ * RET: this module's text at current key location massaged by renderText filters
+ */
+SWBuf SWModule::renderText() {
+ return renderText((const char *)0);
+}
+
+/******************************************************************************
+ * SWModule::renderText - calls all renderfilters on provided text
+ * or on current module position if provided text is null
*
- * ENT: buf - buffer to Render instead of current module position
+ * ENT: buf - buffer to render
*
* RET: this module's text at current key location massaged by renderText filters
+ *
+ * NOTES: This method is only truly const when called with a provided text.
+ * Using the module's current position may produce a new entry attributes
+ * map, which logically violates the const semantic; that is why the
+ * no-param method above is not const. Don't call this method with null as
+ * the text param; use the non-const method above instead. The public
+ * interface for this method expects a value for the text param. We
+ * sometimes call it internally with null to avoid duplicating code.
*/
- SWBuf SWModule::renderText(const char *buf, int len, bool render) {
+SWBuf SWModule::renderText(const char *buf, int len, bool render) const {
bool savePEA = isProcessEntryAttributes();
if (!buf) {
entryAttributes.clear();
@@ -873,7 +983,7 @@ const char *SWModule::getRenderHeader() const {
if (tmpbuf) {
unsigned long size = (len < 0) ? ((getEntrySize()<0) ? strlen(tmpbuf) : getEntrySize()) : len;
if (size > 0) {
- key = (SWKey *)*this;
+ key = this->getKey();
optionFilter(tmpbuf, key);
@@ -1010,12 +1120,17 @@ void SWModule::deleteSearchFramework() {
signed char SWModule::createSearchFramework(void (*percent)(char, void *), void *percentUserData) {
-#ifdef USELUCENE
+#if defined USELUCENE || defined USEXAPIAN
SWBuf target = getConfigEntry("AbsoluteDataPath");
if (!target.endsWith("/") && !target.endsWith("\\")) {
target.append('/');
}
+#if defined USEXAPIAN
+ target.append("xapian");
+#elif defined USELUCENE
+ const int MAX_CONV_SIZE = 1024 * 1024;
target.append("lucene");
+#endif
int status = FileMgr::createParent(target+"/dummy");
if (status) return -1;
@@ -1024,7 +1139,6 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
SWKey textkey;
SWBuf c;
- const int MAX_CONV_SIZE = 1024 * 1024;
// turn all filters to default values
StringList filterSettings;
@@ -1058,6 +1172,17 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
setKey(*searchKey);
}
+ bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch");
+
+ // let's create or open our search index
+#if defined USEXAPIAN
+ Xapian::WritableDatabase database(target.c_str(), Xapian::DB_CREATE_OR_OPEN);
+ Xapian::TermGenerator termGenerator;
+ SWTRY {
+ termGenerator.set_stemmer(Xapian::Stem(getLanguage()));
+ } SWCATCH(...) {}
+
+#elif defined USELUCENE
RAMDirectory *ramDir = 0;
IndexWriter *coreWriter = 0;
IndexWriter *fsWriter = 0;
@@ -1065,11 +1190,11 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
const TCHAR *stopWords[] = { 0 };
standard::StandardAnalyzer *an = new standard::StandardAnalyzer(stopWords);
- bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch");
ramDir = new RAMDirectory();
coreWriter = new IndexWriter(ramDir, an, true);
coreWriter->setMaxFieldLength(MAX_CONV_SIZE);
+#endif
@@ -1126,7 +1251,12 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
bool good = false;
// start out entry
+#if defined USEXAPIAN
+ Xapian::Document doc;
+ termGenerator.set_document(doc);
+#elif defined USELUCENE
Document *doc = new Document();
+#endif
// get "key" field
SWBuf keyText = (vkcheck) ? vkcheck->getOSISRef() : getKeyText();
if (content && *content) {
@@ -1172,7 +1302,11 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
}
}
+#if defined USEXAPIAN
+ doc.set_data(keyText.c_str());
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("key"), (wchar_t *)utf8ToWChar(keyText).getRawData(), Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+#endif
if (includeKeyInSearch) {
c = keyText;
@@ -1181,11 +1315,21 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
content = c.c_str();
}
+#if defined USEXAPIAN
+ termGenerator.index_text(content);
+ termGenerator.index_text(content, 1, "C");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("content"), (wchar_t *)utf8ToWChar(content).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
if (strong.length() > 0) {
+#if defined USEXAPIAN
+ termGenerator.index_text(strong.c_str(), 1, "L");
+ termGenerator.index_text(morph.c_str(), 1, "M");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("lemma"), (wchar_t *)utf8ToWChar(strong).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
doc->add(*_CLNEW Field(_T("morph"), (wchar_t *)utf8ToWChar(morph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
//printf("setting fields (%s).\ncontent: %s\nlemma: %s\n", (const char *)*key, content, strong.c_str());
}
@@ -1330,20 +1474,39 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
if (proxBuf.length() > 0) {
+#if defined USEXAPIAN
+ termGenerator.index_text(proxBuf.c_str(), 1, "P");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("prox"), (wchar_t *)utf8ToWChar(proxBuf).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
good = true;
}
if (proxLem.length() > 0) {
+#if defined USEXAPIAN
+ termGenerator.index_text(proxLem.c_str(), 1, "PL");
+ termGenerator.index_text(proxMorph.c_str(), 1, "PM");
+#elif defined USELUCENE
doc->add(*_CLNEW Field(_T("proxlem"), (wchar_t *)utf8ToWChar(proxLem).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) );
doc->add(*_CLNEW Field(_T("proxmorph"), (wchar_t *)utf8ToWChar(proxMorph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) );
+#endif
good = true;
}
if (good) {
//printf("writing (%s).\n", (const char *)*key);
//fflush(stdout);
+#if defined USEXAPIAN
+ SWBuf idTerm;
+ idTerm.setFormatted("Q%ld", key->getIndex());
+ doc.add_boolean_term(idTerm.c_str());
+ database.replace_document(idTerm.c_str(), doc);
+#elif defined USELUCENE
coreWriter->addDocument(doc);
+#endif
}
+#if defined USEXAPIAN
+#elif defined USELUCENE
delete doc;
+#endif
(*this)++;
err = popError();
@@ -1351,6 +1514,8 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
// Optimizing automatically happens with the call to addIndexes
//coreWriter->optimize();
+#if defined USEXAPIAN
+#elif defined USELUCENE
coreWriter->close();
#ifdef CLUCENE2
@@ -1385,6 +1550,7 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
delete coreWriter;
delete fsWriter;
delete an;
+#endif
// reposition module back to where it was before we were called
setKey(*saveKey);