1 files changed, 192 insertions, 26 deletions
diff --git a/src/modules/swmodule.cpp b/src/modules/swmodule.cpp
index a2d7873..e6ceeda 100644
--- a/src/modules/swmodule.cpp
+++ b/src/modules/swmodule.cpp
@@ -4,7 +4,7 @@
  *			for all types of modules (e.g. texts, commentaries,
  *			maps, lexicons, etc.)
  *
- * $Id: swmodule.cpp 3282 2014-12-03 06:09:06Z greg.hellings $
+ * $Id: swmodule.cpp 3515 2017-11-01 11:38:09Z scribe $
  *
  * Copyright 1999-2013 CrossWire Bible Society (http://www.crosswire.org)
  *	CrossWire Bible Society
@@ -39,16 +39,23 @@
 #include <iostream>
 #endif
 
-#ifdef USECXX11REGEX
+#if defined(USECXX11REGEX)
 #include <regex>
 #ifndef REG_ICASE
 #define REG_ICASE std::regex::icase
 #endif
+#elif defined(USEICUREGEX)
+#include <unicode/regex.h>
+#ifndef REG_ICASE
+#define REG_ICASE UREGEX_CASE_INSENSITIVE
+#endif
 #else
 #include <regex.h>	// GNU
 #endif
 
-#ifdef USELUCENE
+#if defined USEXAPIAN
+#include <xapian.h>
+#elif defined USELUCENE
 #include <CLucene.h>
 
 //Lucence includes
@@ -171,6 +178,7 @@ char SWModule::popError()
 	char retval = error;
 
 	error = 0;
+	if (!retval) retval = key->popError();
 	return retval;
 }
 
@@ -295,7 +303,7 @@ char SWModule::setKey(const SWKey *ikey) {
 	if (oldKey)
 		delete oldKey;
 
-	return error = key->popError();
+	return error = key->getError();
 }
 
 
@@ -313,13 +321,13 @@ void SWModule::setPosition(SW_POSITION p) {
 
 	switch (p) {
 	case POS_TOP:
-		(*this)++;
-		(*this)--;
+		this->increment();
+		this->decrement();
 		break;
 
 	case POS_BOTTOM:
-		(*this)--;
-		(*this)++;
+		this->decrement();
+		this->increment();
 		break;
 	}
 
@@ -360,7 +368,7 @@ void SWModule::decrement(int steps) {
  *
  * ENT:	istr		- string for which to search
  * 	searchType	- type of search to perform
- *				>=0 - regex
+ *				>=0 - regex; (for backward compat, if > 0 then used as additional REGEX FLAGS)
  *				-1  - phrase
  *				-2  - multiword
  *				-3  - entryAttrib (eg. Word//Lemma./G1234/)	 (Lemma with dot means check components (Lemma.[1-9]) also)
@@ -379,16 +387,22 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 	SWBuf term = istr;
 	bool includeComponents = false;	// for entryAttrib e.g., /Lemma.1/ 
 
-#ifdef USELUCENE
 	SWBuf target = getConfigEntry("AbsoluteDataPath");
 	if (!target.endsWith("/") && !target.endsWith("\\")) {
 		target.append('/');
 	}
+#if defined USEXAPIAN
+	target.append("xapian");
+#elif defined USELUCENE
 	target.append("lucene");
 #endif
 	if (justCheckIfSupported) {
 		*justCheckIfSupported = (searchType >= -3);
-#ifdef USELUCENE
+#if defined USEXAPIAN
+		if ((searchType == -4) && (FileMgr::existsDir(target))) {
+			*justCheckIfSupported = true;
+		}
+#elif defined USELUCENE
 		if ((searchType == -4) && (IndexReader::indexExists(target.c_str()))) {
 			*justCheckIfSupported = true;
 		}
@@ -407,6 +421,8 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 	std::locale::global(std::locale("en_US.UTF-8"));
 
 	std::regex preg;
+#elif defined(USEICUREGEX)
+	RegexMatcher *matcher = 0;
 #else
 	regex_t preg;
 #endif
@@ -449,18 +465,48 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 	*this = TOP;
 	if (searchType >= 0) {
 #ifdef USECXX11REGEX
-		preg = std::regex((SWBuf(".*")+istr+".*").c_str(), std::regex_constants::extended & flags);
+		preg = std::regex((SWBuf(".*")+istr+".*").c_str(), std::regex_constants::extended | searchType | flags);
+#elif defined(USEICUREGEX)
+		UErrorCode        status    = U_ZERO_ERROR;
+		matcher = new RegexMatcher(istr, searchType | flags, status);
+		if (U_FAILURE(status)) {
+			SWLog::getSystemLog()->logError("Error compiling Regex: %d", status);
+			return listKey;
+		}
+
 #else
 		flags |=searchType|REG_NOSUB|REG_EXTENDED;
-		regcomp(&preg, istr, flags);
+		int err = regcomp(&preg, istr, flags);
+		if (err) {
+			SWLog::getSystemLog()->logError("Error compiling Regex: %d", err);
+			return listKey;
+		}
 #endif
 	}
 
 	(*percent)(++perc, percentUserData);
 
 
-#ifdef USELUCENE
-	if (searchType == -4) {	// lucene
+#if defined USEXAPIAN || defined USELUCENE
+	(*percent)(10, percentUserData);
+	if (searchType == -4) {	// indexed search
+#if defined USEXAPIAN
+		SWTRY {
+			Xapian::Database database(target.c_str());
+			Xapian::QueryParser queryParser;
+			queryParser.set_default_op(Xapian::Query::OP_AND);
+			SWTRY {
+				queryParser.set_stemmer(Xapian::Stem(getLanguage()));
+			} SWCATCH(...) {}
+			queryParser.set_stemming_strategy(queryParser.STEM_SOME);
+			queryParser.add_prefix("content", "C");
+			queryParser.add_prefix("lemma", "L");
+			queryParser.add_prefix("morph", "M");
+			queryParser.add_prefix("prox", "P");
+			queryParser.add_prefix("proxlem", "PL");
+			queryParser.add_prefix("proxmorph", "PM");
+
+#elif defined USELUCENE
 		
 		lucene::index::IndexReader    *ir = 0;
 		lucene::search::IndexSearcher *is = 0;
@@ -469,22 +515,44 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 		SWTRY {
 			ir = IndexReader::open(target);
 			is = new IndexSearcher(ir);
-			(*percent)(10, percentUserData);
-
 			const TCHAR *stopWords[] = { 0 };
 			standard::StandardAnalyzer analyzer(stopWords);
+#endif
+
+			// parse the query
+#if defined USEXAPIAN
+			Xapian::Query q = queryParser.parse_query(istr);
+			Xapian::Enquire enquire = Xapian::Enquire(database);
+#elif defined USELUCENE
 			q = QueryParser::parse((wchar_t *)utf8ToWChar(istr).getRawData(), _T("content"), &analyzer);
+#endif
 			(*percent)(20, percentUserData);
+
+			// perform the search
+#if defined USEXAPIAN
+			enquire.set_query(q);
+			Xapian::MSet h = enquire.get_mset(0, 99999);
+#elif defined USELUCENE
 			h = is->search(q);
+#endif
 			(*percent)(80, percentUserData);
 
 			// iterate thru each good module position that meets the search
 			bool checkBounds = getKey()->isBoundSet();
+#if defined USEXAPIAN
+			Xapian::MSetIterator i;
+			for (i = h.begin(); i != h.end(); ++i) {
+//				cout << "Document ID " << *i << "\t";
+				__u64 score = i.get_percent();
+				Xapian::Document doc = i.get_document();
+				*resultKey = doc.get_data().c_str();
+#elif defined USELUCENE
 			for (unsigned long i = 0; i < (unsigned long)h->length(); i++) {
 				Document &doc = h->doc(i);
-
 				// set a temporary verse key to this module position
 				*resultKey = wcharToUTF8(doc.get(_T("key"))); //TODO Does a key always accept utf8?
+				__u64 score = (__u64)((__u32)(h->score(i)*100));
+#endif
 
 				// check to see if it sets ok (within our bounds) and if not, skip
 				if (checkBounds) {
@@ -494,14 +562,19 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 					}
 				}
 				listKey << *resultKey;
-				listKey.getElement()->userData = (__u64)((__u32)(h->score(i)*100));
+				listKey.getElement()->userData = score;
 			}
 			(*percent)(98, percentUserData);
 		}
 		SWCATCH (...) {
+#if defined USEXAPIAN
+#elif defined USELUCENE
 			q = 0;
+#endif
 			// invalid clucene query
 		}
+#if defined USEXAPIAN
+#elif defined USELUCENE
 		delete h;
 		delete q;
 
@@ -509,6 +582,7 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 		if (ir) {
 			ir->close();
 		}
+#endif
 	}
 #endif
 
@@ -587,6 +661,11 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 			SWBuf textBuf = stripText();
 #ifdef USECXX11REGEX
 			if (std::regex_match(std::string(textBuf.c_str()), preg)) {
+#elif defined(USEICUREGEX)
+			UnicodeString stringToTest = textBuf.c_str();
+			matcher->reset(stringToTest);
+
+			if (matcher->find()) {
 #else
 			if (!regexec(&preg, textBuf, 0, 0, 0)) {
 #endif
@@ -597,6 +676,12 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 			}
 #ifdef USECXX11REGEX
 			else if (std::regex_match(std::string((lastBuf + ' ' + textBuf).c_str()), preg)) {
+#elif defined(USEICUREGEX)
+			else {
+				stringToTest = (lastBuf + ' ' + textBuf).c_str();
+				matcher->reset(stringToTest);
+
+				if (matcher->find()) {
 #else
 			else if (!regexec(&preg, lastBuf + ' ' + textBuf, 0, 0, 0)) {
 #endif
@@ -607,6 +692,9 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 			else {
 				lastBuf = textBuf;
 			}
+#if defined(USEICUREGEX)
+			}
+#endif
 		}
 
 		// phrase
@@ -731,6 +819,7 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 				}
 				break;
 			}
+			// NOT DONE
 			case -5:
 				AttributeList &words = getEntryAttributes()["Word"];
 				SWBuf kjvWord = "";
@@ -789,6 +878,8 @@ ListKey &SWModule::search(const char *istr, int searchType, int flags, SWKey *sc
 	if (searchType >= 0) {
 #ifdef USECXX11REGEX
 		std::locale::global(oldLocale);
+#elif defined(USEICUREGEX)
+		delete matcher;
 #else
 		regfree(&preg);
 #endif
@@ -846,14 +937,33 @@ const char *SWModule::getRenderHeader() const {
 
 
 /******************************************************************************
- * SWModule::renderText 	- calls all renderfilters on current text
+ * SWModule::renderText 	- calls all renderfilters on current module
+ *				position
+ *
+ * RET: this module's text at current key location massaged by renderText filters
+ */
+SWBuf SWModule::renderText() {
+	return this->renderText(static_cast<const char *>(0));	// null buffer: render the entry at the current module position
+}
+
+/******************************************************************************
+ * SWModule::renderText 	- calls all renderfilters on provided text
+ *				or on current module position if provided text is null
  *
- * ENT:	buf	- buffer to Render instead of current module position
+ * ENT:	buf	- buffer to render
  *
  * RET: this module's text at current key location massaged by renderText filters
+ *
+ * NOTES: This method is only truly const when called with a provided text.
+ * Rendering the module's current position may produce a new entry attributes
+ * map, which logically violates const semantics; that is why the method
+ * above, which takes no params, is not const.  Public callers should not pass
+ * null as the text param; use the non-const method above instead.  The
+ * public interface for this method expects a value for the text param; we
+ * call it internally with null only to avoid duplicating code.
  */
 
- SWBuf SWModule::renderText(const char *buf, int len, bool render) {
+SWBuf SWModule::renderText(const char *buf, int len, bool render) const {
 	bool savePEA = isProcessEntryAttributes();
 	if (!buf) {
 		entryAttributes.clear();
@@ -873,7 +983,7 @@ const char *SWModule::getRenderHeader() const {
 	if (tmpbuf) {
 		unsigned long size = (len < 0) ? ((getEntrySize()<0) ? strlen(tmpbuf) : getEntrySize()) : len;
 		if (size > 0) {
-			key = (SWKey *)*this;
+			key = this->getKey();
 
 			optionFilter(tmpbuf, key);
 	
@@ -1010,12 +1120,17 @@ void SWModule::deleteSearchFramework() {
 
 signed char SWModule::createSearchFramework(void (*percent)(char, void *), void *percentUserData) {
 
-#ifdef USELUCENE
+#if defined USELUCENE || defined USEXAPIAN
 	SWBuf target = getConfigEntry("AbsoluteDataPath");
 	if (!target.endsWith("/") && !target.endsWith("\\")) {
 		target.append('/');
 	}
+#if defined USEXAPIAN
+	target.append("xapian");
+#elif defined USELUCENE
+	const int MAX_CONV_SIZE = 1024 * 1024;
 	target.append("lucene");
+#endif
 	int status = FileMgr::createParent(target+"/dummy");
 	if (status) return -1;
 
@@ -1024,7 +1139,6 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 	SWKey textkey;
 	SWBuf c;
 
-	const int MAX_CONV_SIZE = 1024 * 1024;
 
 	// turn all filters to default values
 	StringList filterSettings;
@@ -1058,6 +1172,17 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 		setKey(*searchKey);
 	}
 
+	bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch");
+
+	// let's create or open our search index
+#if defined USEXAPIAN
+	Xapian::WritableDatabase database(target.c_str(), Xapian::DB_CREATE_OR_OPEN);
+	Xapian::TermGenerator termGenerator;
+	SWTRY {
+		termGenerator.set_stemmer(Xapian::Stem(getLanguage()));
+	} SWCATCH(...) {}
+
+#elif defined USELUCENE
 	RAMDirectory *ramDir = 0;
 	IndexWriter *coreWriter = 0;
 	IndexWriter *fsWriter = 0;
@@ -1065,11 +1190,11 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 
 	const TCHAR *stopWords[] = { 0 };
 	standard::StandardAnalyzer *an = new standard::StandardAnalyzer(stopWords);
-	bool includeKeyInSearch = getConfig().has("SearchOption", "IncludeKeyInSearch");
 
 	ramDir = new RAMDirectory();
 	coreWriter = new IndexWriter(ramDir, an, true);
 	coreWriter->setMaxFieldLength(MAX_CONV_SIZE);
+#endif
 
 
 
@@ -1126,7 +1251,12 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 		bool good = false;
 
 		// start out entry
+#if defined USEXAPIAN
+		Xapian::Document doc;
+		termGenerator.set_document(doc);
+#elif defined USELUCENE
 		Document *doc = new Document();
+#endif
 		// get "key" field
 		SWBuf keyText = (vkcheck) ? vkcheck->getOSISRef() : getKeyText();
 		if (content && *content) {
@@ -1172,7 +1302,11 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 				}
 			}
 
+#if defined USEXAPIAN
+			doc.set_data(keyText.c_str());
+#elif defined USELUCENE
 			doc->add(*_CLNEW Field(_T("key"), (wchar_t *)utf8ToWChar(keyText).getRawData(), Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+#endif
 
 			if (includeKeyInSearch) {
 				c = keyText;
@@ -1181,11 +1315,21 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 				content = c.c_str();
 			}
 
+#if defined USEXAPIAN
+			termGenerator.index_text(content);
+			termGenerator.index_text(content, 1, "C");
+#elif defined USELUCENE
 			doc->add(*_CLNEW Field(_T("content"), (wchar_t *)utf8ToWChar(content).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
 
 			if (strong.length() > 0) {
+#if defined USEXAPIAN
+				termGenerator.index_text(strong.c_str(), 1, "L");
+				termGenerator.index_text(morph.c_str(), 1, "M");
+#elif defined USELUCENE
 				doc->add(*_CLNEW Field(_T("lemma"), (wchar_t *)utf8ToWChar(strong).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
 				doc->add(*_CLNEW Field(_T("morph"), (wchar_t *)utf8ToWChar(morph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
 //printf("setting fields (%s).\ncontent: %s\nlemma: %s\n", (const char *)*key, content, strong.c_str());
 			}
 
@@ -1330,20 +1474,39 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 
 		if (proxBuf.length() > 0) {
 
+#if defined USEXAPIAN
+			termGenerator.index_text(proxBuf.c_str(), 1, "P");
+#elif defined USELUCENE
 			doc->add(*_CLNEW Field(_T("prox"), (wchar_t *)utf8ToWChar(proxBuf).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED));
+#endif
 			good = true;
 		}
 		if (proxLem.length() > 0) {
+#if defined USEXAPIAN
+			termGenerator.index_text(proxLem.c_str(), 1, "PL");
+			termGenerator.index_text(proxMorph.c_str(), 1, "PM");
+#elif defined USELUCENE
 			doc->add(*_CLNEW Field(_T("proxlem"), (wchar_t *)utf8ToWChar(proxLem).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) );
 			doc->add(*_CLNEW Field(_T("proxmorph"), (wchar_t *)utf8ToWChar(proxMorph).getRawData(), Field::STORE_NO | Field::INDEX_TOKENIZED) );
+#endif
 			good = true;
 		}
 		if (good) {
 //printf("writing (%s).\n", (const char *)*key);
 //fflush(stdout);
+#if defined USEXAPIAN
+			SWBuf idTerm;
+			idTerm.setFormatted("Q%ld", key->getIndex());
+			doc.add_boolean_term(idTerm.c_str());
+			database.replace_document(idTerm.c_str(), doc);
+#elif defined USELUCENE
 			coreWriter->addDocument(doc);
+#endif
 		}
+#if defined USEXAPIAN
+#elif defined USELUCENE
 		delete doc;
+#endif
 
 		(*this)++;
 		err = popError();
@@ -1351,6 +1514,8 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 
 	// Optimizing automatically happens with the call to addIndexes
 	//coreWriter->optimize();
+#if defined USEXAPIAN
+#elif defined USELUCENE
 	coreWriter->close();
 
 #ifdef CLUCENE2
@@ -1385,6 +1550,7 @@ signed char SWModule::createSearchFramework(void (*percent)(char, void *), void
 	delete coreWriter;
 	delete fsWriter;
 	delete an;
+#endif
 
 	// reposition module back to where it was before we were called
 	setKey(*saveKey);