summaryrefslogtreecommitdiff
path: root/src/modules/filters/osismorphsegmentation.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/modules/filters/osismorphsegmentation.cpp')
-rw-r--r--src/modules/filters/osismorphsegmentation.cpp106
1 files changed, 106 insertions, 0 deletions
diff --git a/src/modules/filters/osismorphsegmentation.cpp b/src/modules/filters/osismorphsegmentation.cpp
new file mode 100644
index 0000000..bf32581
--- /dev/null
+++ b/src/modules/filters/osismorphsegmentation.cpp
@@ -0,0 +1,106 @@
+/******************************************************************************
+ *
+ * osismorphsegmentation - SWFilter descendant to toggle splitting of morphemes
+ * (for morpheme segmented Hebrew in the WLC)
+ */
+
+
+#include <osismorphsegmentation.h>
+#include <stdlib.h>
+#include <utilxml.h>
+#include <swmodule.h>
+#include <swbuf.h>
+
+SWORD_NAMESPACE_START
+
+const char oName[] = "Morpheme Segmentation";
+const char oTip[] = "Toggles Morpheme Segmentation On and Off, when present";
+
+const SWBuf choices[3] = {"Off", "On", ""};
+const StringList oValues(&choices[0], &choices[2]);
+
+OSISMorphSegmentation::OSISMorphSegmentation() : SWOptionFilter(oName, oTip, &oValues) {
+ setOptionValue("Off");
+}
+
+
+OSISMorphSegmentation::~OSISMorphSegmentation() {}
+
+
+char OSISMorphSegmentation::processText(SWBuf &text, const SWKey * /*key*/, const SWModule *module) {
+ SWBuf token;
+ bool intoken = false;
+ bool hide = false;
+
+ SWBuf orig( text );
+ const char *from = orig.c_str();
+
+ XMLTag tag;
+ SWBuf tagText = "";
+ unsigned int morphemeNum = 0;
+ bool inMorpheme = false;
+ SWBuf buf;
+
+ for (text = ""; *from; ++from) {
+ if (*from == '<') {
+ intoken = true;
+ token = "";
+ continue;
+ }
+
+ if (*from == '>') { // process tokens
+ intoken = false;
+
+ if (!strncmp(token.c_str(), "seg ", 4) || !strncmp(token.c_str(), "/seg", 4)) {
+ tag = token;
+
+ if (!tag.isEndTag() && tag.getAttribute("type") && !strcmp("morph", tag.getAttribute("type"))) { //<seg type="morph"> start tag
+ hide = !option; //only hide if option is Off
+ tagText = "";
+ inMorpheme = true;
+ }
+
+ if (tag.isEndTag()) {
+ buf.setFormatted("%.3d", morphemeNum++);
+ module->getEntryAttributes()["Morpheme"][buf]["body"] = tagText;
+ inMorpheme = false;
+ }
+ if (hide) { //hides start and end tags as long as hide is set
+
+ if (tag.isEndTag()) { //</seg>
+ hide = false;
+ }
+
+ continue; //leave out the current token
+ }
+ } //end of seg tag handling
+
+ text.append('<');
+ text.append(token);
+ text.append('>');
+
+ if (inMorpheme) {
+ tagText.append('<');
+ tagText.append(token);
+ tagText.append('>');
+ }
+
+ hide = false;
+
+ continue;
+ } //end of intoken part
+
+ if (intoken) { //copy token
+ token.append(*from);
+ }
+ else { //copy text which is not inside of a tag
+ text.append(*from);
+ if (inMorpheme) {
+ tagText.append(*from);
+ }
+ }
+ }
+ return 0;
+}
+
+SWORD_NAMESPACE_END