diff options
Diffstat (limited to 'src/modules/filters/osisplain.cpp')
-rw-r--r-- | src/modules/filters/osisplain.cpp | 261 |
1 files changed, 261 insertions, 0 deletions
diff --git a/src/modules/filters/osisplain.cpp b/src/modules/filters/osisplain.cpp new file mode 100644 index 0000000..6e583a4 --- /dev/null +++ b/src/modules/filters/osisplain.cpp @@ -0,0 +1,261 @@ +/****************************************************************************** + * + * osisplain.cpp - An SWFilter that provides stripping of OSIS tags + * + * $Id: osisplain.cpp 2984 2013-09-20 12:18:45Z scribe $ + * + * Copyright 2003-2013 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include <stdlib.h> +#include <osisplain.h> +#include <ctype.h> +#include <versekey.h> +#include <stringmgr.h> + +SWORD_NAMESPACE_START + + +namespace { + + class MyUserData : public BasicFilterUserData { + public: + SWBuf w; + XMLTag tag; + VerseKey *vk; + char testament; + SWBuf hiType; + MyUserData(const SWModule *module, const SWKey *key) : BasicFilterUserData(module, key) {} + }; +} + + +OSISPlain::OSISPlain() { + setTokenStart("<"); + setTokenEnd(">"); + + setEscapeStart("&"); + setEscapeEnd(";"); + + setEscapeStringCaseSensitive(true); + + addEscapeStringSubstitute("amp", "&"); + addEscapeStringSubstitute("apos", "'"); + addEscapeStringSubstitute("lt", "<"); + addEscapeStringSubstitute("gt", ">"); + addEscapeStringSubstitute("quot", "\""); + + setTokenCaseSensitive(true); + addTokenSubstitute("title", "\n"); + addTokenSubstitute("/title", "\n"); + addTokenSubstitute("/l", "\n"); + addTokenSubstitute("lg", "\n"); + addTokenSubstitute("/lg", "\n"); +} + +BasicFilterUserData *OSISPlain::createUserData(const SWModule *module, const SWKey *key) { + MyUserData *u = new MyUserData(module, key); + u->vk = SWDYNAMIC_CAST(VerseKey, u->key); + u->testament = (u->vk) ? u->vk->getTestament() : 2; // default to NT + return u; +} + + +bool OSISPlain::handleToken(SWBuf &buf, const char *token, BasicFilterUserData *userData) { + // manually process if it wasn't a simple substitution + if (!substituteToken(buf, token)) { + MyUserData *u = (MyUserData *)userData; + if (((*token == 'w') && (token[1] == ' ')) || + ((*token == '/') && (token[1] == 'w') && (!token[2]))) { + u->tag = token; + + bool start = false; + if (*token == 'w') { + if (token[strlen(token)-1] != '/') { + u->w = token; + return true; + } + start = true; + } + u->tag = (start) ? token : u->w.c_str(); + bool show = true; // to handle unplaced article in kjv2003-- temporary till combined + + SWBuf lastText = (start) ? "stuff" : u->lastTextNode.c_str(); + + const char *attrib; + const char *val; + if ((attrib = u->tag.getAttribute("xlit"))) { + val = strchr(attrib, ':'); + val = (val) ? (val + 1) : attrib; + buf.append(" <"); + buf.append(val); + buf.append('>'); + } + if ((attrib = u->tag.getAttribute("gloss"))) { + val = strchr(attrib, ':'); + val = (val) ? (val + 1) : attrib; + buf.append(" <"); + buf.append(val); + buf.append('>'); + } + if ((attrib = u->tag.getAttribute("lemma"))) { + int count = u->tag.getAttributePartCount("lemma", ' '); + int i = (count > 1) ? 0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0 + do { + char gh; + attrib = u->tag.getAttribute("lemma", i, ' '); + if (i < 0) i = 0; // to handle our -1 condition + val = strchr(attrib, ':'); + val = (val) ? (val + 1) : attrib; + if ((strchr("GH", *val)) && (isdigit(val[1]))) { + gh = *val; + val++; + } + else { + gh = (u->testament>1) ? 'G' : 'H'; + } + if ((!strcmp(val, "3588")) && (lastText.length() < 1)) + show = false; + else { + buf.append(" <"); + buf.append(gh); + buf.append(val); + buf.append(">"); + } + } while (++i < count); + } + if ((attrib = u->tag.getAttribute("morph")) && (show)) { + int count = u->tag.getAttributePartCount("morph", ' '); + int i = (count > 1) ? 0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0 + do { + attrib = u->tag.getAttribute("morph", i, ' '); + if (i < 0) i = 0; // to handle our -1 condition + val = strchr(attrib, ':'); + val = (val) ? (val + 1) : attrib; + if ((*val == 'T') && (strchr("GH", val[1])) && (isdigit(val[2]))) + val+=2; + buf.append(" ("); + buf.append(val); + buf.append(')'); + } while (++i < count); + } + if ((attrib = u->tag.getAttribute("POS"))) { + val = strchr(attrib, ':'); + val = (val) ? (val + 1) : attrib; + + buf.append(" <"); + buf.append(val); + buf.append('>'); + } + } + + // <note> tag + else if (!strncmp(token, "note", 4)) { + if (!strstr(token, "strongsMarkup")) { // leave strong's markup notes out, in the future we'll probably have different option filters to turn different note types on or off + buf.append(" ["); + } + else u->suspendTextPassThru = true; + } + else if (!strncmp(token, "/note", 5)) { + if (!u->suspendTextPassThru) + buf.append("] "); + else u->suspendTextPassThru = false; + } + + // <p> paragraph tag + else if (((*token == 'p') && ((token[1] == ' ') || (!token[1]))) || + ((*token == '/') && (token[1] == 'p') && (!token[2]))) { + userData->supressAdjacentWhitespace = true; + buf.append('\n'); + } + + // Milestoned paragraph, created by osis2mod + // <div type="paragraph" sID... /> + // <div type="paragraph" eID... /> + else if (!strcmp(u->tag.getName(), "div") && u->tag.getAttribute("type") && !strcmp(u->tag.getAttribute("type"), "paragraph") && + (u->tag.isEmpty() && (u->tag.getAttribute("sID") || u->tag.getAttribute("eID")))) { + userData->supressAdjacentWhitespace = true; + buf.append('\n'); + } + + // <lb .../> + else if (!strncmp(token, "lb", 2)) { + userData->supressAdjacentWhitespace = true; + buf.append('\n'); + } + else if (!strncmp(token, "l", 1) && strstr(token, "eID")) { + userData->supressAdjacentWhitespace = true; + buf.append('\n'); + } + else if (!strncmp(token, "/divineName", 11)) { + // Get the end portion of the string, and upper case it + char* end = buf.getRawData(); + end += buf.size() - u->lastTextNode.size(); + toupperstr(end); + } + else if (!strncmp(token, "hi", 2)) { + + // handle both OSIS 'type' and TEI 'rend' attributes + // there is no officially supported OSIS overline attribute, + // thus either TEI overline or OSIS x-overline would be best, + // but we have used "ol" in the past, as well. Once a valid + // OSIS overline attribute is made available, these should all + // eventually be deprecated and never documented that they are supported. + if (strstr(token, "rend=\"ol\"") || strstr(token, "rend=\"x-overline\"") || strstr(token, "rend=\"overline\"") + || strstr(token, "type=\"ol\"") || strstr(token, "type=\"x-overline\"") || strstr(token, "type=\"overline\"")) { + u->hiType = "overline"; + } + else u->hiType = ""; + u->suspendTextPassThru = true; + } + else if (!strncmp(token, "/hi", 3)) { + if (u->hiType == "overline") { + const unsigned char *b = (const unsigned char *)u->lastTextNode.c_str(); + while (*b) { + const unsigned char *o = b; + if (getUniCharFromUTF8(&b)) { + while (o != b) buf.append(*(o++)); + buf.append((unsigned char)0xCC); + buf.append((unsigned char)0x85); + } + } + } + else { + buf.append("*"); + buf.append(u->lastTextNode); + buf.append("*"); + } + u->suspendTextPassThru = false; + } + + // <milestone type="line"/> + else if (!strncmp(token, "milestone", 9)) { + const char* type = strstr(token+10, "type=\""); + if (type && strncmp(type+6, "line", 4)) { //we check for type != line + userData->supressAdjacentWhitespace = true; + buf.append('\n'); + } + } + + else { + return false; // we still didn't handle token + } + } + return true; +} + + +SWORD_NAMESPACE_END |