diff options
Diffstat (limited to 'src/modules/filters/thmlosis.cpp')
-rw-r--r-- | src/modules/filters/thmlosis.cpp | 591 |
1 files changed, 591 insertions, 0 deletions
diff --git a/src/modules/filters/thmlosis.cpp b/src/modules/filters/thmlosis.cpp new file mode 100644 index 0000000..3acd959 --- /dev/null +++ b/src/modules/filters/thmlosis.cpp @@ -0,0 +1,591 @@ +/****************************************************************************** + * + * thmlosis.cpp - filter to convert ThML to OSIS + * + * $Id: thmlosis.cpp 2833 2013-06-29 06:40:28Z chrislit $ + * + * Copyright 2002-2013 CrossWire Bible Society (http://www.crosswire.org) + * CrossWire Bible Society + * P. O. Box 2528 + * Tempe, AZ 85280-2528 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <ctype.h> +#include <thmlosis.h> +#include <swmodule.h> +#include <swlog.h> +#include <versekey.h> +#include <utilstr.h> +#include <utilxml.h> + + +SWORD_NAMESPACE_START + +ThMLOSIS::ThMLOSIS() { +} + + +ThMLOSIS::~ThMLOSIS() { +} + + +char ThMLOSIS::processText(SWBuf &text, const SWKey *key, const SWModule *module) { + char token[2048]; // cheese. Fix. + int tokpos = 0; + bool intoken = false; + bool keepToken = false; + bool ampersand = false; + +// static QuoteStack quoteStack; + + bool lastspace = false; + char val[128]; + SWBuf buf; + char *valto; + char *ch; + + const char *wordStart = text.c_str(); + const char *wordEnd = NULL; + + const char *textStart = NULL; + const char *textEnd = NULL; + + bool suspendTextPassThru = false; + bool handled = false; + bool newText = false; + bool newWord = false; + +// SWBuf tmp; + SWBuf divEnd = ""; + + SWBuf orig = text; + const char* from = orig.c_str(); + + text = ""; + for (from = orig.c_str(); *from; ++from) { + + // handle silly <variant word> items in greek whnu, remove when module is fixed + if ((*from == '<') && (*(from+1) < 0)) { + text += "<"; + continue; + } + + if (*from == '<') { //start of new token detected + intoken = true; + tokpos = 0; + token[0] = 0; + token[1] = 0; + token[2] = 0; + ampersand = false; + textEnd = from-1; + wordEnd = text.c_str() + text.length();//not good, instead of wordEnd = to! + +// wordEnd = to; + continue; + } + + if (*from == '&') { + intoken = true; + tokpos = 0; + token[0] = 0; + token[1] = 0; + token[2] = 0; + ampersand = true; + continue; + } + + if (*from == ';' && ampersand) { + intoken = false; + ampersand = false; + + if (*token == '#') { + text += '&'; + text += token; + text += ';'; + } + else if (!strncmp("nbsp", token, 4)) text += ' '; + else if (!strncmp("quot", token, 4)) text += '"'; + else if (!strncmp("amp", token, 3)) text += '&'; + else if (!strncmp("lt", token, 2)) text += '<'; + else if (!strncmp("gt", token, 2)) text += '>'; + else if (!strncmp("brvbar", token, 6)) text += '¦'; + else if (!strncmp("sect", token, 4)) text += '§'; + else if (!strncmp("copy", token, 4)) text += '©'; + else if (!strncmp("laquo", token, 5)) text += '«'; + else if (!strncmp("reg", token, 3)) text += '®'; + else if (!strncmp("acute", token, 5)) text += '´'; + else if (!strncmp("para", token, 4)) text += '¶'; + else if (!strncmp("raquo", token, 5)) text += '»'; + else if (!strncmp("Aacute", token, 6)) text += 'Á'; + else if (!strncmp("Agrave", token, 6)) text += 'À'; + else if (!strncmp("Acirc", token, 5)) text += 'Â'; + else if (!strncmp("Auml", token, 4)) text += 'Ä'; + else if (!strncmp("Atilde", token, 6)) text += 'Ã'; + else if (!strncmp("Aring", token, 5)) text += 'Å'; + else if (!strncmp("aacute", token, 6)) text += 'á'; + else if (!strncmp("agrave", token, 6)) text += 'à'; + else if (!strncmp("acirc", token, 5)) text += 'â'; + else if (!strncmp("auml", token, 4)) text += 'ä'; + else if (!strncmp("atilde", token, 6)) text += 'ã'; + else if (!strncmp("aring", token, 5)) text += 'å'; + else if (!strncmp("Eacute", token, 6)) text += 'É'; + else if (!strncmp("Egrave", token, 6)) text += 'È'; + else if (!strncmp("Ecirc", token, 5)) text += 'Ê'; + else if (!strncmp("Euml", token, 4)) text += 'Ë'; + else if (!strncmp("eacute", token, 6)) text += 'é'; + else if (!strncmp("egrave", token, 6)) text += 'è'; + else if (!strncmp("ecirc", token, 5)) text += 'ê'; + else if (!strncmp("euml", token, 4)) text += 'ë'; + else if (!strncmp("Iacute", token, 6)) text += 'Í'; + else if (!strncmp("Igrave", token, 6)) text += 'Ì'; + else if (!strncmp("Icirc", token, 5)) text += 'Î'; + else if (!strncmp("Iuml", token, 4)) text += 'Ï'; + else if (!strncmp("iacute", token, 6)) text += 'í'; + else if (!strncmp("igrave", token, 6)) text += 'ì'; + else if (!strncmp("icirc", token, 5)) text += 'î'; + else if (!strncmp("iuml", token, 4)) text += 'ï'; + else if (!strncmp("Oacute", token, 6)) text += 'Ó'; + else if (!strncmp("Ograve", token, 6)) text += 'Ò'; + else if (!strncmp("Ocirc", token, 5)) text += 'Ô'; + else if (!strncmp("Ouml", token, 4)) text += 'Ö'; + else if (!strncmp("Otilde", token, 6)) text += 'Õ'; + else if (!strncmp("oacute", token, 6)) text += 'ó'; + else if (!strncmp("ograve", token, 6)) text += 'ò'; + else if (!strncmp("ocirc", token, 5)) text += 'ô'; + else if (!strncmp("ouml", token, 4)) text += 'ö'; + else if (!strncmp("otilde", token, 6)) text += 'õ'; + else if (!strncmp("Uacute", token, 6)) text += 'Ú'; + else if (!strncmp("Ugrave", token, 6)) text += 'Ù'; + else if (!strncmp("Ucirc", token, 5)) text += 'Û'; + else if (!strncmp("Uuml", token, 4)) text += 'Ü'; + else if (!strncmp("uacute", token, 6)) text += 'ú'; + else if (!strncmp("ugrave", token, 6)) text += 'ù'; + else if (!strncmp("ucirc", token, 5)) text += 'û'; + else if (!strncmp("uuml", token, 4)) text += 'ü'; + else if (!strncmp("Yacute", token, 6)) text += 'Ý'; + else if (!strncmp("yacute", token, 6)) text += 'ý'; + else if (!strncmp("yuml", token, 4)) text += 'ÿ'; + + else if (!strncmp("deg", token, 3)) text += '°'; + else if (!strncmp("plusmn", token, 6)) text += '±'; + else if (!strncmp("sup2", token, 4)) text += '²'; + else if (!strncmp("sup3", token, 4)) text += '³'; + else if (!strncmp("sup1", token, 4)) text += '¹'; + else if (!strncmp("nbsp", token, 4)) text += 'º'; + else if (!strncmp("pound", token, 5)) text += '£'; + else if (!strncmp("cent", token, 4)) text += '¢'; + else if (!strncmp("frac14", token, 6)) text += '¼'; + else if (!strncmp("frac12", token, 6)) text += '½'; + else if (!strncmp("frac34", token, 6)) text += '¾'; + else if (!strncmp("iquest", token, 6)) text += '¿'; + else if (!strncmp("iexcl", token, 5)) text += '¡'; + else if (!strncmp("ETH", token, 3)) text += 'Ð'; + else if (!strncmp("eth", token, 3)) text += 'ð'; + else if (!strncmp("THORN", token, 5)) text += 'Þ'; + else if (!strncmp("thorn", token, 5)) text += 'þ'; + else if (!strncmp("AElig", token, 5)) text += 'Æ'; + else if (!strncmp("aelig", token, 5)) text += 'æ'; + else if (!strncmp("Oslash", token, 6)) text += 'Ø'; + else if (!strncmp("curren", token, 6)) text += '¤'; + else if (!strncmp("Ccedil", token, 6)) text += 'Ç'; + else if (!strncmp("ccedil", token, 6)) text += 'ç'; + else if (!strncmp("szlig", token, 5)) text += 'ß'; + else if (!strncmp("Ntilde", token, 6)) text += 'Ñ'; + else if (!strncmp("ntilde", token, 6)) text += 'ñ'; + else if (!strncmp("yen", token, 3)) text += '¥'; + else if (!strncmp("not", token, 3)) text += '¬'; + else if (!strncmp("ordf", token, 4)) text += 'ª'; + else if (!strncmp("uml", token, 3)) text += '¨'; + else if (!strncmp("shy", token, 3)) text += ''; + else if (!strncmp("macr", token, 4)) text += '¯'; + else if (!strncmp("micro", token, 5)) text += "µ"; + else if (!strncmp("middot", token, 6)) text +="·"; + else if (!strncmp("cedil", token, 5)) text += "¸"; + else if (!strncmp("ordm", token, 4)) text += "º"; + else if (!strncmp("times", token, 5)) text += "×"; + else if (!strncmp("divide", token, 6)) text +="÷"; + else if (!strncmp("oslash", token, 6)) text +="ø"; + continue; + } + + // handle silly <variant word> items in greek whnu, remove when module is fixed + if ((*from == '>') && (*(from-1) < 0)) { + text += ">"; + continue; + } + + if (*from == '>') { // process tokens + intoken = false; + keepToken = false; + suspendTextPassThru = false; + newWord = true; + handled = false; + + while (wordStart < (text.c_str() + text.length())) { //hack + if (strchr(";,. :?!()'\"", *wordStart) && wordStart[0] && wordStart[1]) + wordStart++; + else break; + } + while (wordEnd > wordStart) { + if (strchr(" ,;:.?!()'\"", *wordEnd)) + wordEnd--; + else break; + } + + // variants + if (!strncmp(token, "div type=\"variant\"", 18)) { + XMLTag tag = token; + text.append("<seg type=\"x-variant\""); + SWBuf cls = "x-class:"; + cls += tag.getAttribute("class"); + if (cls.length()>8) + text.appendFormatted(" subType=\"%s\"", cls.c_str()); + + text += ">"; + divEnd = "</seg>"; + newText = true; + lastspace = false; + handled = true; + } + // section titles + if (!strcmp(token, "div class=\"sechead\"")) { +// pushString(&to, "<title>"); + text.append("<title>"); + divEnd = "</title>"; + newText = true; + lastspace = false; + handled = true; + } + else if (!strcmp(token, "/div")) { + //pushString(&to, divEnd.c_str()); + text.append(divEnd); + lastspace = false; + handled = true; + } + // Scripture Reference + if (!strncmp(token, "scripRef", 8)) { + // pushString(buf, "<reference osisRef=\""); + suspendTextPassThru = true; + newText = true; + handled = true; + } + else if (!strncmp(token, "/scripRef", 9)) { + SWBuf tmp; + tmp = ""; + tmp.append(textStart, (int)(textEnd - textStart)+1); + //pushString(&to, convertToOSIS(tmp.c_str(), key)); + text.append(VerseKey::convertToOSIS(tmp.c_str(), key)); + suspendTextPassThru = false; + handled = true; + } +// Usage of italics to represent transChange isn't domaninant; +// solution: mark in OSIS instead, assume no semantics other than emphasis +// of italicized text +// if (!strcmp(module->Type(), "Biblical Texts")) { +// // Italics assume transchange for Biblical texts +// if (!stricmp(token, "i")) { +// pushString(&to, "<transChange type=\"added\">"); +// newText = true; +// lastspace = false; +// handled = true; +// } +// else if (!stricmp(token, "/i")) { +// pushString(&to, "</transChange>"); +// lastspace = false; +// handled = true; +// } +// } +// else { +// // otherwise, italics are just italics +//-- end italics for transchange + if (!stricmp(token, "i")) { +// pushString(&to, "<hi type=\"i\">"); + text.append("<hi type=\"i\">"); + newText = true; + lastspace = false; + handled = true; + } + else if (!stricmp(token, "/i")) { +// pushString(&to, "</hi>"); + text.append("</hi>"); + lastspace = false; + handled = true; + } +// } + + if (!strcmp(token, "b")) { +// pushString(&to, "<hi type=\"b\">"); + text.append("<hi type=\"b\">"); + newText = true; + lastspace = false; + handled = true; + } + else if (!strcmp(token, "/b")) { +// pushString(&to, "</hi>"); + text.append("</hi>"); + lastspace = false; + handled = true; + } + + // Footnote + if (!strncmp(token, "note", 4)) { + //pushString(&to, "<note>"); + text.append("<note>"); + newText = true; + lastspace = false; + handled = true; + } + else if (!strcmp(token, "/note")) { + // pushString(&to, "</note>"); + text.append("</note>"); + lastspace = false; + handled = true; + } + + // Figure + else if (!strncmp(token, "img ", 4)) { + const char *src = strstr(token, "src"); + if (!src) // assert we have a src attribute + continue; +// return false; + + //pushString(&to, "<figure src=\""); + text.append("<figure src=\""); + + const char* end = strchr(src+2, '"'); //start search behind src=" + + if (end) { //append the path + text.append(src+2, end - (src+2)); + } + +// const char *c; +// for (c = src;((*c) && (*c != '"')); c++); + +// uncomment for SWORD absolute path logic +// if (*(c+1) == '/') { +// pushString(buf, "file:"); +// pushString(buf, module->getConfigEntry("AbsoluteDataPath")); +// if (*((*buf)-1) == '/') +// c++; // skip '/' +// } +// end of uncomment for asolute path logic + +// for (c++;((*c) && (*c != '"')); c++) +// *to++ = *c; + + //pushString(&to, "\" />"); + text.append("\" />"); + handled = true; + } + + // Strongs numbers + else if (!strnicmp(token, "sync type=\"Strongs\" ", 20)) { // Strongs + valto = val; + for (unsigned int i = 27; token[i] != '\"' && i < 150; i++) + *valto++ = token[i]; + *valto = 0; + if (atoi((!isdigit(*val))?val+1:val) < 5627) { + // normal strongs number + strstrip(val); + + if (!strncmp(wordStart, "<w ", 3)) { + const char *attStart = strstr(wordStart, "lemma"); + if (attStart) { //existing morph attribute, append this one to it + attStart += 7; + buf = ""; + buf.appendFormatted("strong:%s ", val); + } + else { // no lemma attribute + attStart = wordStart + 3; + buf = ""; + buf.appendFormatted(buf, "lemma=\"strong:%s\" ", val); + } + + text.insert(attStart - text.c_str(), buf); + } + else { //wordStart doesn't point to an existing <w> attribute! + buf = ""; + buf.appendFormatted("<w lemma=\"strong:%s\">", val); + text.insert(wordStart - text.c_str(), buf); + text += "</w>"; + lastspace = false; + } + } + // OLB verb morph, leave it out of OSIS tag + else { + } + handled = true; + } + + // Morphology + else if (!strncmp(token, "sync type=\"morph\"", 17)) { + SWBuf cls = ""; + SWBuf morph = ""; + for (ch = token+17; *ch; ch++) { + if (!strncmp(ch, "class=\"", 7)) { + valto = val; + for (unsigned int i = 7; ch[i] != '\"' && i < 127; i++) + *valto++ = ch[i]; + *valto = 0; + strstrip(val); + cls = val; + } + if (!strncmp(ch, "value=\"", 7)) { + valto = val; + for (unsigned int i = 7; ch[i] != '\"' && i < 127; i++) + *valto++ = ch[i]; + *valto = 0; + strstrip(val); + morph = val; + } + } + if (!strncmp(wordStart, "<w ", 3)) { + const char *attStart = strstr(wordStart, "morph"); + if (attStart) { //existing morph attribute, append this one to it + attStart += 7; + buf = ""; + buf.appendFormatted("%s:%s ", ((cls.length())?cls.c_str():"robinson"), morph.c_str()); + } + else { // no lemma attribute + attStart = wordStart + 3; + buf = ""; + buf.appendFormatted("morph=\"%s:%s\" ", ((cls.length())?cls.c_str():"robinson"), morph.c_str()); + } + + text.insert(attStart - text.c_str(), buf); //hack, we have to + } + else { //no existing <w> attribute fond + buf = ""; + buf.appendFormatted("<w morph=\"%s:%s\">", ((cls.length())?cls.c_str():"robinson"), morph.c_str()); + text.insert(wordStart - text.c_str(), buf); + text += "</w>"; + lastspace = false; + + } + handled = true; + } + + if (!keepToken) { + if (!handled) { + SWLog::getSystemLog()->logError("Unprocessed Token: <%s> in key %s", token, key ? (const char*)*key : "<unknown>"); +// exit(-1); + } + if (from[1] && strchr(" ,;.:?!()'\"", from[1])) { + if (lastspace) { + text--; + } + } + if (newText) { + textStart = from+1; + newText = false; + } + continue; + } + + // if not a strongs token, keep token in text + text.appendFormatted("<%s>", token); + + if (newText) { + textStart = text.c_str() + text.length(); + newWord = false; + } + continue; + } + if (intoken) { + if ((tokpos < 2045) && ((*from != 10)&&(*from != 13))) { + token[tokpos++] = *from; + token[tokpos+2] = 0; + } + } + else { + switch (*from) { + case '\'': + case '\"': + case '`': +// quoteStack.handleQuote(fromStart, from, &to); + text += *from; + //from++; //this line removes chars after an apostrophe! Needs fixing. + break; + default: + if (newWord && (*from != ' ')) { + wordStart = text.c_str() + text.length(); + newWord = false; + + //fix this if required? + //memset(to, 0, 10); + + } + + if (!suspendTextPassThru) { + text += (*from); + lastspace = (*from == ' '); + } + } + } + } + + VerseKey *vkey = SWDYNAMIC_CAST(VerseKey, key); + if (vkey) { + SWBuf ref = ""; + if (vkey->getVerse()) { + ref.appendFormatted("\t\t<verse osisID=\"%s\">", vkey->getOSISRef()); + } + + if (ref.length() > 0) { + + text = ref + text; + + if (vkey->getVerse()) { + VerseKey *tmp = (VerseKey *)vkey->clone(); + *tmp = *vkey; + tmp->setAutoNormalize(false); + tmp->setIntros(true); + + text += "</verse>"; + + *tmp = MAXVERSE; + if (*vkey == *tmp) { + tmp->setVerse(0); +// sprintf(ref, "\t</div>"); +// pushString(&to, ref); + *tmp = MAXCHAPTER; + *tmp = MAXVERSE; + if (*vkey == *tmp) { + tmp->setChapter(0); + tmp->setVerse(0); +// sprintf(ref, "\t</div>"); +// pushString(&to, ref); +/* + if (!quoteStack.empty()) { + SWLog::getSystemLog()->logError("popping unclosed quote at end of book"); + quoteStack.clear(); + } +*/ + } + } + delete tmp; + } +// else if (vkey->getChapter()) { +// sprintf(ref, "\t<div type=\"chapter\" osisID=\"%s\">", vkey->getOSISRef()); +// } +// else sprintf(ref, "\t<div type=\"book\" osisID=\"%s\">", vkey->getOSISRef()); + } + } + return 0; +} + + +SWORD_NAMESPACE_END |