summaryrefslogtreecommitdiff
path: root/src/frontend/im/hebrewmcim.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/frontend/im/hebrewmcim.cpp')
-rw-r--r--src/frontend/im/hebrewmcim.cpp653
1 files changed, 653 insertions, 0 deletions
diff --git a/src/frontend/im/hebrewmcim.cpp b/src/frontend/im/hebrewmcim.cpp
new file mode 100644
index 0000000..9ec55a9
--- /dev/null
+++ b/src/frontend/im/hebrewmcim.cpp
@@ -0,0 +1,653 @@
+
+/**
+ * Title: Keyboard mapping for Michigan-Claremont Hebrew input
+ * Description:
+ * Copyright: Copyright (c) 2001 CrossWire Bible Society under the terms of the GNU GPL
+ * Company:
+ * @author Troy A. Griffitts
+ * @version 1.0
+ */
+
+#include <hebrewmcim.h>
+
+/**
+ * Default constructor: constructs the SWInputMethod base and then fills the
+ * key tables by calling init().
+ */
+HebrewMCIM::HebrewMCIM() : SWInputMethod()
+{
+    this->init();
+}
+
+
+/**
+ * Translates one Michigan-Claremont keystroke into Unicode code points.
+ *
+ * The input-method state (getState()/setState()) carries the first key of a
+ * two-key sequence between calls: a state of 2..11 selects the subst2[]
+ * table that resolves the second key.
+ *
+ * @param in the character that was typed
+ * @return pointer to a static buffer of code points (overwritten by the next
+ *         call; unused trailing entries are 0), or 0 when the key only
+ *         started a two-key sequence and nothing is to be emitted yet
+ */
+int *HebrewMCIM::translate(char in) {
+    static int retString[5];
+    int retStringIndex = 0;
+    int retVal = 0;
+
+    // memset counts bytes, not elements: clear the whole buffer so that code
+    // points left over from a previous call cannot leak into this result.
+    memset(retString, 0, sizeof(retString));
+
+    if (getState() > 1) {
+        // Second key of a two-key sequence.
+        if (getState() >= 12) {    // serious issue with internal structure
+            setState(0);
+            retString[retStringIndex++] = in;
+            return retString;
+        }
+        map<int, int>::iterator find = subst2[getState()].find(in);
+        if (find != subst2[getState()].end())
+            retVal = find->second;
+        else retVal = in;          // unknown second key: pass it through as typed
+
+        setState(0);
+        retString[retStringIndex++] = retVal;
+        return retString;
+    }
+    else {
+        // plain char may be signed, so bytes >= 0x80 would index subst with a
+        // negative value. Look up only keys inside the 255 entries that
+        // init() sets up; everything else is treated as unmapped.
+        const int key = static_cast<unsigned char>(in);
+        retVal = (key < 255) ? subst[key] : 0;
+
+        if (retVal == 0) {         // unmapped key: emit it unchanged
+            setState(0);
+            retString[retStringIndex++] = in;
+            return retString;
+        }
+        if (retVal > 100) {        // direct mapping to a single code point
+            setState(1);
+            retString[retStringIndex++] = retVal;
+            return retString;
+        }
+        if (retVal == 50) {        // multiChar: one key yields two code points
+            setState(1);
+            int *chars = multiChars[in];
+            if (chars != 0) {
+                retString[retStringIndex++] = chars[0];
+                retString[retStringIndex++] = chars[1];
+                return retString;
+            }
+        }
+    }
+    // First key of a two-key sequence: remember which table to use and emit
+    // nothing until the second key arrives.
+    setState(retVal);
+    return 0;
+}
+
+
+/**
+ * Fills the Michigan-Claremont key tables.
+ *
+ * subst[key] holds one of:
+ *   - 0        : key is not mapped
+ *   - 2..11    : key starts a two-key sequence; the value is the index of the
+ *                subst2[] table that maps the second key to a code point
+ *   - 50       : key produces two code points, listed in multiChars[key]
+ *   - > 100    : the Unicode code point (decimal) for the key
+ */
+void HebrewMCIM::init() {
+    // Clear entry by entry. memset(subst, 0, 255) zeroed only 255 *bytes*,
+    // but the entries hold values such as 1488 and are therefore wider than
+    // one byte, so most of the table was left uninitialised.
+    for (int i = 0; i < 255; ++i)
+        subst[i] = 0;
+
+    // CONSONANTS
+    subst[')'] = 1488;
+    subst['B'] = 1489;
+    subst['G'] = 1490;
+    subst['D'] = 1491;
+    subst['H'] = 1492;
+    subst['W'] = 1493;
+    subst['Z'] = 1494;
+    subst['X'] = 1495;
+    subst['+'] = 1496;
+    subst['Y'] = 1497;
+
+    subst['k'] = 1498;    // finals
+    subst['m'] = 1501;
+    subst['n'] = 1503;
+    subst['c'] = 1509;
+
+    subst['P'] = 1508;
+    subst['K'] = 1499;
+    subst['L'] = 1500;
+    subst['M'] = 1502;
+    subst['N'] = 1504;
+    subst['S'] = 1505;
+    subst['('] = 1506;
+    subst['p'] = 1507;
+    subst['C'] = 1510;
+    subst['Q'] = 1511;
+    subst['R'] = 1512;
+    subst['#'] = 1513;
+
+    // special multiChars: shin (1513) followed by its sin/shin dot
+    subst['&'] = 50;
+    subst['$'] = 50;
+
+    static int x[] = {1513, 1474};
+    multiChars['&'] = x;
+    static int y[] = {1513, 1473};
+    multiChars['$'] = y;
+
+    subst['T'] = 1514;
+
+    // VOWELS
+    subst['A'] = 1463;
+    subst['F'] = 1464;
+    subst['E'] = 1462;
+    subst['"'] = 1461;
+    subst['I'] = 1460;
+    subst['O'] = 1465;
+    subst['U'] = 1467;
+
+    // OTHER DIACRITICS
+    subst['.'] = 1468;
+    subst['-'] = 1470;
+    subst[','] = 1471;
+
+    // Compound input: the first key selects a subst2[] table, the second key
+    // is looked up in it.
+
+    // Hataf vowels (':' followed by a vowel key).
+    // NOTE(review): ':' followed by any other key does not produce a sheva
+    // (1456); translate() just passes the second key through. Confirm
+    // whether that is intended.
+    subst[':'] = 2;
+    subst2[2]['A'] = 1458;
+    subst2[2]['E'] = 1457;
+    subst2[2]['F'] = 1459;
+
+    // CANTILLATION
+
+    /* Telisha qetana is postpositive, as in '04'. However, Michigan code
+     * '24' is for a medial telisha. Graphically, there is no difference.
+     */
+    subst['2'] = 5;
+    subst2[5]['4'] = 1449;
+
+    /* Note Michigan encoding distinguishes between medial metheg '35'
+     * (occurring on the left of the vowel), and the ordinary meteg '95'
+     * (occurring on the right of the vowel). It is also used for silluq.
+     */
+    subst['3'] = 6;
+    subst2[6]['3'] = 1433;
+    subst2[6]['5'] = 1469;
+
+    /* The Michigan code of telisha gedola in medial position. Graphically,
+     * there is no difference.
+     */
+    subst['4'] = 7;
+    subst2[7]['4'] = 1440;
+
+    subst['6'] = 8;
+    subst2[8]['0'] = 1451;
+    subst2[8]['1'] = 1436;
+
+    subst['1'] = 4;
+    subst2[4]['0'] = 1434;
+
+    /* In the poetic books, prepositive dehi occurs; it's unclear whether
+     * tipeha also occurs in the poetic books. Otherwise, we could simply
+     * check for what book in the Tanach we are in. Michigan uses the same
+     * code for each.
+     */
+    subst2[4]['3'] = 1430;
+
+    /* This is the poetic accent mugrash, which also includes rebia, but is
+     * encoded separately as '81' in the Michigan text.
+     */
+    subst2[4]['1'] = 1437;
+    subst2[4]['4'] = 1440;
+
+    subst['0'] = 3;
+    subst2[3]['0'] = 1475;
+    subst2[3]['1'] = 1426;
+
+    /* According to BHS, zarqa and sinnor are both postpositive. However,
+     * the Michigan encoding uses one code for both. The Unicode zarqa
+     * (0x0598) is definitely NOT postpositive. And further, the shape of
+     * the symbol is different in BHS and Unicode. This needs further
+     * research to determine what's going on here. For now, we follow BHS
+     * and use the postpositive Unicode zinor for both accents.
+     */
+    subst2[3]['2'] = 1454;
+
+    /* Pashta is postpositive, and the Unicode equivalent reflects
+     * this. However, there is a poetic equivalent -- azla legarmeh --
+     * which is not postpositive, but no equivalent code point exists in
+     * Unicode. The Michigan encoding does not distinguish between the two,
+     * although it could be algorithmically determined.
+     */
+    subst2[3]['3'] = 1433;
+    subst2[3]['4'] = 1449;
+    subst2[3]['5'] = 1472;
+
+    /* This is the Unicode Hebrew *accent*; there is also another Hebrew
+     * *punctuation* called GERSHAYIM 0x05F4. I'm using the more
+     * traditional rounded marks, rather than the alternate straight
+     * marks.
+     */
+    subst2[8]['2'] = 1438;
+
+    // Also known as azla
+    subst2[8]['3'] = 1448;
+    subst2[8]['4'] = 1452;
+    subst2[8]['5'] = 1427;
+
+    subst['8'] = 9;
+    subst2[9]['0'] = 1428;
+    subst2[9]['1'] = 1431;
+
+    /* Note, this accent is actually sinnorit, but it does not exist as a
+     * separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
+     * is postpositive, while sinnorit is not. ZARQA is as close as I can
+     * get to this.
+     */
+    subst2[9]['2'] = 1432;
+
+    /* The Unicode form does not match the form used by BHS, but the names
+     * are the same.
+     */
+    subst2[9]['3'] = 1441;
+    subst2[9]['4'] = 1439;
+    subst2[9]['5'] = 1429;
+
+    subst['7'] = 10;
+    subst2[10]['0'] = 1444;
+    subst2[10]['1'] = 1445;
+    subst2[10]['2'] = 1446;
+    subst2[10]['3'] = 1430;    // also '13', '73' also is used for majela
+    subst2[10]['4'] = 1443;
+    subst2[10]['5'] = 1469;    // this is silluq; should appear to the left of the vowel
+
+    subst['9'] = 11;
+    subst2[11]['1'] = 1435;
+    subst2[11]['2'] = 1425;
+    subst2[11]['3'] = 1450;
+    subst2[11]['4'] = 1447;
+    subst2[11]['5'] = 1469;    // should appear to the right of the vowel
+
+}
+
+ /*
+
+
+# CANTILLATION MARKS
+
+ my $ETNAHTA = '&#1425;';
+# officially the Unicode name for this symbol was "SEGOL." However, that is
+# not a unique name, conflicting with the vowel of the same name. Further,
+# the position of the symbol is different. I have changed the name of the
+# accent to "SEGOLTA," the traditional name for this accent.
+ my $SEGOLTA = '&#1426;';
+ my $SHALSHELET = '&#1427;';
+ my $ZAQEF_QATAN = '&#1428;';
+ my $ZAQEF_GADOL = '&#1429;';
+ my $TIPEHA = '&#1430;';
+ my $REVIA = '&#1431;';
+ my $ZARQA = '&#1432;';
+ my $PASHTA = '&#1433;';
+ my $YETIV = '&#1434;';
+ my $TEVIR = '&#1435;';
+ my $GERESH = '&#1436;';
+ my $GERESH_MUQDAM = '&#1437;';
+ my $GERSHAYIM = '&#1438;';
+ my $QARNEY_PARA = '&#1439;';
+ my $TELISHA_GEDOLA = '&#1440;';
+ my $PAZER = '&#1441;';
+ my $MUNAH = '&#1443;';
+ my $MAHAPAKH = '&#1444;';
+ my $MERKHA = '&#1445;';
+ my $MERKHA_KEFULA = '&#1446;';
+ my $DARGA = '&#1447;';
+ my $QADMA = '&#1448;';
+ my $TELISHA_QETANA = '&#1449;';
+ my $YERAH_BEN_YOMO = '&#1450;';
+ my $OLE = '&#1451;';
+ my $ILUY = '&#1452;';
+ my $DEHI = '&#1453;';
+ my $ZINOR = '&#1454;';
+# HEBREW MARK
+ my $MASORA_CIRCLE = '&#1455;';
+# HEBREW EXTENDED-A points and punctuation
+ my $SHEVA = '&#1456;';
+ my $HATAF_SEGOL = '&#1457;';
+ my $HATAF_PATAH = '&#1458;';
+ my $HATAF_QAMATS = '&#1459;';
+ my $HIRIQ = '&#1460;';
+ my $TSERE = '&#1461;';
+ my $SEGOL = '&#1462;';
+# furtive Patah is not a distinct character
+ my $PATAH = '&#1463;';
+ my $QAMATS = '&#1464;';
+ my $HOLAM = '&#1465;';
+ my $QUBUTS = '&#1467;';
+# also used as shuruq
+# falls within the base letter
+ my $DAGESH_OR_MAPIQ = '&#1468;';
+# also used as siluq
+ my $METAG = '&#1469;';
+ my $MAQAF = '&#1470;';
+ my $RAFE = '&#1471;';
+# Also used for legarmeh
+# may be treated as spacing punctuation, not as a point
+ my $PASEQ = '&#1472;';
+ my $SHIN_DOT = '&#1473;';
+ my $SIN_DOT = '&#1474;';
+ my $SOF_PASUQ = '&#1475;';
+# HEBREW MARK
+ my $UPPER_DOT = '&#1476;';
+# HEBREW LETTERS based on ISO 8859-8
+# aleph
+# x (alef symbol - 2135)
+ my $ALEF = '&#1488;';
+# x (bet symbol - 2136)
+ my $BET = '&#1489;';
+# x (gimel symbol - 2137)
+ my $GIMEL = '&#1490;';
+# x (dalet symbol - 2138)
+ my $DALET = '&#1491;';
+ my $HE = '&#1492;';
+ my $VAV = '&#1493;';
+ my $ZAYIN = '&#1494;';
+ my $HET = '&#1495;';
+ my $TET = '&#1496;';
+ my $YOD = '&#1497;';
+ my $FINAL_KAF = '&#1498;';
+ my $KAF = '&#1499;';
+ my $LAMED = '&#1500;';
+ my $FINAL_MEM = '&#1501;';
+ my $MEM = '&#1502;';
+ my $FINAL_NUN = '&#1503;';
+ my $NUN = '&#1504;';
+ my $SAMEKH = '&#1505;';
+ my $AYIN = '&#1506;';
+ my $FINAL_PE = '&#1507;';
+ my $PE = '&#1508;';
+ my $FINAL_TSADI = '&#1509;';
+# also known as zade
+ my $TSADI = '&#1510;';
+ my $QOF = '&#1511;';
+ my $RESH = '&#1512;';
+ my $SHIN = '&#1513;';
+ my $TAV = '&#1514;';
+# Yiddish digraphs
+# Hebrew Ligature
+# tsvey vovn
+ my $DOUBLE_VAV = '&#1520;';
+ my $VAV_YOD = '&#1521;';
+# tsvey yudn
+ my $DOUBLE_YOD = '&#1522;';
+
+# Additional punctuation
+ my $PUNCT_GERESH = '&#1523;';
+ my $PUNCT_GERSHAYIM = '&#1524;';
+# Reserved: 0x05F5"
+# x (hebrew point judeo-spanish varika - FB1E)
+#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E
+
+#############################
+# End of Unicode 2.0 Hebrew #
+#############################
+
+# A hash whose key is a Michigan code, and whose value is a Unicode
+# equivalent
+
+ char subst[] = new char [255];
+ subst[')'] = 1488;
+ 'B' => $BET,
+ 'G' => $GIMEL,
+ 'D' => $DALET,
+ 'H' => $HE,
+ 'W' => $VAV,
+ 'Z' => $ZAYIN,
+ 'X' => $HET,
+ '+' => $TET,
+ 'Y' => $YOD,
+ 'K' => $KAF,
+ 'L' => $LAMED,
+ 'M' => $MEM,
+ 'N' => $NUN,
+ 'S' => $SAMEKH,
+ '(' => $AYIN,
+ 'P' => $PE,
+ 'C' => $TSADI,
+ 'Q' => $QOF,
+ 'R' => $RESH,
+ '#' => $SHIN, # the letter shin without a point
+ '&' => ($SHIN . $SIN_DOT),
+ '$' => ($SHIN . $SHIN_DOT), # '
+ 'T' => $TAV,
+# VOWELS
+ 'A' => $PATAH,
+ 'F' => $QAMATS,
+ 'E' => $SEGOL,
+ '"' => $TSERE,
+ 'I' => $HIRIQ,
+ 'O' => $HOLAM,
+ 'U' => $QUBUTS,
+ ':' => $SHEVA,
+ ':A' => $HATAF_PATAH,
+ ':E' => $HATAF_SEGOL,
+ ':F' => $HATAF_QAMATS,
+# OTHER DIACRITICS
+ '.' => $DAGESH_OR_MAPIQ,
+ '-' => $MAQAF,
+ ',' => $RAFE,
+# CANTILLATION
+ '00' => $SOF_PASUQ,
+ '01' => $SEGOLTA,
+# According to BHS, zarqa and sinnor are both postpositive. However,
+# the Michigan encoding uses one code for both. The Unicode zarqa
+# (0x0598) is definitely NOT postpositive. And further, the shape of
+# the symbol is different in BHS and Unicode. This needs further
+# research to determine what's going on here. For now, we follow BHS
+# and use the postpositive Unicode zinor for both accents.
+ '02' => $ZINOR,
+# Pashta is postpositive, and the Unicode equivalent reflects
+# this. However, there is a poetic equivalent -- azla legarmeh --
+# which is not postpositive, but no equivalent code point exists in
+# Unicode. The Michigan encoding does not distinguish between the two,
+# although it could be algorithmically determined.
+ '03' => $PASHTA,
+ '04' => $TELISHA_QETANA,
+ '05' => $PASEQ,
+ '10' => $YETIV,
+# In the poetic books, prepositive dehi occurs; it's unclear whether
+# tipeha also occurs in the poetic books. Otherwise, we could simply
+# check for what book in the Tanach we are in. Michigan uses the same
+# code for each.
+ '13' => $TIPEHA, # also $DEHI
+# This is the poetic accent mugrash, which also includes rebia, but is
+# encoded separately as '81' in the Michigan text.
+ '11' => $GERESH_MUQDAM,
+ '14' => $TELISHA_GEDOLA,
+# Telisha qetana is postpositive as in '04' above. However, Michigan
+# code '24' is for a medial telisha. Graphically, there is no
+# difference.
+ '24' => $TELISHA_QETANA,
+ '33' => $PASHTA,
+# The Michigan code of telisha gedola in medial position. Graphically,
+# there is no difference.
+ '44' => $TELISHA_GEDOLA,
+ '60' => $OLE,
+ '61' => $GERESH,
+# This is the Unicode Hebrew *accent*; there is also another Hebrew
+# *punctuation* called GERSHAYIM 0x05F4. I'm using the more
+# traditional rounded marks, rather than the alternate straight
+# marks.
+ '62' => $GERSHAYIM,
+# Also known as azla
+ '63' => $QADMA,
+ '64' => $ILUY,
+ '65' => $SHALSHELET,
+ '80' => $ZAQEF_QATAN,
+ '81' => $REVIA,
+# Note, this accent is actually sinnorit, but it does not exist as a
+# separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
+# is postpositive, while sinnorit is not. ZARQA is as close as I can
+# get to this.
+ '82' => $ZARQA,
+# The Unicode form does not match the form used by BHS, but the names
+# are the same.
+ '83' => $PAZER,
+ '84' => $QARNEY_PARA,
+ '85' => $ZAQEF_GADOL,
+# Note Michigan encoding distinguishes between medial metheg '35' (occurring
+# on the left of the vowel), and the ordinary meteg '95' (occurring on the
+# right of the vowel). It is also used for silluq.
+ '35' => $METAG,
+ '70' => $MAHAPAKH,
+ '71' => $MERKHA,
+ '72' => $MERKHA_KEFULA,
+ '73' => $TIPEHA, # also '13', '73' also is used for majela
+ '74' => $MUNAH,
+ '75' => $METAG, # this is silluq; should appear to the left of the vowel
+ '91' => $TEVIR,
+ '92' => $ETNAHTA,
+ '93' => $YERAH_BEN_YOMO,
+ '94' => $DARGA,
+ '95' => $METAG, # should appear to the right of the vowel
+
+# Not used by the Michigan Encoding
+# $UPPER_DOT = '05C4';
+ );
+
+# declare other variables
+ my (@bhsLines,
+ @bhsVerse,
+ @entity_line) = ();
+
+ my ($i,
+ $verse,
+ $word,
+ $character) = 0;
+
+ my ($element,
+ $saveGuttural) = "";
+
+# read in a line
+ while (<>) {
+# Process one verse
+# iterate over every character and change to XML decimal entity
+ CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {
+ # find and convert final kaf, mem, nun, pe, tsade
+ ( # if final form
+ $bhsVerse[$i] =~ /[KMNPC]/
+ )
+ &&
+ (
+ ( # whitespace or
+ $bhsVerse[$i+1] =~ /[ \-?]/
+ )
+ ||
+ ( # EOL or
+ $i == ( scalar(@bhsVerse) - 1 )
+ )
+ ||
+ ( # sof pasuq or
+ ( $bhsVerse[$i+1] =~ /0/ ) &&
+ ( $bhsVerse[$i+2] =~ /0/ )
+ )
+ ||
+ ( # one accent followed by white, eol or
+ (
+ ( $bhsVerse[$i+1] =~ /\d/ ) &&
+ ( $bhsVerse[$i+2] =~ /\d/ )
+ ) &&
+ (
+ ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||
+ ( $i == ( scalar(@bhsVerse) - 1 ) )
+ )
+ )
+ ||
+ ( # two accents followed by white, eol
+ (
+ ( $bhsVerse[$i+1] =~ /\d/ ) &&
+ ( $bhsVerse[$i+2] =~ /\d/ ) &&
+ ( $bhsVerse[$i+3] =~ /\d/ ) &&
+ ( $bhsVerse[$i+4] =~ /\d/ )
+ ) &&
+ (
+ ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||
+ ( $i == ( scalar(@bhsVerse) - 1 ) )
+ )
+ )
+ ||
+ ( # followed by a vowel and white, eol, sof pasuq
+ ( $bhsVerse[$i+1] =~ /[:F]/ ) &&
+ ( # followed by
+ ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or
+ ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or
+ ( # sof pasuq
+ ( $bhsVerse[$i+2] =~ /0/ ) &&
+ ( $bhsVerse[$i+3] =~ /0/ )
+ )
+ )
+ )
+ ) # end of what follows after final letter
+ &&
+ do {
+ $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }
+ && next CHAR;
+ $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }
+ && next CHAR;
+ $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }
+ && next CHAR;
+ $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }
+ && next CHAR;
+ $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }
+ && next CHAR;
+ };
+ # find and convert "furtive patach"
+ ( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach
+ ( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceeded by a guttural
+ ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceeded by a vowel
+ ( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq
+ ( $bhsVerse[$i-3] =~ /W/ ) ) || #
+ ( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene)
+ ( $bhsVerse[$i-3] =~ /O/ ) ) || #
+ ( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod
+ ( $bhsVerse[$i-3] =~ /I/ ) ) ) &&
+ do {
+ $saveGuttural = pop @entity_line; # snip off the gutteral
+ push @entity_line,$PATAH; # push on the patach
+ push @entity_line,$saveGuttural; # push back on the gutteral
+ next CHAR;
+ };
+ # convert cantillation
+ # since we have previously dealt with all other cases of
+ # numbers, two digit patterns are all we have to search for
+ $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {
+ push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
+ $i++; # accents are two digits long, so advance past the 2nd digit
+ next CHAR;
+ };
+ # convert katef vowels, which are two characters long
+ $bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {
+ push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
+ $i++;
+ next CHAR;
+ };
+ # convert everything else
+ push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};
+ } # end CHAR
+# print the line to standard output with XML character-level encoding
+# each character has the following format:
+# <c id="1kg1.verse#.word#.character#">&#1234;</c>
+
+# set up the verse element
+ $word = 1;
+ $character = 1;
+ print "<verse>\n<word>\n";
+# print each character element
+# if there is a space, then close the word entity, open a new word
+# entity, increment the word number, reset the character number to
+# zero.
+ foreach $element (@entity_line) {
+ if ( $element =~ " " ) {
+ $word++;
+ $character = 1;
+ print "</word>\n<word>\n";
+ next;
+ }
+ print "<c id=\"1kg1.$verse.$word.$character\">$element</c>\n";
+ $character++;
+ }
+# close the verse element
+ print "</word></verse>\n";
+# reinitialize variables
+ @bhsVerse = ();
+ @entity_line = ();
+ @bhsLines = ();
+ } # end while
+# close the XML document
+ print "</body>\n";
+ */