summaryrefslogtreecommitdiff
path: root/src/frontend/im/hebrewmcim.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/frontend/im/hebrewmcim.cpp')
-rw-r--r--src/frontend/im/hebrewmcim.cpp653
1 files changed, 0 insertions, 653 deletions
diff --git a/src/frontend/im/hebrewmcim.cpp b/src/frontend/im/hebrewmcim.cpp
deleted file mode 100644
index 9ec55a9..0000000
--- a/src/frontend/im/hebrewmcim.cpp
+++ /dev/null
@@ -1,653 +0,0 @@
-
-/**
- * Title: Keyboard mapping for Michigan-Claremont Hebrew input
- * Description:
- * Copyright: Copyright (c) 2001 CrossWire Bible Society under the terms of the GNU GPL
- * Company:
- * @author Troy A. Griffitts
- * @version 1.0
- */
-
-#include <hebrewmcim.h>
-
-// Default constructor: builds the base input method, then loads the key tables.
-HebrewMCIM::HebrewMCIM() : SWInputMethod()
-{
-    init();
-}
-
-
-/**
- * Translates one keystroke into Unicode code points.
- * @param in the key that was typed
- * @return 0 if the key only began a two-key sequence; otherwise a static,
- *         zero-terminated array that the next call overwrites
- */
-int *HebrewMCIM::translate(char in) {
-    static int retString[5];
-    int retStringIndex = 0;
-    // Clear every element; a byte count of 5 left stale code points behind.
-    memset(retString, 0, sizeof(retString));
-    const int state = getState();
-    if (state > 1) {    // second key of a two-key sequence
-        int retVal = in;    // unknown second keys pass through unchanged
-        if (state < 12) {   // state >= 12: serious issue with internal structure
-            map<int, int>::iterator hit = subst2[state].find(in);
-            if (hit != subst2[state].end())
-                retVal = hit->second;
-        }
-        setState(0);
-        retString[retStringIndex++] = retVal;
-        return retString;
-    }
-
-    // Only ASCII keys are mapped; never index subst with a negative char.
-    const int retVal = (static_cast<unsigned char>(in) < 128) ? subst[in] : 0;
-    if (retVal == 0) {  // unmapped key: emit it as typed
-        setState(0);
-        retString[retStringIndex++] = in;
-        return retString;
-    }
-    if (retVal > 100) { // the table entry is the code point to emit
-        setState(1);
-        retString[retStringIndex++] = retVal;
-        return retString;
-    }
-    if (retVal == 50) { // multiChar: one key yields two code points
-        setState(1);
-        int *chars = multiChars[in];
-        if (chars != 0) {
-            retString[retStringIndex++] = chars[0];
-            retString[retStringIndex++] = chars[1];
-            return retString;
-        }
-    }
-    setState(retVal);   // remember which subst2 table the next key uses
-    return 0;
-}
-
-
-/**
- * Fills the key tables. subst maps a key to a code point (> 100), to the
- * state (2..11) that starts a two-key sequence, or to 50 for multiChars;
- * subst2[state] maps the second key of such a sequence.
- */
-void HebrewMCIM::init() {
-    // A literal 255 is a byte count, not an element count; clear it all.
-    memset(subst, 0, sizeof(subst));
-
-    subst[')'] = 1488;
-    subst['B'] = 1489;
-    subst['G'] = 1490;
-    subst['D'] = 1491;
-    subst['H'] = 1492;
-    subst['W'] = 1493;
-    subst['Z'] = 1494;
-    subst['X'] = 1495;
-    subst['+'] = 1496;
-    subst['Y'] = 1497;
-
-    subst['k'] = 1498; // finals
-    subst['m'] = 1501;
-    subst['n'] = 1503;
-    subst['c'] = 1509;
-
-    subst['P'] = 1508;
-    subst['K'] = 1499;
-    subst['L'] = 1500;
-    subst['M'] = 1502;
-    subst['N'] = 1504;
-    subst['S'] = 1505;
-    subst['('] = 1506;
-    subst['p'] = 1507;
-    subst['C'] = 1510;
-    subst['Q'] = 1511;
-    subst['R'] = 1512;
-    subst['#'] = 1513;
-
-    // special multiChars: one key that yields two code points
-    subst['&'] = 50;
-    subst['$'] = 50;
-
-    static int x[] = {1513, 1474};
-    multiChars['&'] = x;
-    static int y[] = {1513, 1473};
-    multiChars['$'] = y;
-
-    subst['T'] = 1514;
-
-    // VOWELS
-    subst['A'] = 1463;
-    subst['F'] = 1464;
-    subst['E'] = 1462;
-    subst['"'] = 1461;
-    subst['I'] = 1460;
-    subst['O'] = 1465;
-    subst['U'] = 1467;
-
-    // OTHER DIACRITICS
-    subst['.'] = 1468;
-    subst['-'] = 1470;
-    subst[','] = 1471;
-
-    // Compound input
-
-    // Hataf vowels and CANTILLATION
-
-    subst[':'] = 2;
-    subst2[2]['A'] = 1458;
-    subst2[2]['E'] = 1457;
-    subst2[2]['F'] = 1459;
-
-    /* Telisha qetana is postpositive as in '04'. However, Michigan
-     * code '24' is for a medial telisha. Graphically, there is no
-     * difference.
-     */
-    subst['2'] = 5;
-    subst2[5]['4'] = 1449;
-
-    /* Note Michigan encoding distinguishes between medial metheg '35' (occurring
-     * on the left of the vowel), and the ordinary meteg '95' (occurring on the
-     * right of the vowel). It is also used for silluq.
-     */
-    subst['3'] = 6;
-    subst2[6]['3'] = 1433;
-    subst2[6]['5'] = 1469;
-
-    /* The Michigan code of telisha gedola in medial position. Graphically,
-     * there is no difference.
-     */
-    subst['4'] = 7;
-    subst2[7]['4'] = 1440;
-
-    subst['6'] = 8;
-    subst2[8]['0'] = 1451;
-    subst2[8]['1'] = 1436;
-
-    subst['1'] = 4;
-    subst2[4]['0'] = 1434;
-
-    /* In the poetic books, prepositive dehi occurs; it's unclear whether
-     * tipeha also occurs in the poetic books. Otherwise, we could simply
-     * check for what book in the Tanach we are in. Michigan uses the same
-     * code for each.
-     */
-
-    subst2[4]['3'] = 1430;
-
-    /* This is the poetic accent mugrash, which also includes rebia, but is
-     * encoded separately as '81' in the Michigan text.
-     */
-    subst2[4]['1'] = 1437;
-    subst2[4]['4'] = 1440;
-
-    subst['0'] = 3;
-    subst2[3]['0'] = 1475;
-    subst2[3]['1'] = 1426;
-
-    /* According to BHS, zarqa and sinnor are both postpositive. However,
-     * the Michigan encoding uses one code for both. The Unicode zarqa
-     * (0x0598) is definitely NOT postpositive. And further, the shape of
-     * the symbol is different in BHS and Unicode. This needs further
-     * research to determine what's going on here. For now, we follow BHS
-     * and use the postpositive Unicode zinor for both accents.
-     */
-
-    subst2[3]['2'] = 1454;
-
-    /* Pashta is postpositive, and the Unicode equivalent reflects
-     * this. However, there is a poetic equivalent -- azla legarmeh --
-     * which is not postpositive, but no equivalent code point exists in
-     * Unicode. The Michigan encoding does not distinguish between the two,
-     * although it could be algorithmically determined.
-     */
-
-    subst2[3]['3'] = 1433;
-    subst2[3]['4'] = 1449;
-    subst2[3]['5'] = 1472;
-
-
-    /* This is the Unicode Hebrew *accent*; there is also another Hebrew
-     * *punctuation* called GERSHAYIM 0x05F4. I'm using the more
-     * traditional rounded marks, rather than the alternate straight
-     * marks.
-     */
-
-    subst2[8]['2'] = 1438;
-
-    // Also known as azla
-    subst2[8]['3'] = 1448;
-    subst2[8]['4'] = 1452;
-    subst2[8]['5'] = 1427;
-
-
-    subst['8'] = 9;
-    subst2[9]['0'] = 1428;
-    subst2[9]['1'] = 1431;
-
-    /* Note, this accent is actually sinnorit, but it does not exist as a
-     * separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
-     * is postpositive, while sinnorit is not. ZARQA is as close as I can
-     * get to this.
-     */
-    subst2[9]['2'] = 1432;
-
-    /* The Unicode form does not match the form used by BHS, but the names
-     * are the same.
-     */
-    subst2[9]['3'] = 1441;
-    subst2[9]['4'] = 1439;
-    subst2[9]['5'] = 1429;
-
-    subst['7'] = 10;
-    subst2[10]['0'] = 1444;
-    subst2[10]['1'] = 1445;
-    subst2[10]['2'] = 1446;
-    subst2[10]['3'] = 1430; // also '13', '73' also is used for majela
-    subst2[10]['4'] = 1443;
-    subst2[10]['5'] = 1469; // this is silluq; should appear to the left of the vowel
-
-    subst['9'] = 11;
-    subst2[11]['1'] = 1435;
-    subst2[11]['2'] = 1425;
-    subst2[11]['3'] = 1450;
-    subst2[11]['4'] = 1447;
-    subst2[11]['5'] = 1469; // should appear to the right of the vowel
-
-}
-
- /*
-
-
-# CANTILLATION MARKS
-
- my $ETNAHTA = '&#1425;';
-# officially the Unicode name for this symbol was "SEGOL." However, that is
-# not a unique name, conflicting with the vowel of the same name. Further,
-# the position of the symbol is different. I have changed the name of the
-# accent to "SEGOLTA," the traditional name for this accent.
- my $SEGOLTA = '&#1426;';
- my $SHALSHELET = '&#1427;';
- my $ZAQEF_QATAN = '&#1428;';
- my $ZAQEF_GADOL = '&#1429;';
- my $TIPEHA = '&#1430;';
- my $REVIA = '&#1431;';
- my $ZARQA = '&#1432;';
- my $PASHTA = '&#1433;';
- my $YETIV = '&#1434;';
- my $TEVIR = '&#1435;';
- my $GERESH = '&#1436;';
- my $GERESH_MUQDAM = '&#1437;';
- my $GERSHAYIM = '&#1438;';
- my $QARNEY_PARA = '&#1439;';
- my $TELISHA_GEDOLA = '&#1440;';
- my $PAZER = '&#1441;';
- my $MUNAH = '&#1443;';
- my $MAHAPAKH = '&#1444;';
- my $MERKHA = '&#1445;';
- my $MERKHA_KEFULA = '&#1446;';
- my $DARGA = '&#1447;';
- my $QADMA = '&#1448;';
- my $TELISHA_QETANA = '&#1449;';
- my $YERAH_BEN_YOMO = '&#1450;';
- my $OLE = '&#1451;';
- my $ILUY = '&#1452;';
- my $DEHI = '&#1453;';
- my $ZINOR = '&#1454;';
-# HEBREW MARK
- my $MASORA_CIRCLE = '&#1455;';
-# HEBREW EXTENDED-A points and punctuation
- my $SHEVA = '&#1456;';
- my $HATAF_SEGOL = '&#1457;';
- my $HATAF_PATAH = '&#1458;';
- my $HATAF_QAMATS = '&#1459;';
- my $HIRIQ = '&#1460;';
- my $TSERE = '&#1461;';
- my $SEGOL = '&#1462;';
-# furtive Patah is not a distinct character
- my $PATAH = '&#1463;';
- my $QAMATS = '&#1464;';
- my $HOLAM = '&#1465;';
- my $QUBUTS = '&#1467;';
-# also used as shuruq
-# falls within the base letter
- my $DAGESH_OR_MAPIQ = '&#1468;';
-# also used as siluq
- my $METAG = '&#1469;';
- my $MAQAF = '&#1470;';
- my $RAFE = '&#1471;';
-# Also used for legarmeh
-# may be treated as spacing punctuation, not as a point
- my $PASEQ = '&#1472;';
- my $SHIN_DOT = '&#1473;';
- my $SIN_DOT = '&#1474;';
- my $SOF_PASUQ = '&#1475;';
-# HEBREW MARK
- my $UPPER_DOT = '&#1476;';
-# HEBREW LETTERS based on ISO 8859-8
-# aleph
-# x (alef symbol - 2135)
- my $ALEF = '&#1488;';
-# x (bet symbol - 2136)
- my $BET = '&#1489;';
-# x (gimel symbol - 2137)
- my $GIMEL = '&#1490;';
-# x (dalet symbol - 2138)
- my $DALET = '&#1491;';
- my $HE = '&#1492;';
- my $VAV = '&#1493;';
- my $ZAYIN = '&#1494;';
- my $HET = '&#1495;';
- my $TET = '&#1496;';
- my $YOD = '&#1497;';
- my $FINAL_KAF = '&#1498;';
- my $KAF = '&#1499;';
- my $LAMED = '&#1500;';
- my $FINAL_MEM = '&#1501;';
- my $MEM = '&#1502;';
- my $FINAL_NUN = '&#1503;';
- my $NUN = '&#1504;';
- my $SAMEKH = '&#1505;';
- my $AYIN = '&#1506;';
- my $FINAL_PE = '&#1507;';
- my $PE = '&#1508;';
- my $FINAL_TSADI = '&#1509;';
-# also known as zade
- my $TSADI = '&#1510;';
- my $QOF = '&#1511;';
- my $RESH = '&#1512;';
- my $SHIN = '&#1513;';
- my $TAV = '&#1514;';
-# Yiddish digraphs
-# Hebrew Ligature
-# tsvey vovn
- my $DOUBLE_VAV = '&#1520;';
- my $VAV_YOD = '&#1521;';
-# tsvey yudn
- my $DOUBLE_YOD = '&#1522;';
-
-# Additional punctuation
- my $PUNCT_GERESH = '&#1523;';
- my $PUNCT_GERSHAYIM = '&#1524;';
-# Reserved: 0x05F5"
-# x (hebrew point judeo-spanish varika - FB1E)
-#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 OxFB1E
-
-#############################
-# End of Unicode 2.0 Hebrew #
-#############################
-
-# A hash whose key is a Michigan code, and whose value is a Unicode
-# equivalent
-
- char subst[] = new char [255];
- subst[')'] = 1488;
- 'B' => $BET,
- 'G' => $GIMEL,
- 'D' => $DALET,
- 'H' => $HE,
- 'W' => $VAV,
- 'Z' => $ZAYIN,
- 'X' => $HET,
- '+' => $TET,
- 'Y' => $YOD,
- 'K' => $KAF,
- 'L' => $LAMED,
- 'M' => $MEM,
- 'N' => $NUN,
- 'S' => $SAMEKH,
- '(' => $AYIN,
- 'P' => $PE,
- 'C' => $TSADI,
- 'Q' => $QOF,
- 'R' => $RESH,
- '#' => $SHIN, # the letter shin without a point
- '&' => ($SHIN . $SIN_DOT),
- '$' => ($SHIN . $SHIN_DOT), # '
- 'T' => $TAV,
-# VOWELS
- 'A' => $PATAH,
- 'F' => $QAMATS,
- 'E' => $SEGOL,
- '"' => $TSERE,
- 'I' => $HIRIQ,
- 'O' => $HOLAM,
- 'U' => $QUBUTS,
- ':' => $SHEVA,
- ':A' => $HATAF_PATAH,
- ':E' => $HATAF_SEGOL,
- ':F' => $HATAF_QAMATS,
-# OTHER DIACRITICS
- '.' => $DAGESH_OR_MAPIQ,
- '-' => $MAQAF,
- ',' => $RAFE,
-# CANTILLATION
- '00' => $SOF_PASUQ,
- '01' => $SEGOLTA,
-# According to BHS, zarqa and sinnor are both postpositive. However,
-# the Michigan encoding uses one code for both. The Unicode zarqa
-# (0x0598) is definitely NOT postpositive. And further, the shape of
-# the symbol is different in BHS and Unicode. This needs further
-# research to determine what's going on here. For now, we follow BHS
-# and use the postpositive Unicode zinor for both accents.
- '02' => $ZINOR,
-# Pashta is postpositive, and the Unicode equivalent reflects
-# this. However, there is a poetic equivalent -- azla legarmeh --
-# which is not postpositive, but no equivalent code point exists in
-# Unicode. The Michigan encoding does not distinguish between the two,
-# although it could be algorithmically determined.
- '03' => $PASHTA,
- '04' => $TELISHA_QETANA,
- '05' => $PASEQ,
- '10' => $YETIV,
-# In the poetic books, prepositive dehi occurs; it's unclear whether
-# tipeha also occurs in the poetic books. Otherwise, we could simply
-# check for what book in the Tanach we are in. Michigan uses the same
-# code for each.
- '13' => $TIPEHA, # also $DEHI
-# This is the poetic accent mugrash, which also includes rebia, but is
-# encoded separately as '81' in the Michigan text.
- '11' => $GERESH_MUQDAM,
- '14' => $TELISHA_GEDOLA,
-# Telisha qetana is postpositive as in '04' above. However, Michigan
-# code '24' is for a medial telisha. Graphically, there is no
-# difference.
- '24' => $TELISHA_QETANA,
- '33' => $PASHTA,
-# The Michigan code of telisha gedola in medial position. Graphically,
-# there is no difference.
- '44' => $TELISHA_GEDOLA,
- '60' => $OLE,
- '61' => $GERESH,
-# This is the Unicode Hebrew *accent*; there is also another Hebrew
-# *punctuation* called GERSHAYIM 0x05F4. I'm using the more
-# traditional rounded marks, rather than the alternate straight
-# marks.
- '62' => $GERSHAYIM,
-# Also known as azla
- '63' => $QADMA,
- '64' => $ILUY,
- '65' => $SHALSHELET,
- '80' => $ZAQEF_QATAN,
- '81' => $REVIA,
-# Note, this accent is actually sinnorit, but it does not exist as a
-# separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
-# is postpositive, while sinnorit is not. ZARQA is as close as I can
-# get to this.
- '82' => $ZARQA,
-# The Unicode form does not match the form used by BHS, but the names
-# are the same.
- '83' => $PAZER,
- '84' => $QARNEY_PARA,
- '85' => $ZAQEF_GADOL,
-# Note Michigan encoding distinguishes between medial metheg '35' (occurring
-# on the left of the vowel), and the ordinary meteg '95' (occurring on the
-# right of the vowel). It is also used for silluq.
- '35' => $METAG,
- '70' => $MAHAPAKH,
- '71' => $MERKHA,
- '72' => $MERKHA_KEFULA,
- '73' => $TIPEHA, # also '13', '73' also is used for majela
- '74' => $MUNAH,
- '75' => $METAG, # this is silluq; should appear to the left of the vowel
- '91' => $TEVIR,
- '92' => $ETNAHTA,
- '93' => $YERAH_BEN_YOMO,
- '94' => $DARGA,
- '95' => $METAG, # should appear to the right of the vowel
-
-# Not used by the Michigan Encoding
-# $UPPER_DOT = '05C4';
- );
-
-# declare other variables
- my (@bhsLines,
- @bhsVerse,
- @entity_line) = ();
-
- my ($i,
- $verse,
- $word,
- $character) = 0;
-
- my ($element,
- $saveGuttural) = "";
-
-# read in a line
- while (<>) {
-# Process one verse
-# iterate over every character and change to XML decimal entity
- CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {
- # find and convert final kaf, mem, nun, pe, tsade
- ( # if final form
- $bhsVerse[$i] =~ /[KMNPC]/
- )
- &&
- (
- ( # whitespace or
- $bhsVerse[$i+1] =~ /[ \-?]/
- )
- ||
- ( # EOL or
- $i == ( scalar(@bhsVerse) - 1 )
- )
- ||
- ( # sof pasuq or
- ( $bhsVerse[$i+1] =~ /0/ ) &&
- ( $bhsVerse[$i+2] =~ /0/ )
- )
- ||
- ( # one accent followed by white, eol or
- (
- ( $bhsVerse[$i+1] =~ /\d/ ) &&
- ( $bhsVerse[$i+2] =~ /\d/ )
- ) &&
- (
- ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||
- ( $i == ( scalar(@bhsVerse) - 1 ) )
- )
- )
- ||
- ( # two accents followed by white, eol
- (
- ( $bhsVerse[$i+1] =~ /\d/ ) &&
- ( $bhsVerse[$i+2] =~ /\d/ ) &&
- ( $bhsVerse[$i+3] =~ /\d/ ) &&
- ( $bhsVerse[$i+4] =~ /\d/ )
- ) &&
- (
- ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||
- ( $i == ( scalar(@bhsVerse) - 1 ) )
- )
- )
- ||
- ( # followed by a vowel and white, eol, sof pasuq
- ( $bhsVerse[$i+1] =~ /[:F]/ ) &&
- ( # followed by
- ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or
- ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or
- ( # sof pasuq
- ( $bhsVerse[$i+2] =~ /0/ ) &&
- ( $bhsVerse[$i+3] =~ /0/ )
- )
- )
- )
- ) # end of what follows after final letter
- &&
- do {
- $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }
- && next CHAR;
- $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }
- && next CHAR;
- $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }
- && next CHAR;
- $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }
- && next CHAR;
- $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }
- && next CHAR;
- };
- # find and convert "furtive patach"
- ( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach
- ( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceeded by a guttural
- ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceeded by a vowel
- ( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq
- ( $bhsVerse[$i-3] =~ /W/ ) ) || #
- ( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene)
- ( $bhsVerse[$i-3] =~ /O/ ) ) || #
- ( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod
- ( $bhsVerse[$i-3] =~ /I/ ) ) ) &&
- do {
- $saveGuttural = pop @entity_line; # snip off the gutteral
- push @entity_line,$PATAH; # push on the patach
- push @entity_line,$saveGuttural; # push back on the gutteral
- next CHAR;
- };
- # convert cantillation
- # since we have previously dealt with all other cases of
- # numbers, two digit patterns are all we have to search for
- $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {
- push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
- $i++; # accents are two digits long, so advance past the 2nd digit
- next CHAR;
- };
- # convert katef vowels, which are two characters long
- $bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {
- push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
- $i++;
- next CHAR;
- };
- # convert everything else
- push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};
- } # end CHAR
-# print the line to standard output with XML character-level encoding
-# each character has the following format:
-# <c id="1kg1.verse#.word#.character#">&#1234;</c>
-
-# set up the verse element
- $word = 1;
- $character = 1;
- print "<verse>\n<word>\n";
-# print each character element
-# if there is a space, then close the word entity, open a new word
-# entity, increment the word number, reset the character number to
-# zero.
- foreach $element (@entity_line) {
- if ( $element =~ " " ) {
- $word++;
- $character = 1;
- print "</word>\n<word>\n";
- next;
- }
- print "<c id=\"1kg1.$verse.$word.$character\">$element</c>\n";
- $character++;
- }
-# close the verse element
- print "</word></verse>\n";
-# reinitialize variables
- @bhsVerse = ();
- @entity_line = ();
- @bhsLines = ();
- } # end while
-# close the XML document
- print "</body>\n";
- */