summaryrefslogtreecommitdiff
path: root/src/morfologik/tools/MorphEncoder.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/morfologik/tools/MorphEncoder.java')
-rw-r--r--src/morfologik/tools/MorphEncoder.java399
1 files changed, 0 insertions, 399 deletions
diff --git a/src/morfologik/tools/MorphEncoder.java b/src/morfologik/tools/MorphEncoder.java
deleted file mode 100644
index 236c1aa..0000000
--- a/src/morfologik/tools/MorphEncoder.java
+++ /dev/null
@@ -1,399 +0,0 @@
-package morfologik.tools;
-
-import java.io.UnsupportedEncodingException;
-
-import morfologik.fsa.FSA5;
-
-/**
- * A class that converts tabular data to fsa morphological format. Three formats
- * are supported:
- * <ul>
- * <li><b>standard</b>, see {@link #standardEncode}</li>
- * <li><b>prefix</b>, see {@link #prefixEncode}</li>
- * <li><b>infix</b>, see {@link #infixEncode}</li>
- * </ul>
- */
-public final class MorphEncoder {
- private final byte annotationSeparator;
-
- private static final int MAX_PREFIX_LEN = 3;
- private static final int MAX_INFIX_LEN = 3;
-
- private static final String UTF8 = "UTF-8";
-
- public MorphEncoder() {
- this(FSA5.DEFAULT_ANNOTATION);
- }
-
- public MorphEncoder(byte annotationSeparator) {
- this.annotationSeparator = annotationSeparator;
- }
-
- public static int commonPrefix(final byte[] s1, final byte[] s2) {
- final int maxLen = Math.min(s1.length, s2.length);
- for (int i = 0; i < maxLen; i++) {
- if (s1[i] != s2[i]) {
- return i;
- }
- }
- return maxLen;
- }
-
- private static byte[] subsequence(final byte[] bytes, final int start) {
- final byte[] newArray = new byte[bytes.length - start];
- System.arraycopy(bytes, start, newArray, 0, bytes.length - start);
- return newArray;
- }
-
- private static int copyTo(byte[] dst, final int pos, final byte[] src) {
- System.arraycopy(src, 0, dst, pos, src.length);
- return src.length;
- }
-
- private static int copyTo(byte[] dst, final int pos, final byte src) {
- byte[] single = new byte[1];
- single[0] = src;
- System.arraycopy(single, 0, dst, pos, 1);
- return 1;
- }
-
- /**
- * This method converts the wordForm, wordLemma and tag to the form:
- *
- * <pre>
- * wordForm + Kending + tags
- * </pre>
- *
- * where '+' is a separator, K is a character that specifies how many
- * characters should be deleted from the end of the inflected form to
- * produce the lexeme by concatenating the stripped string with the ending.
- *
- */
- public byte[] standardEncode(final byte[] wordForm,
- final byte[] wordLemma, final byte[] wordTag) {
- final int l1 = wordForm.length;
- final int prefix = commonPrefix(wordForm, wordLemma);
- final int len = wordLemma.length - prefix;
- int pos = 0;
- // 3 = 2 separators and K character
- int arrayLen = l1 + len + 3;
- if (wordTag != null) { //wordTag may be empty for stemming
- arrayLen += wordTag.length;
- }
- final byte[] bytes = new byte[arrayLen];
- pos += copyTo(bytes, pos, wordForm);
- pos += copyTo(bytes, pos, annotationSeparator);
- if (prefix == 0) {
- pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff));
- pos += copyTo(bytes, pos, wordLemma);
- } else {
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
- pos += copyTo(bytes, pos, annotationSeparator);
- if (wordTag != null) {
- pos += copyTo(bytes, pos, wordTag);
- }
- return bytes;
- }
-
- /**
- * This method converts wordform, wordLemma and the tag to the form:
- * <p>
- *
- * <pre>
- * inflected_form + LKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, L is the number of characters to be deleted
- * from the beginning of the word ("A" means none, "B" means one, "C" - 2,
- * etc.), K is a character that specifies how many characters should be
- * deleted from the end of the inflected form to produce the lexeme by
- * concatenating the stripped string with the ending ("A" means none,
- * "B' - 1, "C" - 2, and so on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- */
- public byte[] prefixEncode(final byte[] wordForm,
- final byte[] wordLemma, final byte[] wordTag) {
- final int l1 = wordForm.length;
- final int prefix = commonPrefix(wordForm, wordLemma);
-
- // 4 = 2 separators + LK characters
- int arrayLen = l1 + wordLemma.length + 4;
- if (wordTag != null) {
- arrayLen += wordTag.length;
- }
- final byte[] bytes = new byte[arrayLen];
- int pos = 0;
- pos += copyTo(bytes, pos, wordForm);
- pos += copyTo(bytes, pos, annotationSeparator);
- if (prefix == 0) {
- int prefixFound = 0;
- int prefix1 = 0;
- final int max = Math.min(wordForm.length, MAX_PREFIX_LEN);
- for (int i = 1; i <= max; i++) {
- prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma);
- if (prefix1 > 2) {
- prefixFound = i;
- break;
- }
- }
- if (prefixFound == 0) {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff));
- pos += copyTo(bytes, pos, wordLemma);
- } else {
- pos += copyTo(bytes, pos, (byte) ((prefixFound + 65) & 0xff));
- pos += copyTo(bytes, pos,
- (byte) ((l1 - prefixFound - prefix1 + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1));
- }
- } else {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
- pos += copyTo(bytes, pos, annotationSeparator);
- if (wordTag != null) {
- pos += copyTo(bytes, pos, wordTag);
- }
- final byte[] finalArray = new byte[pos];
- System.arraycopy(bytes, 0, finalArray, 0, pos);
- return finalArray;
- }
-
- /**
- * This method converts wordform, wordLemma and the tag to the form:
- * <pre>
- * inflected_form + MLKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, M is the position of characters to be deleted
- * towards the beginning of the inflected form ("A" means from the
- * beginning, "B" from the second character, "C" - from the third one, and
- * so on), L is the number of characters to be deleted from the position
- * specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a
- * character that specifies how many characters should be deleted from the
- * end of the inflected form to produce the lexeme by concatenating the
- * stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so
- * on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- */
- public byte[] infixEncode(final byte[] wordForm,
- final byte[] wordLemma, final byte[] wordTag) {
- final int l1 = wordForm.length;
- int prefixFound = 0;
- int prefix1 = 0;
- final int prefix = commonPrefix(wordForm, wordLemma);
- final int max = Math.min(l1, MAX_INFIX_LEN);
-
- // 5 = 2 separators + MLK characters
- int arrayLen = l1 + wordLemma.length + 5;
- if (wordTag != null) {
- arrayLen += wordTag.length;
- }
- final byte[] bytes = new byte[arrayLen];
- int pos = 0;
- pos += copyTo(bytes, pos, wordForm);
- pos += copyTo(bytes, pos, annotationSeparator);
- if (prefix == 0) {
- // we may have a prefix
- for (int i = 1; i <= max; i++) {
- prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma);
- if (prefix1 > 2) {
- prefixFound = i;
- break;
- }
- }
- if (prefixFound == 0) {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff));
- pos += copyTo(bytes, pos, wordLemma);
- } else {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((prefixFound + 65) & 0xff));
- pos += copyTo(bytes, pos,
- (byte) ((l1 - prefixFound - prefix1 + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1));
- }
- } else { // prefix found but we have to check the infix
-
- for (int i = 1; i <= max; i++) {
- prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma);
- if (prefix1 > 2) {
- prefixFound = i;
- break;
- }
- }
- int prefix2 = 0;
- int infixFound = 0;
- final int max2 = Math.min(l1 - prefix, MAX_INFIX_LEN);
- for (int i = 1; i <= max2; i++) {
- prefix2 = commonPrefix(subsequence(wordForm, prefix + i),
- subsequence(wordLemma, prefix));
- if (prefix2 > 2) {
- infixFound = i;
- break;
- }
- }
-
- if (prefixFound > infixFound) {
- if (prefixFound > 0 && (prefix1 > prefix)) {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos,
- (byte) ((prefixFound + 65) & 0xff));
- pos += copyTo(bytes, pos, (byte) ((l1 - prefixFound
- - prefix1 + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1));
- } else {
- // infixFound == 0 && prefixFound == 0
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos,
- (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
- } else if (infixFound > 0 && prefix2 > 0) {
- // we have an infix, , and if there seems to be a prefix,
- // the infix is longer
- pos += copyTo(bytes, pos, (byte) ((prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, (byte) ((infixFound + 65) & 0xff));
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix - prefix2
- - infixFound + 65) & 0xff));
- pos += copyTo(bytes, pos,
- subsequence(wordLemma, prefix + prefix2));
- } else {
- // we have an infix, and if there seems to be a prefix,
- // the infix is longer
- // but the common prefix of two words is longer
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
-
- }
- pos += copyTo(bytes, pos, annotationSeparator);
- if (wordTag != null) {
- pos += copyTo(bytes, pos, wordTag);
- }
- final byte[] finalArray = new byte[pos];
- System.arraycopy(bytes, 0, finalArray, 0, pos);
- return finalArray;
- }
-
- /**
- * Converts a byte array to a given encoding.
- *
- * @param str
- * Byte-array to be converted.
- * @return Java String. If decoding is unsuccessful, the string is empty.
- */
- protected static String asString(final byte[] str, final String encoding) {
- try {
- return new String(str, encoding);
- } catch (UnsupportedEncodingException e) {
- return "";
- }
- }
-
- /**
- * A UTF-8 variant of {@link #standardEncode(byte[], byte[], byte[])} This
- * method converts the wordForm, wordLemma and tag to the form:
- *
- * <pre>
- * wordForm + Kending + tags
- * </pre>
- *
- * where '+' is a separator, K is a character that specifies how many
- * characters should be deleted from the end of the inflected form to
- * produce the lexeme by concatenating the stripped string with the ending.
- *
- * @throws UnsupportedEncodingException
- */
- public String standardEncodeUTF8(final String wordForm,
- final String wordLemma, final String wordTag)
- throws UnsupportedEncodingException {
- return asString(standardEncode(wordForm.getBytes(UTF8), wordLemma
- .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8);
- }
-
- /**
- * A UTF-8 variant of {@link #prefixEncode(byte[], byte[], byte[])} This
- * method converts wordform, wordLemma and the tag to the form:
- * <pre>
- * inflected_form + LKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, L is the number of characters to be deleted
- * from the beginning of the word ("A" means none, "B" means one, "C" - 2,
- * etc.), K is a character that specifies how many characters should be
- * deleted from the end of the inflected form to produce the lexeme by
- * concatenating the stripped string with the ending ("A" means none,
- * "B' - 1, "C" - 2, and so on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- * @throws UnsupportedEncodingException
- */
- public String prefixEncodeUTF8(final String wordForm,
- final String wordLemma, final String wordTag)
- throws UnsupportedEncodingException {
- return asString(prefixEncode(wordForm.getBytes(UTF8), wordLemma
- .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8);
- }
-
- /**
- * A UTF-8 variant of {@link #infixEncode(byte[], byte[], byte[])}.
- *
- * This method converts wordform, wordLemma and the tag to the form:
- * <pre>
- * inflected_form + MLKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, M is the position of characters to be deleted
- * towards the beginning of the inflected form ("A" means from the
- * beginning, "B" from the second character, "C" - from the third one, and
- * so on), L is the number of characters to be deleted from the position
- * specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a
- * character that specifies how many characters should be deleted from the
- * end of the inflected form to produce the lexeme by concatenating the
- * stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so
- * on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- * @throws UnsupportedEncodingException
- */
- public String infixEncodeUTF8(final String wordForm,
- final String wordLemma, final String wordTag)
- throws UnsupportedEncodingException {
- return asString(infixEncode(wordForm.getBytes(UTF8), wordLemma
- .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8);
- }
-}