summaryrefslogtreecommitdiff
path: root/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java')
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java361
1 files changed, 361 insertions, 0 deletions
diff --git a/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java
new file mode 100644
index 0000000..37cd0cc
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java
@@ -0,0 +1,361 @@
+package morfologik.tools;
+
+import morfologik.stemming.EncoderType;
+
+import com.carrotsearch.hppc.ByteArrayList;
+
+/**
+ * Container class for sequence encoders.
+ */
+public final class SequenceEncoders {
+ private SequenceEncoders() {}
+
+ /**
+ * Maximum encodable single-byte code.
+ */
+ private static final int REMOVE_EVERYTHING = 255;
+
+ public static interface IEncoder {
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer);
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer);
+ public EncoderType type();
+ }
+
+ /**
+ * Encodes <code>dst</code> relative to <code>src</code> by trimming
+ * whatever non-equal suffix <code>src</code> has. The output code is (bytes):
+ * <pre>
+ * {K}{suffix}
+ * </pre>
+ * where (<code>K</code> - 'A') bytes should be trimmed from the end of <code>src</code>
+ * and then the <code>suffix</code> should be appended to the resulting byte sequence.
+ *
+ * <p>Examples:</p>
+ * <pre>
+ * src: foo
+ * dst: foobar
+ * encoded: Abar
+ *
+ * src: foo
+ * dst: bar
+ * encoded: Dbar
+ * </pre>
+ *
+ * <p><strong>Note:</strong> The code length is a single byte. If equal to
+ * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence
+ * should be discarded.</p>
+ */
+ public static class TrimSuffixEncoder implements IEncoder {
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) {
+ int sharedPrefix = sharedPrefixLength(src, dst);
+ int truncateBytes = src.size() - sharedPrefix;
+ if (truncateBytes >= REMOVE_EVERYTHING) {
+ truncateBytes = REMOVE_EVERYTHING;
+ sharedPrefix = 0;
+ }
+
+ final byte suffixTrimCode = (byte) (truncateBytes + 'A');
+ encoded.add(suffixTrimCode);
+ encoded.add(dst.buffer, sharedPrefix, dst.size() - sharedPrefix);
+
+ return encoded;
+ }
+
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) {
+ int suffixTrimCode = encoded.get(0);
+ int truncateBytes = (suffixTrimCode - 'A') & 0xFF;
+ if (truncateBytes == REMOVE_EVERYTHING) {
+ truncateBytes = src.size();
+ }
+
+ dst.add(src.buffer, 0, src.size() - truncateBytes);
+ dst.add(encoded.buffer, 1, encoded.size() - 1);
+
+ return dst;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.SUFFIX;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+ /**
+ * Encodes <code>dst</code> relative to <code>src</code> by trimming
+ * whatever non-equal suffix and prefix <code>src</code> and <code>dst</code> have.
+ * The output code is (bytes):
+ * <pre>
+ * {P}{K}{suffix}
+ * </pre>
+ * where (<code>P</code> - 'A') bytes should be trimmed from the start of <code>src</code>,
+ * (<code>K</code> - 'A') bytes should be trimmed from the end of <code>src</code>
+ * and then the <code>suffix</code> should be appended to the resulting byte sequence.
+ *
+ * <p>Examples:</p>
+ * <pre>
+ * src: abc
+ * dst: abcd
+ * encoded: AAd
+ *
+ * src: abc
+ * dst: xyz
+ * encoded: ADxyz
+ * </pre>
+ *
+ * <p><strong>Note:</strong> Each code's length is a single byte. If any is equal to
+ * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence
+ * should be discarded.</p>
+ */
+ public static class TrimPrefixAndSuffixEncoder implements IEncoder {
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) {
+ // Search for the maximum matching subsequence that can be encoded.
+ int maxSubsequenceLength = 0;
+ int maxSubsequenceIndex = 0;
+ for (int i = 0; i < src.size(); i++) {
+ // prefix at i => shared subsequence (infix)
+ int sharedPrefix = sharedPrefixLength(src, i, dst, 0);
+ // Only update maxSubsequenceLength if we will be able to encode it.
+ if (sharedPrefix > maxSubsequenceLength
+ && i < REMOVE_EVERYTHING
+ && (src.size() - (i + sharedPrefix)) < REMOVE_EVERYTHING) {
+ maxSubsequenceLength = sharedPrefix;
+ maxSubsequenceIndex = i;
+ }
+ }
+
+ // Determine how much to remove (and where) from src to get a prefix of dst.
+ int truncatePrefixBytes = maxSubsequenceIndex;
+ int truncateSuffixBytes = (src.size() - (maxSubsequenceIndex + maxSubsequenceLength));
+ if (truncatePrefixBytes >= REMOVE_EVERYTHING ||
+ truncateSuffixBytes >= REMOVE_EVERYTHING) {
+ maxSubsequenceIndex = maxSubsequenceLength = 0;
+ truncatePrefixBytes = truncateSuffixBytes = REMOVE_EVERYTHING;
+ }
+
+ encoded.add((byte) ((truncatePrefixBytes + 'A') & 0xFF));
+ encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF));
+ encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength);
+
+ return encoded;
+ }
+
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) {
+ int truncatePrefixBytes = (encoded.get(0) - 'A') & 0xFF;
+ int truncateSuffixBytes = (encoded.get(1) - 'A') & 0xFF;
+
+ if (truncatePrefixBytes == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) {
+ truncatePrefixBytes = src.size();
+ truncateSuffixBytes = 0;
+ }
+
+ dst.add(src.buffer, truncatePrefixBytes, src.size() - (truncateSuffixBytes + truncatePrefixBytes));
+ dst.add(encoded.buffer, 2, encoded.size() - 2);
+
+ return dst;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.PREFIX;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+ /**
+ * Encodes <code>dst</code> relative to <code>src</code> by trimming
+ * whatever non-equal suffix and infix <code>src</code> and <code>dst</code> have.
+ * The output code is (bytes):
+ * <pre>
+ * {X}{L}{K}{suffix}
+ * </pre>
+ * where <code>src's</code> infix at position (<code>X</code> - 'A') and of length
+ * (<code>L</code> - 'A') should be removed, then (<code>K</code> - 'A') bytes
+ * should be trimmed from the end
+ * and then the <code>suffix</code> should be appended to the resulting byte sequence.
+ *
+ * <p>Examples:</p>
+ * <pre>
+ * src: ayz
+ * dst: abc
+ * encoded: AACbc
+ *
+ * src: aillent
+ * dst: aller
+ * encoded: BBCr
+ * </pre>
+ *
+ * <p><strong>Note:</strong> Each code's length is a single byte. If any is equal to
+ * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence
+ * should be discarded.</p>
+ */
+ public static class TrimInfixAndSuffixEncoder implements IEncoder {
+ ByteArrayList scratch = new ByteArrayList();
+
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) {
+ // Search for the infix that can we can encode and remove from src
+ // to get a maximum-length prefix of dst. This could be done more efficiently
+ // by running a smarter longest-common-subsequence algorithm and some pruning (?).
+ //
+ // For now, naive loop should do.
+
+ // There can be only two positions for the infix to delete:
+ // 1) we remove leading bytes, even if they are partially matching (but a longer match
+ // exists somewhere later on).
+ // 2) we leave max. matching prefix and remove non-matching bytes that follow.
+ int maxInfixIndex = 0;
+ int maxSubsequenceLength = sharedPrefixLength(src, dst);
+ int maxInfixLength = 0;
+ for (int i : new int [] {0, maxSubsequenceLength}) {
+ for (int j = 1; j <= src.size() - i; j++) {
+ // Compute temporary src with the infix removed.
+ // Concatenate in scratch space for simplicity.
+ scratch.clear();
+ scratch.add(src.buffer, 0, i);
+ scratch.add(src.buffer, i + j, src.size() - (i + j));
+
+ int sharedPrefix = sharedPrefixLength(scratch, dst);
+
+ // Only update maxSubsequenceLength if we will be able to encode it.
+ if (sharedPrefix > 0 &&
+ sharedPrefix > maxSubsequenceLength &&
+ i < REMOVE_EVERYTHING &&
+ j < REMOVE_EVERYTHING) {
+ maxSubsequenceLength = sharedPrefix;
+ maxInfixIndex = i;
+ maxInfixLength = j;
+ }
+ }
+ }
+
+ int truncateSuffixBytes = src.size() - (maxInfixLength + maxSubsequenceLength);
+
+ // Special case: if we're removing the suffix in the infix code, move it
+ // to the suffix code instead.
+ if (truncateSuffixBytes == 0 &&
+ maxInfixIndex + maxInfixLength == src.size()) {
+ truncateSuffixBytes = maxInfixLength;
+ maxInfixIndex = maxInfixLength = 0;
+ }
+
+
+ if (maxInfixIndex >= REMOVE_EVERYTHING ||
+ maxInfixLength >= REMOVE_EVERYTHING ||
+ truncateSuffixBytes >= REMOVE_EVERYTHING) {
+ maxInfixIndex = maxSubsequenceLength = 0;
+ maxInfixLength = truncateSuffixBytes = REMOVE_EVERYTHING;
+ }
+
+ encoded.add((byte) ((maxInfixIndex + 'A') & 0xFF));
+ encoded.add((byte) ((maxInfixLength + 'A') & 0xFF));
+ encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF));
+ encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength);
+
+ return encoded;
+ }
+
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) {
+ int infixIndex = (encoded.get(0) - 'A') & 0xFF;
+ int infixLength = (encoded.get(1) - 'A') & 0xFF;
+ int truncateSuffixBytes = (encoded.get(2) - 'A') & 0xFF;
+
+ if (infixLength == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) {
+ infixIndex = 0;
+ infixLength = src.size();
+ truncateSuffixBytes = 0;
+ }
+
+ dst.add(src.buffer, 0, infixIndex);
+ dst.add(src.buffer, infixIndex + infixLength, src.size() - (infixIndex + infixLength + truncateSuffixBytes));
+ dst.add(encoded.buffer, 3, encoded.size() - 3);
+
+ return dst;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.INFIX;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+ /**
+ *
+ */
+ public static class CopyEncoder implements IEncoder {
+ @Override
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer)
+ {
+ encodedBuffer.add(derived.buffer, 0, derived.size());
+ return encodedBuffer;
+ }
+
+ @Override
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer)
+ {
+ derivedBuffer.add(encoded.buffer, 0, encoded.size());
+ return derivedBuffer;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.NONE;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+ /**
+ * Compute the length of the shared prefix between two byte sequences.
+ */
+ private static int sharedPrefixLength(ByteArrayList a, ByteArrayList b) {
+ final int max = Math.min(a.size(), b.size());
+ int i = 0;
+ while (i < max && a.get(i) == b.get(i)) {
+ i++;
+ }
+ return i;
+ }
+
+ /**
+ * Compute the length of the shared prefix between two byte sequences.
+ */
+ private static int sharedPrefixLength(ByteArrayList a, int aStart, ByteArrayList b, int bStart) {
+
+ int i = 0;
+ while (aStart < a.size() &&
+ bStart < b.size() &&
+ a.get(aStart++) == b.get(bStart++)) {
+ i++;
+ }
+ return i;
+ }
+
+ public static IEncoder forType(EncoderType encType)
+ {
+ switch (encType) {
+ case INFIX: return new TrimInfixAndSuffixEncoder();
+ case PREFIX: return new TrimPrefixAndSuffixEncoder();
+ case SUFFIX: return new TrimSuffixEncoder();
+ case NONE: return new CopyEncoder();
+ }
+ throw new RuntimeException("Unknown encoder: " + encType);
+ }
+}