diff options
Diffstat (limited to 'morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java')
-rw-r--r-- | morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java | 361 |
1 files changed, 361 insertions, 0 deletions
diff --git a/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java new file mode 100644 index 0000000..37cd0cc --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java @@ -0,0 +1,361 @@ +package morfologik.tools; + +import morfologik.stemming.EncoderType; + +import com.carrotsearch.hppc.ByteArrayList; + +/** + * Container class for sequence encoders. + */ +public final class SequenceEncoders { + private SequenceEncoders() {} + + /** + * Maximum encodable single-byte code. + */ + private static final int REMOVE_EVERYTHING = 255; + + public static interface IEncoder { + public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer); + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer); + public EncoderType type(); + } + + /** + * Encodes <code>dst</code> relative to <code>src</code> by trimming + * whatever non-equal suffix <code>src</code> has. The output code is (bytes): + * <pre> + * {K}{suffix} + * </pre> + * where (<code>K</code> - 'A') bytes should be trimmed from the end of <code>src</code> + * and then the <code>suffix</code> should be appended to the resulting byte sequence. + * + * <p>Examples:</p> + * <pre> + * src: foo + * dst: foobar + * encoded: Abar + * + * src: foo + * dst: bar + * encoded: Dbar + * </pre> + * + * <p><strong>Note:</strong> The code length is a single byte. If equal to + * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence + * should be discarded.</p> + */ + public static class TrimSuffixEncoder implements IEncoder { + public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { + int sharedPrefix = sharedPrefixLength(src, dst); + int truncateBytes = src.size() - sharedPrefix; + if (truncateBytes >= REMOVE_EVERYTHING) { + truncateBytes = REMOVE_EVERYTHING; + sharedPrefix = 0; + } + + final byte suffixTrimCode = (byte) (truncateBytes + 'A'); + encoded.add(suffixTrimCode); + encoded.add(dst.buffer, sharedPrefix, dst.size() - sharedPrefix); + + return encoded; + } + + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { + int suffixTrimCode = encoded.get(0); + int truncateBytes = (suffixTrimCode - 'A') & 0xFF; + if (truncateBytes == REMOVE_EVERYTHING) { + truncateBytes = src.size(); + } + + dst.add(src.buffer, 0, src.size() - truncateBytes); + dst.add(encoded.buffer, 1, encoded.size() - 1); + + return dst; + } + + @Override + public EncoderType type() { + return EncoderType.SUFFIX; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * Encodes <code>dst</code> relative to <code>src</code> by trimming + * whatever non-equal suffix and prefix <code>src</code> and <code>dst</code> have. + * The output code is (bytes): + * <pre> + * {P}{K}{suffix} + * </pre> + * where (<code>P</code> - 'A') bytes should be trimmed from the start of <code>src</code>, + * (<code>K</code> - 'A') bytes should be trimmed from the end of <code>src</code> + * and then the <code>suffix</code> should be appended to the resulting byte sequence. + * + * <p>Examples:</p> + * <pre> + * src: abc + * dst: abcd + * encoded: AAd + * + * src: abc + * dst: xyz + * encoded: ADxyz + * </pre> + * + * <p><strong>Note:</strong> Each code's length is a single byte. If any is equal to + * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence + * should be discarded.</p> + */ + public static class TrimPrefixAndSuffixEncoder implements IEncoder { + public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { + // Search for the maximum matching subsequence that can be encoded. + int maxSubsequenceLength = 0; + int maxSubsequenceIndex = 0; + for (int i = 0; i < src.size(); i++) { + // prefix at i => shared subsequence (infix) + int sharedPrefix = sharedPrefixLength(src, i, dst, 0); + // Only update maxSubsequenceLength if we will be able to encode it. + if (sharedPrefix > maxSubsequenceLength + && i < REMOVE_EVERYTHING + && (src.size() - (i + sharedPrefix)) < REMOVE_EVERYTHING) { + maxSubsequenceLength = sharedPrefix; + maxSubsequenceIndex = i; + } + } + + // Determine how much to remove (and where) from src to get a prefix of dst. + int truncatePrefixBytes = maxSubsequenceIndex; + int truncateSuffixBytes = (src.size() - (maxSubsequenceIndex + maxSubsequenceLength)); + if (truncatePrefixBytes >= REMOVE_EVERYTHING || + truncateSuffixBytes >= REMOVE_EVERYTHING) { + maxSubsequenceIndex = maxSubsequenceLength = 0; + truncatePrefixBytes = truncateSuffixBytes = REMOVE_EVERYTHING; + } + + encoded.add((byte) ((truncatePrefixBytes + 'A') & 0xFF)); + encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF)); + encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); + + return encoded; + } + + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { + int truncatePrefixBytes = (encoded.get(0) - 'A') & 0xFF; + int truncateSuffixBytes = (encoded.get(1) - 'A') & 0xFF; + + if (truncatePrefixBytes == REMOVE_EVERYTHING || + truncateSuffixBytes == REMOVE_EVERYTHING) { + truncatePrefixBytes = src.size(); + truncateSuffixBytes = 0; + } + + dst.add(src.buffer, truncatePrefixBytes, src.size() - (truncateSuffixBytes + truncatePrefixBytes)); + dst.add(encoded.buffer, 2, encoded.size() - 2); + + return dst; + } + + @Override + public EncoderType type() { + return EncoderType.PREFIX; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * Encodes <code>dst</code> relative to <code>src</code> by trimming + * whatever non-equal suffix and infix <code>src</code> and <code>dst</code> have. + * The output code is (bytes): + * <pre> + * {X}{L}{K}{suffix} + * </pre> + * where <code>src's</code> infix at position (<code>X</code> - 'A') and of length + * (<code>L</code> - 'A') should be removed, then (<code>K</code> - 'A') bytes + * should be trimmed from the end + * and then the <code>suffix</code> should be appended to the resulting byte sequence. + * + * <p>Examples:</p> + * <pre> + * src: ayz + * dst: abc + * encoded: AACbc + * + * src: aillent + * dst: aller + * encoded: BBCr + * </pre> + * + * <p><strong>Note:</strong> Each code's length is a single byte. If any is equal to + * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence + * should be discarded.</p> + */ + public static class TrimInfixAndSuffixEncoder implements IEncoder { + ByteArrayList scratch = new ByteArrayList(); + + public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { + // Search for the infix that can we can encode and remove from src + // to get a maximum-length prefix of dst. This could be done more efficiently + // by running a smarter longest-common-subsequence algorithm and some pruning (?). + // + // For now, naive loop should do. + + // There can be only two positions for the infix to delete: + // 1) we remove leading bytes, even if they are partially matching (but a longer match + // exists somewhere later on). + // 2) we leave max. matching prefix and remove non-matching bytes that follow. + int maxInfixIndex = 0; + int maxSubsequenceLength = sharedPrefixLength(src, dst); + int maxInfixLength = 0; + for (int i : new int [] {0, maxSubsequenceLength}) { + for (int j = 1; j <= src.size() - i; j++) { + // Compute temporary src with the infix removed. + // Concatenate in scratch space for simplicity. + scratch.clear(); + scratch.add(src.buffer, 0, i); + scratch.add(src.buffer, i + j, src.size() - (i + j)); + + int sharedPrefix = sharedPrefixLength(scratch, dst); + + // Only update maxSubsequenceLength if we will be able to encode it. + if (sharedPrefix > 0 && + sharedPrefix > maxSubsequenceLength && + i < REMOVE_EVERYTHING && + j < REMOVE_EVERYTHING) { + maxSubsequenceLength = sharedPrefix; + maxInfixIndex = i; + maxInfixLength = j; + } + } + } + + int truncateSuffixBytes = src.size() - (maxInfixLength + maxSubsequenceLength); + + // Special case: if we're removing the suffix in the infix code, move it + // to the suffix code instead. + if (truncateSuffixBytes == 0 && + maxInfixIndex + maxInfixLength == src.size()) { + truncateSuffixBytes = maxInfixLength; + maxInfixIndex = maxInfixLength = 0; + } + + + if (maxInfixIndex >= REMOVE_EVERYTHING || + maxInfixLength >= REMOVE_EVERYTHING || + truncateSuffixBytes >= REMOVE_EVERYTHING) { + maxInfixIndex = maxSubsequenceLength = 0; + maxInfixLength = truncateSuffixBytes = REMOVE_EVERYTHING; + } + + encoded.add((byte) ((maxInfixIndex + 'A') & 0xFF)); + encoded.add((byte) ((maxInfixLength + 'A') & 0xFF)); + encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF)); + encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); + + return encoded; + } + + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { + int infixIndex = (encoded.get(0) - 'A') & 0xFF; + int infixLength = (encoded.get(1) - 'A') & 0xFF; + int truncateSuffixBytes = (encoded.get(2) - 'A') & 0xFF; + + if (infixLength == REMOVE_EVERYTHING || + truncateSuffixBytes == REMOVE_EVERYTHING) { + infixIndex = 0; + infixLength = src.size(); + truncateSuffixBytes = 0; + } + + dst.add(src.buffer, 0, infixIndex); + dst.add(src.buffer, infixIndex + infixLength, src.size() - (infixIndex + infixLength + truncateSuffixBytes)); + dst.add(encoded.buffer, 3, encoded.size() - 3); + + return dst; + } + + @Override + public EncoderType type() { + return EncoderType.INFIX; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * + */ + public static class CopyEncoder implements IEncoder { + @Override + public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer) + { + encodedBuffer.add(derived.buffer, 0, derived.size()); + return encodedBuffer; + } + + @Override + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer) + { + derivedBuffer.add(encoded.buffer, 0, encoded.size()); + return derivedBuffer; + } + + @Override + public EncoderType type() { + return EncoderType.NONE; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * Compute the length of the shared prefix between two byte sequences. + */ + private static int sharedPrefixLength(ByteArrayList a, ByteArrayList b) { + final int max = Math.min(a.size(), b.size()); + int i = 0; + while (i < max && a.get(i) == b.get(i)) { + i++; + } + return i; + } + + /** + * Compute the length of the shared prefix between two byte sequences. + */ + private static int sharedPrefixLength(ByteArrayList a, int aStart, ByteArrayList b, int bStart) { + + int i = 0; + while (aStart < a.size() && + bStart < b.size() && + a.get(aStart++) == b.get(bStart++)) { + i++; + } + return i; + } + + public static IEncoder forType(EncoderType encType) + { + switch (encType) { + case INFIX: return new TrimInfixAndSuffixEncoder(); + case PREFIX: return new TrimPrefixAndSuffixEncoder(); + case SUFFIX: return new TrimSuffixEncoder(); + case NONE: return new CopyEncoder(); + } + throw new RuntimeException("Unknown encoder: " + encType); + } +} |