package morfologik.tools; import morfologik.stemming.EncoderType; import com.carrotsearch.hppc.ByteArrayList; /** * Container class for sequence encoders. */ public final class SequenceEncoders { private SequenceEncoders() {} /** * Maximum encodable single-byte code. */ private static final int REMOVE_EVERYTHING = 255; public static interface IEncoder { public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer); public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer); public EncoderType type(); } /** * Encodes dst relative to src by trimming * whatever non-equal suffix src has. The output code is (bytes): *

     * {K}{suffix}
     *

* where (K - 'A') bytes should be trimmed from the end of src * and then the suffix should be appended to the resulting byte sequence. * *

Examples:

     * src: foo
     * dst: foobar
     * encoded: Abar
     * 
     * src: foo
     * dst: bar
     * encoded: Dbar
     *

* *

Note: The code length is a single byte. If equal to * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire src sequence * should be discarded.

*/ public static class TrimSuffixEncoder implements IEncoder { public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { int sharedPrefix = sharedPrefixLength(src, dst); int truncateBytes = src.size() - sharedPrefix; if (truncateBytes >= REMOVE_EVERYTHING) { truncateBytes = REMOVE_EVERYTHING; sharedPrefix = 0; } final byte suffixTrimCode = (byte) (truncateBytes + 'A'); encoded.add(suffixTrimCode); encoded.add(dst.buffer, sharedPrefix, dst.size() - sharedPrefix); return encoded; } public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { int suffixTrimCode = encoded.get(0); int truncateBytes = (suffixTrimCode - 'A') & 0xFF; if (truncateBytes == REMOVE_EVERYTHING) { truncateBytes = src.size(); } dst.add(src.buffer, 0, src.size() - truncateBytes); dst.add(encoded.buffer, 1, encoded.size() - 1); return dst; } @Override public EncoderType type() { return EncoderType.SUFFIX; } @Override public String toString() { return getClass().getSimpleName(); } } /** * Encodes dst relative to src by trimming * whatever non-equal suffix and prefix src and dst have. * The output code is (bytes): *

     * {P}{K}{suffix}
     *

* where (P - 'A') bytes should be trimmed from the start of src, * (K - 'A') bytes should be trimmed from the end of src * and then the suffix should be appended to the resulting byte sequence. * *

Examples:

     * src: abc
     * dst: abcd
     * encoded: AAd
     * 
     * src: abc
     * dst: xyz
     * encoded: ADxyz
     *

* *

Note: Each code's length is a single byte. If any is equal to * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire src sequence * should be discarded.

*/ public static class TrimPrefixAndSuffixEncoder implements IEncoder { public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { // Search for the maximum matching subsequence that can be encoded. int maxSubsequenceLength = 0; int maxSubsequenceIndex = 0; for (int i = 0; i < src.size(); i++) { // prefix at i => shared subsequence (infix) int sharedPrefix = sharedPrefixLength(src, i, dst, 0); // Only update maxSubsequenceLength if we will be able to encode it. if (sharedPrefix > maxSubsequenceLength && i < REMOVE_EVERYTHING && (src.size() - (i + sharedPrefix)) < REMOVE_EVERYTHING) { maxSubsequenceLength = sharedPrefix; maxSubsequenceIndex = i; } } // Determine how much to remove (and where) from src to get a prefix of dst. int truncatePrefixBytes = maxSubsequenceIndex; int truncateSuffixBytes = (src.size() - (maxSubsequenceIndex + maxSubsequenceLength)); if (truncatePrefixBytes >= REMOVE_EVERYTHING || truncateSuffixBytes >= REMOVE_EVERYTHING) { maxSubsequenceIndex = maxSubsequenceLength = 0; truncatePrefixBytes = truncateSuffixBytes = REMOVE_EVERYTHING; } encoded.add((byte) ((truncatePrefixBytes + 'A') & 0xFF)); encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF)); encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); return encoded; } public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { int truncatePrefixBytes = (encoded.get(0) - 'A') & 0xFF; int truncateSuffixBytes = (encoded.get(1) - 'A') & 0xFF; if (truncatePrefixBytes == REMOVE_EVERYTHING || truncateSuffixBytes == REMOVE_EVERYTHING) { truncatePrefixBytes = src.size(); truncateSuffixBytes = 0; } dst.add(src.buffer, truncatePrefixBytes, src.size() - (truncateSuffixBytes + truncatePrefixBytes)); dst.add(encoded.buffer, 2, encoded.size() - 2); return dst; } @Override public EncoderType type() { return EncoderType.PREFIX; } @Override public String toString() { return getClass().getSimpleName(); } } /** * Encodes dst relative to src by trimming * whatever non-equal suffix and infix src and dst have. * The output code is (bytes): *

     * {X}{L}{K}{suffix}
     *

* where src's infix at position (X - 'A') and of length * (L - 'A') should be removed, then (K - 'A') bytes * should be trimmed from the end * and then the suffix should be appended to the resulting byte sequence. * *

Examples:

     * src: ayz
     * dst: abc
     * encoded: AACbc
     * 
     * src: aillent
     * dst: aller
     * encoded: BBCr
     *

* *

Note: Each code's length is a single byte. If any is equal to * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire src sequence * should be discarded.

*/ public static class TrimInfixAndSuffixEncoder implements IEncoder { ByteArrayList scratch = new ByteArrayList(); public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { // Search for the infix that can we can encode and remove from src // to get a maximum-length prefix of dst. This could be done more efficiently // by running a smarter longest-common-subsequence algorithm and some pruning (?). // // For now, naive loop should do. // There can be only two positions for the infix to delete: // 1) we remove leading bytes, even if they are partially matching (but a longer match // exists somewhere later on). // 2) we leave max. matching prefix and remove non-matching bytes that follow. int maxInfixIndex = 0; int maxSubsequenceLength = sharedPrefixLength(src, dst); int maxInfixLength = 0; for (int i : new int [] {0, maxSubsequenceLength}) { for (int j = 1; j <= src.size() - i; j++) { // Compute temporary src with the infix removed. // Concatenate in scratch space for simplicity. scratch.clear(); scratch.add(src.buffer, 0, i); scratch.add(src.buffer, i + j, src.size() - (i + j)); int sharedPrefix = sharedPrefixLength(scratch, dst); // Only update maxSubsequenceLength if we will be able to encode it. if (sharedPrefix > 0 && sharedPrefix > maxSubsequenceLength && i < REMOVE_EVERYTHING && j < REMOVE_EVERYTHING) { maxSubsequenceLength = sharedPrefix; maxInfixIndex = i; maxInfixLength = j; } } } int truncateSuffixBytes = src.size() - (maxInfixLength + maxSubsequenceLength); // Special case: if we're removing the suffix in the infix code, move it // to the suffix code instead. if (truncateSuffixBytes == 0 && maxInfixIndex + maxInfixLength == src.size()) { truncateSuffixBytes = maxInfixLength; maxInfixIndex = maxInfixLength = 0; } if (maxInfixIndex >= REMOVE_EVERYTHING || maxInfixLength >= REMOVE_EVERYTHING || truncateSuffixBytes >= REMOVE_EVERYTHING) { maxInfixIndex = maxSubsequenceLength = 0; maxInfixLength = truncateSuffixBytes = REMOVE_EVERYTHING; } encoded.add((byte) ((maxInfixIndex + 'A') & 0xFF)); encoded.add((byte) ((maxInfixLength + 'A') & 0xFF)); encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF)); encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); return encoded; } public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { int infixIndex = (encoded.get(0) - 'A') & 0xFF; int infixLength = (encoded.get(1) - 'A') & 0xFF; int truncateSuffixBytes = (encoded.get(2) - 'A') & 0xFF; if (infixLength == REMOVE_EVERYTHING || truncateSuffixBytes == REMOVE_EVERYTHING) { infixIndex = 0; infixLength = src.size(); truncateSuffixBytes = 0; } dst.add(src.buffer, 0, infixIndex); dst.add(src.buffer, infixIndex + infixLength, src.size() - (infixIndex + infixLength + truncateSuffixBytes)); dst.add(encoded.buffer, 3, encoded.size() - 3); return dst; } @Override public EncoderType type() { return EncoderType.INFIX; } @Override public String toString() { return getClass().getSimpleName(); } } /** * */ public static class CopyEncoder implements IEncoder { @Override public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer) { encodedBuffer.add(derived.buffer, 0, derived.size()); return encodedBuffer; } @Override public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer) { derivedBuffer.add(encoded.buffer, 0, encoded.size()); return derivedBuffer; } @Override public EncoderType type() { return EncoderType.NONE; } @Override public String toString() { return getClass().getSimpleName(); } } /** * Compute the length of the shared prefix between two byte sequences. */ private static int sharedPrefixLength(ByteArrayList a, ByteArrayList b) { final int max = Math.min(a.size(), b.size()); int i = 0; while (i < max && a.get(i) == b.get(i)) { i++; } return i; } /** * Compute the length of the shared prefix between two byte sequences. */ private static int sharedPrefixLength(ByteArrayList a, int aStart, ByteArrayList b, int bStart) { int i = 0; while (aStart < a.size() && bStart < b.size() && a.get(aStart++) == b.get(bStart++)) { i++; } return i; } public static IEncoder forType(EncoderType encType) { switch (encType) { case INFIX: return new TrimInfixAndSuffixEncoder(); case PREFIX: return new TrimPrefixAndSuffixEncoder(); case SUFFIX: return new TrimSuffixEncoder(); case NONE: return new CopyEncoder(); } throw new RuntimeException("Unknown encoder: " + encType); } }