diff options
author | Andrej Shadura <andrewsh@debian.org> | 2018-12-26 20:22:36 +0100 |
---|---|---|
committer | Andrej Shadura <andrewsh@debian.org> | 2018-12-26 20:22:36 +0100 |
commit | 991420c6b1ebc1f041818ffeaee65e5dcaa1581d (patch) | |
tree | f8537e1dc33fcdbaaebe0801a7598c2d7f5baea2 /morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java | |
parent | 0abd94bb0cc3d7cc07da074acccffcfbc27b0352 (diff) |
Import Upstream version 1.9.0+dfsg
Diffstat (limited to 'morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java')
-rw-r--r-- | morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java | 486 |
1 files changed, 486 insertions, 0 deletions
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java new file mode 100644 index 0000000..0cf7cc0 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java @@ -0,0 +1,486 @@ +package morfologik.fsa; + +import java.util.*; + +import morfologik.util.Arrays; + +import static morfologik.fsa.ConstantArcSizeFSA.*; + +/** + * Fast, memory-conservative finite state automaton builder, returning a + * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction + * speed and memory consumption). + */ +public final class FSABuilder { + /** + * Debug and information constants. + * + * @see FSABuilder#getInfo() + */ + public enum InfoEntry { + SERIALIZATION_BUFFER_SIZE("Serialization buffer size"), + SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"), + CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"), + MAX_ACTIVE_PATH_LENGTH("Max active path"), + STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"), + STATE_REGISTRY_SIZE("Registry hash entries"), + ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)"); + + private final String stringified; + + InfoEntry(String stringified) { + this.stringified = stringified; + } + + @Override + public String toString() { + return stringified; + } + } + + /** A megabyte. */ + private final static int MB = 1024 * 1024; + + /** + * Internal serialized FSA buffer expand ratio. + */ + private final static int BUFFER_GROWTH_SIZE = 5 * MB; + + /** + * Maximum number of labels from a single state. + */ + private final static int MAX_LABELS = 256; + + /** + * Comparator comparing full byte arrays consistently with + * {@link #compare(byte[], int, int, byte[], int, int)}. + */ + public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() { + public int compare(byte[] o1, byte[] o2) { + return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length); + } + }; + + /** + * Internal serialized FSA buffer expand ratio. + */ + private final int bufferGrowthSize; + + /** + * Holds serialized and mutable states. Each state is a sequential list of + * arcs, the last arc is marked with {@link #BIT_ARC_LAST}. + */ + private byte[] serialized = new byte[0]; + + /** + * Number of bytes already taken in {@link #serialized}. Start from 1 to + * keep 0 a sentinel value (for the hash set and final state). + */ + private int size; + + /** + * States on the "active path" (still mutable). Values are addresses of each + * state's first arc. + */ + private int[] activePath = new int[0]; + + /** + * Current length of the active path. + */ + private int activePathLen; + + /** + * The next offset at which an arc will be added to the given state on + * {@link #activePath}. + */ + private int[] nextArcOffset = new int[0]; + + /** + * Root state. If negative, the automaton has been built already and cannot be extended. + */ + private int root; + + /** + * An epsilon state. The first and only arc of this state points either + * to the root or to the terminal state, indicating an empty automaton. + */ + private int epsilon; + + /** + * Hash set of state addresses in {@link #serialized}, hashed by + * {@link #hash(int, int)}. Zero reserved for an unoccupied slot. + */ + private int[] hashSet = new int[2]; + + /** + * Number of entries currently stored in {@link #hashSet}. + */ + private int hashSize = 0; + + /** + * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions only. + */ + private byte [] previous; + + /** + * Information about the automaton and its compilation. + */ + private TreeMap<InfoEntry, Object> info; + + /** + * {@link #previous} sequence's length, used in assertions only. + */ + private int previousLength; + + /** */ + public FSABuilder() { + this(BUFFER_GROWTH_SIZE); + } + + /** */ + public FSABuilder(int bufferGrowthSize) { + this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS); + + // Allocate epsilon state. + epsilon = allocateState(1); + serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST; + + // Allocate root, with an initial empty set of output arcs. + expandActivePath(1); + root = activePath[0]; + } + + /** + * Add a single sequence of bytes to the FSA. The input must be lexicographically greater + * than any previously added sequence. + */ + public void add(byte[] sequence, int start, int len) { + assert serialized != null : "Automaton already built."; + assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 : + "Input must be sorted: " + + Arrays.toString(previous, 0, previousLength) + " >= " + + Arrays.toString(sequence, start, len); + assert setPrevious(sequence, start, len); + + // Determine common prefix length. + final int commonPrefix = commonPrefix(sequence, start, len); + + // Make room for extra states on active path, if needed. + expandActivePath(len); + + // Freeze all the states after the common prefix. + for (int i = activePathLen - 1; i > commonPrefix; i--) { + final int frozenState = freezeState(i); + setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState); + nextArcOffset[i] = activePath[i]; + } + + // Create arcs to new suffix states. + for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) { + final int p = nextArcOffset[i - 1]; + + serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0); + serialized[p + LABEL_OFFSET] = sequence[j++]; + setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]); + + nextArcOffset[i - 1] = p + ARC_SIZE; + } + + // Save last sequence's length so that we don't need to calculate it again. + this.activePathLen = len; + } + + /** Number of serialization buffer reallocations. */ + private int serializationBufferReallocations; + + /** + * Complete the automaton. + */ + public FSA complete() { + add(new byte[0], 0, 0); + + if (nextArcOffset[0] - activePath[0] == 0) { + // An empty FSA. + setArcTarget(epsilon, TERMINAL_STATE); + } else { + // An automaton with at least a single arc from root. + root = freezeState(0); + setArcTarget(epsilon, root); + } + + info = new TreeMap<InfoEntry, Object>(); + info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length); + info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations); + info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size); + info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length); + info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length); + info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize); + info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB, + (this.serialized.length + this.hashSet.length * 4) / (double) MB); + + final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon); + this.serialized = null; + this.hashSet = null; + return fsa; + } + + /** + * Build a minimal, deterministic automaton from a sorted list of byte sequences. + */ + public static FSA build(byte[][] input) { + final FSABuilder builder = new FSABuilder(); + + for (byte [] chs : input) + builder.add(chs, 0, chs.length); + + return builder.complete(); + } + + /** + * Build a minimal, deterministic automaton from an iterable list of byte sequences. + */ + public static FSA build(Iterable<byte[]> input) { + final FSABuilder builder = new FSABuilder(); + + for (byte [] chs : input) + builder.add(chs, 0, chs.length); + + return builder.complete(); + } + + /** + * Return various statistics concerning the FSA and its compilation. + */ + public Map<InfoEntry, Object> getInfo() { + return info; + } + + /** Is this arc the state's last? */ + private boolean isArcLast(int arc) { + return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0; + } + + /** Is this arc final? */ + private boolean isArcFinal(int arc) { + return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0; + } + + /** Get label's arc. */ + private byte getArcLabel(int arc) { + return serialized[arc + LABEL_OFFSET]; + } + + /** + * Fills the target state address of an arc. + */ + private void setArcTarget(int arc, int state) { + arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE; + for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) { + serialized[--arc] = (byte) state; + state >>>= 8; + } + } + + /** + * Returns the address of an arc. + */ + private int getArcTarget(int arc) { + arc += ADDRESS_OFFSET; + return (serialized[arc]) << 24 | + (serialized[arc + 1] & 0xff) << 16 | + (serialized[arc + 2] & 0xff) << 8 | + (serialized[arc + 3] & 0xff); + } + + /** + * @return The number of common prefix characters with the previous + * sequence. + */ + private int commonPrefix(byte[] sequence, int start, int len) { + // Empty root state case. + final int max = Math.min(len, activePathLen); + int i; + for (i = 0; i < max; i++) { + final int lastArc = nextArcOffset[i] - ARC_SIZE; + if (sequence[start++] != getArcLabel(lastArc)) { + break; + } + } + + return i; + } + + /** + * Freeze a state: try to find an equivalent state in the interned states + * dictionary first, if found, return it, otherwise, serialize the mutable + * state at <code>activePathIndex</code> and return it. + */ + private int freezeState(final int activePathIndex) { + final int start = activePath[activePathIndex]; + final int end = nextArcOffset[activePathIndex]; + final int len = end - start; + + // Set the last arc flag on the current active path's state. + serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST; + + // Try to locate a state with an identical content in the hash set. + final int bucketMask = (hashSet.length - 1); + int slot = hash(start, len) & bucketMask; + for (int i = 0;;) { + int state = hashSet[slot]; + if (state == 0) { + state = hashSet[slot] = serialize(activePathIndex); + if (++hashSize > hashSet.length / 2) + expandAndRehash(); + return state; + } else if (equivalent(state, start, len)) { + return state; + } + + slot = (slot + (++i)) & bucketMask; + } + } + + /** + * Reallocate and rehash the hash set. + */ + private void expandAndRehash() { + final int[] newHashSet = new int[hashSet.length * 2]; + final int bucketMask = (newHashSet.length - 1); + + for (int j = 0; j < hashSet.length; j++) { + final int state = hashSet[j]; + if (state > 0) { + int slot = hash(state, stateLength(state)) & bucketMask; + for (int i = 0; newHashSet[slot] > 0;) { + slot = (slot + (++i)) & bucketMask; + } + newHashSet[slot] = state; + } + } + this.hashSet = newHashSet; + } + + /** + * The total length of the serialized state data (all arcs). + */ + private int stateLength(int state) { + int arc = state; + while (!isArcLast(arc)) { + arc += ARC_SIZE; + } + return arc - state + ARC_SIZE; + } + + /** Return <code>true</code> if two regions in {@link #serialized} are identical. */ + private boolean equivalent(int start1, int start2, int len) { + if (start1 + len > size || start2 + len > size) + return false; + + while (len-- > 0) + if (serialized[start1++] != serialized[start2++]) + return false; + + return true; + } + + /** + * Serialize a given state on the active path. + */ + private int serialize(final int activePathIndex) { + expandBuffers(); + + final int newState = size; + final int start = activePath[activePathIndex]; + final int len = nextArcOffset[activePathIndex] - start; + System.arraycopy(serialized, start, serialized, newState, len); + + size += len; + return newState; + } + + /** + * Hash code of a fragment of {@link #serialized} array. + */ + private int hash(int start, int byteCount) { + assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?"; + + int h = 0; + for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) { + h = 17 * h + getArcLabel(start); + h = 17 * h + getArcTarget(start); + if (isArcFinal(start)) h += 17; + } + + return h; + } + + /** + * Append a new mutable state to the active path. + */ + private void expandActivePath(int size) { + if (activePath.length < size) { + final int p = activePath.length; + activePath = java.util.Arrays.copyOf(activePath, size); + nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size); + + for (int i = p; i < size; i++) { + nextArcOffset[i] = activePath[i] = + allocateState(/* assume max labels count */ MAX_LABELS); + } + } + } + + /** + * Expand internal buffers for the next state. + */ + private void expandBuffers() { + if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) { + serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize); + serializationBufferReallocations++; + } + } + + /** + * Allocate space for a state with the given number of outgoing labels. + * + * @return state offset + */ + private int allocateState(int labels) { + expandBuffers(); + final int state = size; + size += labels * ARC_SIZE; + return state; + } + + /** + * Copy <code>current</code> into an internal buffer. + */ + private boolean setPrevious(byte [] sequence, int start, int length) { + if (previous == null || previous.length < length) { + previous = new byte [length]; + } + + System.arraycopy(sequence, start, previous, 0, length); + previousLength = length; + return true; + } + + /** + * Lexicographic order of input sequences. By default, consistent with the "C" sort + * (absolute value of bytes, 0-255). + */ + public static int compare(byte [] s1, int start1, int lens1, + byte [] s2, int start2, int lens2) { + final int max = Math.min(lens1, lens2); + + for (int i = 0; i < max; i++) { + final byte c1 = s1[start1++]; + final byte c2 = s2[start2++]; + if (c1 != c2) + return (c1 & 0xff) - (c2 & 0xff); + } + + return lens1 - lens2; + } +} |