summaryrefslogtreecommitdiff
path: root/src/morfologik/fsa/FSABuilder.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/morfologik/fsa/FSABuilder.java')
-rw-r--r--src/morfologik/fsa/FSABuilder.java486
1 files changed, 0 insertions, 486 deletions
diff --git a/src/morfologik/fsa/FSABuilder.java b/src/morfologik/fsa/FSABuilder.java
deleted file mode 100644
index 0cf7cc0..0000000
--- a/src/morfologik/fsa/FSABuilder.java
+++ /dev/null
@@ -1,486 +0,0 @@
-package morfologik.fsa;
-
-import java.util.*;
-
-import morfologik.util.Arrays;
-
-import static morfologik.fsa.ConstantArcSizeFSA.*;
-
-/**
- * Fast, memory-conservative finite state automaton builder, returning a
- * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction
- * speed and memory consumption).
- */
-public final class FSABuilder {
- /**
- * Debug and information constants.
- *
- * @see FSABuilder#getInfo()
- */
- public enum InfoEntry {
- SERIALIZATION_BUFFER_SIZE("Serialization buffer size"),
- SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"),
- CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"),
- MAX_ACTIVE_PATH_LENGTH("Max active path"),
- STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"),
- STATE_REGISTRY_SIZE("Registry hash entries"),
- ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)");
-
- private final String stringified;
-
- InfoEntry(String stringified) {
- this.stringified = stringified;
- }
-
- @Override
- public String toString() {
- return stringified;
- }
- }
-
- /** A megabyte. */
- private final static int MB = 1024 * 1024;
-
- /**
- * Internal serialized FSA buffer expand ratio.
- */
- private final static int BUFFER_GROWTH_SIZE = 5 * MB;
-
- /**
- * Maximum number of labels from a single state.
- */
- private final static int MAX_LABELS = 256;
-
- /**
- * Comparator comparing full byte arrays consistently with
- * {@link #compare(byte[], int, int, byte[], int, int)}.
- */
- public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() {
- public int compare(byte[] o1, byte[] o2) {
- return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length);
- }
- };
-
- /**
- * Internal serialized FSA buffer expand ratio.
- */
- private final int bufferGrowthSize;
-
- /**
- * Holds serialized and mutable states. Each state is a sequential list of
- * arcs, the last arc is marked with {@link #BIT_ARC_LAST}.
- */
- private byte[] serialized = new byte[0];
-
- /**
- * Number of bytes already taken in {@link #serialized}. Start from 1 to
- * keep 0 a sentinel value (for the hash set and final state).
- */
- private int size;
-
- /**
- * States on the "active path" (still mutable). Values are addresses of each
- * state's first arc.
- */
- private int[] activePath = new int[0];
-
- /**
- * Current length of the active path.
- */
- private int activePathLen;
-
- /**
- * The next offset at which an arc will be added to the given state on
- * {@link #activePath}.
- */
- private int[] nextArcOffset = new int[0];
-
- /**
- * Root state. If negative, the automaton has been built already and cannot be extended.
- */
- private int root;
-
- /**
- * An epsilon state. The first and only arc of this state points either
- * to the root or to the terminal state, indicating an empty automaton.
- */
- private int epsilon;
-
- /**
- * Hash set of state addresses in {@link #serialized}, hashed by
- * {@link #hash(int, int)}. Zero reserved for an unoccupied slot.
- */
- private int[] hashSet = new int[2];
-
- /**
- * Number of entries currently stored in {@link #hashSet}.
- */
- private int hashSize = 0;
-
- /**
- * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions only.
- */
- private byte [] previous;
-
- /**
- * Information about the automaton and its compilation.
- */
- private TreeMap<InfoEntry, Object> info;
-
- /**
- * {@link #previous} sequence's length, used in assertions only.
- */
- private int previousLength;
-
- /** */
- public FSABuilder() {
- this(BUFFER_GROWTH_SIZE);
- }
-
- /** */
- public FSABuilder(int bufferGrowthSize) {
- this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS);
-
- // Allocate epsilon state.
- epsilon = allocateState(1);
- serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST;
-
- // Allocate root, with an initial empty set of output arcs.
- expandActivePath(1);
- root = activePath[0];
- }
-
- /**
- * Add a single sequence of bytes to the FSA. The input must be lexicographically greater
- * than any previously added sequence.
- */
- public void add(byte[] sequence, int start, int len) {
- assert serialized != null : "Automaton already built.";
- assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 :
- "Input must be sorted: "
- + Arrays.toString(previous, 0, previousLength) + " >= "
- + Arrays.toString(sequence, start, len);
- assert setPrevious(sequence, start, len);
-
- // Determine common prefix length.
- final int commonPrefix = commonPrefix(sequence, start, len);
-
- // Make room for extra states on active path, if needed.
- expandActivePath(len);
-
- // Freeze all the states after the common prefix.
- for (int i = activePathLen - 1; i > commonPrefix; i--) {
- final int frozenState = freezeState(i);
- setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState);
- nextArcOffset[i] = activePath[i];
- }
-
- // Create arcs to new suffix states.
- for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) {
- final int p = nextArcOffset[i - 1];
-
- serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0);
- serialized[p + LABEL_OFFSET] = sequence[j++];
- setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]);
-
- nextArcOffset[i - 1] = p + ARC_SIZE;
- }
-
- // Save last sequence's length so that we don't need to calculate it again.
- this.activePathLen = len;
- }
-
- /** Number of serialization buffer reallocations. */
- private int serializationBufferReallocations;
-
- /**
- * Complete the automaton.
- */
- public FSA complete() {
- add(new byte[0], 0, 0);
-
- if (nextArcOffset[0] - activePath[0] == 0) {
- // An empty FSA.
- setArcTarget(epsilon, TERMINAL_STATE);
- } else {
- // An automaton with at least a single arc from root.
- root = freezeState(0);
- setArcTarget(epsilon, root);
- }
-
- info = new TreeMap<InfoEntry, Object>();
- info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length);
- info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations);
- info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size);
- info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length);
- info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length);
- info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize);
- info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB,
- (this.serialized.length + this.hashSet.length * 4) / (double) MB);
-
- final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon);
- this.serialized = null;
- this.hashSet = null;
- return fsa;
- }
-
- /**
- * Build a minimal, deterministic automaton from a sorted list of byte sequences.
- */
- public static FSA build(byte[][] input) {
- final FSABuilder builder = new FSABuilder();
-
- for (byte [] chs : input)
- builder.add(chs, 0, chs.length);
-
- return builder.complete();
- }
-
- /**
- * Build a minimal, deterministic automaton from an iterable list of byte sequences.
- */
- public static FSA build(Iterable<byte[]> input) {
- final FSABuilder builder = new FSABuilder();
-
- for (byte [] chs : input)
- builder.add(chs, 0, chs.length);
-
- return builder.complete();
- }
-
- /**
- * Return various statistics concerning the FSA and its compilation.
- */
- public Map<InfoEntry, Object> getInfo() {
- return info;
- }
-
- /** Is this arc the state's last? */
- private boolean isArcLast(int arc) {
- return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
- }
-
- /** Is this arc final? */
- private boolean isArcFinal(int arc) {
- return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
- }
-
- /** Get label's arc. */
- private byte getArcLabel(int arc) {
- return serialized[arc + LABEL_OFFSET];
- }
-
- /**
- * Fills the target state address of an arc.
- */
- private void setArcTarget(int arc, int state) {
- arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE;
- for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) {
- serialized[--arc] = (byte) state;
- state >>>= 8;
- }
- }
-
- /**
- * Returns the address of an arc.
- */
- private int getArcTarget(int arc) {
- arc += ADDRESS_OFFSET;
- return (serialized[arc]) << 24 |
- (serialized[arc + 1] & 0xff) << 16 |
- (serialized[arc + 2] & 0xff) << 8 |
- (serialized[arc + 3] & 0xff);
- }
-
- /**
- * @return The number of common prefix characters with the previous
- * sequence.
- */
- private int commonPrefix(byte[] sequence, int start, int len) {
- // Empty root state case.
- final int max = Math.min(len, activePathLen);
- int i;
- for (i = 0; i < max; i++) {
- final int lastArc = nextArcOffset[i] - ARC_SIZE;
- if (sequence[start++] != getArcLabel(lastArc)) {
- break;
- }
- }
-
- return i;
- }
-
- /**
- * Freeze a state: try to find an equivalent state in the interned states
- * dictionary first, if found, return it, otherwise, serialize the mutable
- * state at <code>activePathIndex</code> and return it.
- */
- private int freezeState(final int activePathIndex) {
- final int start = activePath[activePathIndex];
- final int end = nextArcOffset[activePathIndex];
- final int len = end - start;
-
- // Set the last arc flag on the current active path's state.
- serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST;
-
- // Try to locate a state with an identical content in the hash set.
- final int bucketMask = (hashSet.length - 1);
- int slot = hash(start, len) & bucketMask;
- for (int i = 0;;) {
- int state = hashSet[slot];
- if (state == 0) {
- state = hashSet[slot] = serialize(activePathIndex);
- if (++hashSize > hashSet.length / 2)
- expandAndRehash();
- return state;
- } else if (equivalent(state, start, len)) {
- return state;
- }
-
- slot = (slot + (++i)) & bucketMask;
- }
- }
-
- /**
- * Reallocate and rehash the hash set.
- */
- private void expandAndRehash() {
- final int[] newHashSet = new int[hashSet.length * 2];
- final int bucketMask = (newHashSet.length - 1);
-
- for (int j = 0; j < hashSet.length; j++) {
- final int state = hashSet[j];
- if (state > 0) {
- int slot = hash(state, stateLength(state)) & bucketMask;
- for (int i = 0; newHashSet[slot] > 0;) {
- slot = (slot + (++i)) & bucketMask;
- }
- newHashSet[slot] = state;
- }
- }
- this.hashSet = newHashSet;
- }
-
- /**
- * The total length of the serialized state data (all arcs).
- */
- private int stateLength(int state) {
- int arc = state;
- while (!isArcLast(arc)) {
- arc += ARC_SIZE;
- }
- return arc - state + ARC_SIZE;
- }
-
- /** Return <code>true</code> if two regions in {@link #serialized} are identical. */
- private boolean equivalent(int start1, int start2, int len) {
- if (start1 + len > size || start2 + len > size)
- return false;
-
- while (len-- > 0)
- if (serialized[start1++] != serialized[start2++])
- return false;
-
- return true;
- }
-
- /**
- * Serialize a given state on the active path.
- */
- private int serialize(final int activePathIndex) {
- expandBuffers();
-
- final int newState = size;
- final int start = activePath[activePathIndex];
- final int len = nextArcOffset[activePathIndex] - start;
- System.arraycopy(serialized, start, serialized, newState, len);
-
- size += len;
- return newState;
- }
-
- /**
- * Hash code of a fragment of {@link #serialized} array.
- */
- private int hash(int start, int byteCount) {
- assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?";
-
- int h = 0;
- for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) {
- h = 17 * h + getArcLabel(start);
- h = 17 * h + getArcTarget(start);
- if (isArcFinal(start)) h += 17;
- }
-
- return h;
- }
-
- /**
- * Append a new mutable state to the active path.
- */
- private void expandActivePath(int size) {
- if (activePath.length < size) {
- final int p = activePath.length;
- activePath = java.util.Arrays.copyOf(activePath, size);
- nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size);
-
- for (int i = p; i < size; i++) {
- nextArcOffset[i] = activePath[i] =
- allocateState(/* assume max labels count */ MAX_LABELS);
- }
- }
- }
-
- /**
- * Expand internal buffers for the next state.
- */
- private void expandBuffers() {
- if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) {
- serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
- serializationBufferReallocations++;
- }
- }
-
- /**
- * Allocate space for a state with the given number of outgoing labels.
- *
- * @return state offset
- */
- private int allocateState(int labels) {
- expandBuffers();
- final int state = size;
- size += labels * ARC_SIZE;
- return state;
- }
-
- /**
- * Copy <code>current</code> into an internal buffer.
- */
- private boolean setPrevious(byte [] sequence, int start, int length) {
- if (previous == null || previous.length < length) {
- previous = new byte [length];
- }
-
- System.arraycopy(sequence, start, previous, 0, length);
- previousLength = length;
- return true;
- }
-
- /**
- * Lexicographic order of input sequences. By default, consistent with the "C" sort
- * (absolute value of bytes, 0-255).
- */
- public static int compare(byte [] s1, int start1, int lens1,
- byte [] s2, int start2, int lens2) {
- final int max = Math.min(lens1, lens2);
-
- for (int i = 0; i < max; i++) {
- final byte c1 = s1[start1++];
- final byte c2 = s2[start2++];
- if (c1 != c2)
- return (c1 & 0xff) - (c2 & 0xff);
- }
-
- return lens1 - lens2;
- }
-}