summaryrefslogtreecommitdiff
path: root/morfologik-fsa/src/main/java
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-fsa/src/main/java')
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java364
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java404
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java543
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java134
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSA.java286
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java323
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java332
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java486
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java154
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSAFlags.java64
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSAHeader.java52
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java156
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java43
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java169
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java202
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java25
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java86
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/NullMessageLogger.java22
-rw-r--r--morfologik-fsa/src/main/java/morfologik/fsa/StateVisitor.java11
-rw-r--r--morfologik-fsa/src/main/java/morfologik/util/Arrays.java68
-rw-r--r--morfologik-fsa/src/main/java/morfologik/util/BufferUtils.java54
-rw-r--r--morfologik-fsa/src/main/java/morfologik/util/FileUtils.java137
-rw-r--r--morfologik-fsa/src/main/java/morfologik/util/ResourceUtils.java58
23 files changed, 4173 insertions, 0 deletions
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java
new file mode 100644
index 0000000..695664d
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java
@@ -0,0 +1,364 @@
+package morfologik.fsa;
+
+import static morfologik.fsa.FSAFlags.*;
+import static morfologik.util.FileUtils.readFully;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * CFSA (Compact Finite State Automaton) binary format implementation. This is a
+ * slightly reorganized version of {@link FSA5} offering smaller automata size
+ * at some (minor) performance penalty.
+ *
+ * <p><b>Note:</b> Serialize to {@link CFSA2} for new code.</p>
+ *
+ * <p>The encoding of automaton body is as follows.</p>
+ *
+ * <pre>
+ * ---- FSA header (standard)
+ * Byte Description
+ * +-+-+-+-+-+-+-+-+\
+ * 0 | | | | | | | | | +------ '\'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 1 | | | | | | | | | +------ 'f'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 2 | | | | | | | | | +------ 's'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 3 | | | | | | | | | +------ 'a'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 4 | | | | | | | | | +------ version (fixed 0xc5)
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 5 | | | | | | | | | +------ filler character
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 6 | | | | | | | | | +------ annot character
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 7 |C|C|C|C|G|G|G|G| +------ C - node data size (ctl), G - address size (gotoLength)
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 8-32 | | | | | | | | | +------ labels mapped for type (1) of arc encoding.
+ * : : : : : : : : : |
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * ---- Start of a node; only if automaton was compiled with NUMBERS option.
+ *
+ * Byte
+ * +-+-+-+-+-+-+-+-+\
+ * 0 | | | | | | | | | \ LSB
+ * +-+-+-+-+-+-+-+-+ +
+ * 1 | | | | | | | | | | number of strings recognized
+ * +-+-+-+-+-+-+-+-+ +----- by the automaton starting
+ * : : : : : : : : : | from this node.
+ * +-+-+-+-+-+-+-+-+ +
+ * ctl-1 | | | | | | | | | / MSB
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * ---- A vector of node's arcs. Conditional format, depending on flags.
+ *
+ * 1) NEXT bit set, mapped arc label.
+ *
+ * +--------------- arc's label mapped in M bits if M's field value > 0
+ * | +------------- node pointed to is next
+ * | | +----------- the last arc of the node
+ * _______| | | +--------- the arc is final
+ * / | | | |
+ * +-+-+-+-+-+-+-+-+\
+ * 0 |M|M|M|M|M|1|L|F| +------ flags + (M) index of the mapped label.
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * 2) NEXT bit set, label separate.
+ *
+ * +--------------- arc's label stored separately (M's field is zero).
+ * | +------------- node pointed to is next
+ * | | +----------- the last arc of the node
+ * | | | +--------- the arc is final
+ * | | | |
+ * +-+-+-+-+-+-+-+-+\
+ * 0 |0|0|0|0|0|1|L|F| +------ flags
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 1 | | | | | | | | | +------ label
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * 3) NEXT bit not set. Full arc.
+ *
+ * +------------- node pointed to is next
+ * | +----------- the last arc of the node
+ * | | +--------- the arc is final
+ * | | |
+ * +-+-+-+-+-+-+-+-+\
+ * 0 |A|A|A|A|A|0|L|F| +------ flags + (A) address field, lower bits
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 1 | | | | | | | | | +------ label
+ * +-+-+-+-+-+-+-+-+/
+ * : : : : : : : : :
+ * +-+-+-+-+-+-+-+-+\
+ * gtl-1 |A|A|A|A|A|A|A|A| +------ address, continuation (MSB)
+ * +-+-+-+-+-+-+-+-+/
+ * </pre>
+ */
+public final class CFSA extends FSA {
+ /**
+ * Automaton header version value.
+ */
+ public static final byte VERSION = (byte) 0xC5;
+
+ /**
+ * Bitmask indicating that an arc corresponds to the last character of a
+ * sequence available when building the automaton.
+ */
+ public static final int BIT_FINAL_ARC = 1 << 0;
+
+ /**
+ * Bitmask indicating that an arc is the last one of the node's list and the
+ * following one belongs to another node.
+ */
+ public static final int BIT_LAST_ARC = 1 << 1;
+
+ /**
+ * Bitmask indicating that the target node of this arc follows it in the
+ * compressed automaton structure (no goto field).
+ */
+ public static final int BIT_TARGET_NEXT = 1 << 2;
+
+ /**
+ * An array of bytes with the internal representation of the automaton.
+ * Please see the documentation of this class for more information on how
+ * this structure is organized.
+ */
+ public byte[] arcs;
+
+ /**
+ * The length of the node header structure (if the automaton was compiled with
+ * <code>NUMBERS</code> option). Otherwise zero.
+ */
+ public final int nodeDataLength;
+
+ /**
+ * Flags for this automaton version.
+ */
+ private final Set<FSAFlags> flags;
+
+ /**
+ * Number of bytes each address takes in full, expanded form (goto length).
+ */
+ public final int gtl;
+
+ /**
+ * Label mapping for arcs of type (1) (see class documentation). The array
+ * is indexed by mapped label's value and contains the original label.
+ */
+ public final byte[] labelMapping;
+
+ /**
+ * Creates a new automaton, reading it from a file in FSA format, version 5.
+ */
+ public CFSA(InputStream fsaStream) throws IOException {
+ // Read the header first.
+ final FSAHeader header = FSAHeader.read(fsaStream);
+
+ // Ensure we have the correct version.
+ if (header.version != VERSION) {
+ throw new IOException("This class can read FSA version 5 only: " + header.version);
+ }
+
+ /*
+ * Determine if the automaton was compiled with NUMBERS. If so, modify
+ * ctl and goto fields accordingly.
+ */
+ flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
+ if ((header.gtl & 0xf0) != 0) {
+ this.nodeDataLength = (header.gtl >>> 4) & 0x0f;
+ this.gtl = header.gtl & 0x0f;
+ flags.add(NUMBERS);
+ } else {
+ this.nodeDataLength = 0;
+ this.gtl = header.gtl & 0x0f;
+ }
+
+ /*
+ * Read mapping dictionary.
+ */
+ labelMapping = new byte[1 << 5];
+ readFully(fsaStream, labelMapping);
+
+ /*
+ * Read arcs' data.
+ */
+ arcs = readFully(fsaStream);
+ }
+
+ /**
+ * Returns the start node of this automaton. May return <code>0</code> if
+ * the start node is also an end node.
+ */
+ @Override
+ public int getRootNode() {
+ // Skip dummy node marking terminating state.
+ final int epsilonNode = skipArc(getFirstArc(0));
+
+ // And follow the epsilon node's first (and only) arc.
+ return getDestinationNodeOffset(getFirstArc(epsilonNode));
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public final int getFirstArc(int node) {
+ return nodeDataLength + node;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public final int getNextArc(int arc) {
+ if (isArcLast(arc))
+ return 0;
+ else
+ return skipArc(arc);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getArc(int node, byte label) {
+ for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
+ if (getArcLabel(arc) == label)
+ return arc;
+ }
+
+ // An arc labeled with "label" not found.
+ return 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getEndNode(int arc) {
+ final int nodeOffset = getDestinationNodeOffset(arc);
+ if (0 == nodeOffset) {
+ throw new RuntimeException("This is a terminal arc [" + arc + "]");
+ }
+ return nodeOffset;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public byte getArcLabel(int arc) {
+ if (isNextSet(arc) && isLabelCompressed(arc)) {
+ return this.labelMapping[(arcs[arc] >>> 3) & 0x1f];
+ } else {
+ return arcs[arc + 1];
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getRightLanguageCount(int node) {
+ assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
+ return FSA5.decodeFromBytes(arcs, node, nodeDataLength);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isArcFinal(int arc) {
+ return (arcs[arc] & BIT_FINAL_ARC) != 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isArcTerminal(int arc) {
+ return (0 == getDestinationNodeOffset(arc));
+ }
+
+ /**
+ * Returns <code>true</code> if this arc has <code>NEXT</code> bit set.
+ *
+ * @see #BIT_LAST_ARC
+ */
+ public boolean isArcLast(int arc) {
+ return (arcs[arc] & BIT_LAST_ARC) != 0;
+ }
+
+ /**
+ * @see #BIT_TARGET_NEXT
+ */
+ public boolean isNextSet(int arc) {
+ return (arcs[arc] & BIT_TARGET_NEXT) != 0;
+ }
+
+ /**
+ * Returns <code>true</code> if the label is compressed inside flags byte.
+ */
+ public boolean isLabelCompressed(int arc) {
+ assert isNextSet(arc) : "Only applicable to arcs with NEXT bit.";
+ return (arcs[arc] & (-1 << 3)) != 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * <p>For this automaton version, an additional {@link FSAFlags#NUMBERS} flag
+ * may be set to indicate the automaton contains extra fields for each node.</p>
+ */
+ public Set<FSAFlags> getFlags() {
+ return Collections.unmodifiableSet(flags);
+ }
+
+ /**
+ * Returns the address of the node pointed to by this arc.
+ */
+ final int getDestinationNodeOffset(int arc) {
+ if (isNextSet(arc)) {
+ /* The destination node follows this arc in the array. */
+ return skipArc(arc);
+ } else {
+ /*
+ * The destination node address has to be extracted from the arc's
+ * goto field.
+ */
+ int r = 0;
+ for (int i = gtl; --i >= 1;) {
+ r = r << 8 | (arcs[arc + 1 + i] & 0xff);
+ }
+ r = r << 8 | (arcs[arc] & 0xff);
+ return r >>> 3;
+ }
+ }
+
+ /**
+ * Read the arc's layout and skip as many bytes, as needed, to skip it.
+ */
+ private int skipArc(int offset) {
+ if (isNextSet(offset)) {
+ if (isLabelCompressed(offset)) {
+ offset++;
+ } else {
+ offset += 1 + 1;
+ }
+ } else {
+ offset += 1 + gtl;
+ }
+ return offset;
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java
new file mode 100644
index 0000000..6955da4
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java
@@ -0,0 +1,404 @@
+package morfologik.fsa;
+
+import static morfologik.util.FileUtils.readFully;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.EnumSet;
+import java.util.Set;
+
+import morfologik.util.FileUtils;
+
+/**
+ * CFSA (Compact Finite State Automaton) binary format implementation, version 2:
+ * <ul>
+ * <li>{@link #BIT_TARGET_NEXT} applicable on all arcs, not necessarily the last one.</li>
+ * <li>v-coded goto field</li>
+ * <li>v-coded perfect hashing numbers, if any</li>
+ * <li>31 most frequent labels integrated with flags byte</li>
+ * </ul>
+ *
+ * <p>The encoding of automaton body is as follows.</p>
+ *
+ * <pre>
+ * ---- CFSA header
+ * Byte Description
+ * +-+-+-+-+-+-+-+-+\
+ * 0 | | | | | | | | | +------ '\'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 1 | | | | | | | | | +------ 'f'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 2 | | | | | | | | | +------ 's'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 3 | | | | | | | | | +------ 'a'
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 4 | | | | | | | | | +------ version (fixed 0xc6)
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 5 | | | | | | | | | +----\
+ * +-+-+-+-+-+-+-+-+/ \ flags [MSB first]
+ * +-+-+-+-+-+-+-+-+\ /
+ * 6 | | | | | | | | | +----/
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 7 | | | | | | | | | +------ label lookup table size
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 8-32 | | | | | | | | | +------ label value lookup table
+ * : : : : : : : : : |
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * ---- Start of a node; only if automaton was compiled with NUMBERS option.
+ *
+ * Byte
+ * +-+-+-+-+-+-+-+-+\
+ * 0 | | | | | | | | | \
+ * +-+-+-+-+-+-+-+-+ +
+ * 1 | | | | | | | | | | number of strings recognized
+ * +-+-+-+-+-+-+-+-+ +----- by the automaton starting
+ * : : : : : : : : : | from this node. v-coding
+ * +-+-+-+-+-+-+-+-+ +
+ * | | | | | | | | | /
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * ---- A vector of this node's arcs. An arc's layout depends on the combination of flags.
+ *
+ * 1) NEXT bit set, mapped arc label.
+ *
+ * +----------------------- node pointed to is next
+ * | +--------------------- the last arc of the node
+ * | | +------------------- this arc leads to a final state (acceptor)
+ * | | | _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows
+ * | | | / | | | |
+ * +-+-+-+-+-+-+-+-+\
+ * 0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label.
+ * +-+-+-+-+-+-+-+-+/
+ * +-+-+-+-+-+-+-+-+\
+ * 1 | | | | | | | | | +------ optional label if M == 0
+ * +-+-+-+-+-+-+-+-+/
+ * : : : : : : : : :
+ * +-+-+-+-+-+-+-+-+\
+ * |A|A|A|A|A|A|A|A| +------ v-coded goto address
+ * +-+-+-+-+-+-+-+-+/
+ * </pre>
+ */
+public final class CFSA2 extends FSA {
+ /**
+ * Automaton header version value.
+ */
+ public static final byte VERSION = (byte) 0xc6;
+
+ /**
+ * The target node of this arc follows the last arc of the current state
+ * (no goto field).
+ */
+ public static final int BIT_TARGET_NEXT = 1 << 7;
+
+ /**
+ * The arc is the last one from the current node's arcs list.
+ */
+ public static final int BIT_LAST_ARC = 1 << 6;
+
+ /**
+ * The arc corresponds to the last character of a sequence
+ * available when building the automaton (acceptor transition).
+ */
+ public static final int BIT_FINAL_ARC = 1 << 5;
+
+ /**
+ * The count of bits assigned to storing an indexed label.
+ */
+ static final int LABEL_INDEX_BITS = 5;
+
+ /**
+ * Masks only the M bits of a flag byte.
+ */
+ static final int LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1;
+
+ /**
+ * Maximum size of the labels index.
+ */
+ static final int LABEL_INDEX_SIZE = (1 << LABEL_INDEX_BITS) - 1;
+
+ /**
+ * An array of bytes with the internal representation of the automaton.
+ * Please see the documentation of this class for more information on how
+ * this structure is organized.
+ */
+ public byte[] arcs;
+
+ /**
+ * Flags for this automaton version.
+ */
+ private final EnumSet<FSAFlags> flags;
+
+ /**
+ * Label mapping for M-indexed labels.
+ */
+ public final byte[] labelMapping;
+
+ /**
+ * If <code>true</code> states are prepended with numbers.
+ */
+ private final boolean hasNumbers;
+
+ /**
+ * Epsilon node's offset.
+ */
+ private final int epsilon = 0;
+
+ /**
+ * Reads an automaton from a byte stream.
+ */
+ public CFSA2(InputStream in) throws IOException {
+ // Read the header first.
+ if (FSAHeader.FSA_MAGIC != FileUtils.readInt(in))
+ throw new IOException("Invalid file header magic bytes.");
+
+ // Ensure we have the correct version.
+ final int version = FileUtils.readByte(in);
+ if (version != VERSION) {
+ throw new IOException("This class can only read FSA version: " + VERSION);
+ }
+
+ // Read flags.
+ short flagBits = FileUtils.readShort(in);
+ flags = EnumSet.noneOf(FSAFlags.class);
+ for (FSAFlags f : FSAFlags.values()) {
+ if (FSAFlags.isSet(flagBits, f))
+ flags.add(f);
+ }
+
+ if (flagBits != FSAFlags.asShort(flags))
+ throw new IOException("Unrecognized flags remained: 0x" + Integer.toHexString(flagBits));
+
+ this.hasNumbers = flags.contains(FSAFlags.NUMBERS);
+
+ /*
+ * Read mapping dictionary.
+ */
+ int labelMappingSize = FileUtils.readByte(in) & 0xff;
+ labelMapping = new byte[labelMappingSize];
+ readFully(in, labelMapping);
+
+ /*
+ * Read arcs' data.
+ */
+ arcs = readFully(in);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getRootNode() {
+ // Skip dummy node marking terminating state.
+ return getDestinationNodeOffset(getFirstArc(epsilon));
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public final int getFirstArc(int node) {
+ if (hasNumbers) {
+ return skipVInt(node);
+ } else {
+ return node;
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public final int getNextArc(int arc) {
+ if (isArcLast(arc))
+ return 0;
+ else
+ return skipArc(arc);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getArc(int node, byte label) {
+ for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
+ if (getArcLabel(arc) == label)
+ return arc;
+ }
+
+ // An arc labeled with "label" not found.
+ return 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getEndNode(int arc) {
+ final int nodeOffset = getDestinationNodeOffset(arc);
+ assert nodeOffset != 0 : "Can't follow a terminal arc: " + arc;
+ assert nodeOffset < arcs.length : "Node out of bounds.";
+ return nodeOffset;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public byte getArcLabel(int arc) {
+ int index = arcs[arc] & LABEL_INDEX_MASK;
+ if (index > 0) {
+ return this.labelMapping[index];
+ } else {
+ return arcs[arc + 1];
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getRightLanguageCount(int node) {
+ assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
+ return readVInt(arcs, node);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isArcFinal(int arc) {
+ return (arcs[arc] & BIT_FINAL_ARC) != 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isArcTerminal(int arc) {
+ return (0 == getDestinationNodeOffset(arc));
+ }
+
+ /**
+ * Returns <code>true</code> if this arc has <code>NEXT</code> bit set.
+ *
+ * @see #BIT_LAST_ARC
+ */
+ public boolean isArcLast(int arc) {
+ return (arcs[arc] & BIT_LAST_ARC) != 0;
+ }
+
+ /**
+ * @see #BIT_TARGET_NEXT
+ */
+ public boolean isNextSet(int arc) {
+ return (arcs[arc] & BIT_TARGET_NEXT) != 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public Set<FSAFlags> getFlags() {
+ return flags;
+ }
+
+ /**
+ * Returns the address of the node pointed to by this arc.
+ */
+ final int getDestinationNodeOffset(int arc) {
+ if (isNextSet(arc)) {
+ /* Follow until the last arc of this state. */
+ while (!isArcLast(arc)) {
+ arc = getNextArc(arc);
+ }
+
+ /* And return the byte right after it. */
+ return skipArc(arc);
+ } else {
+ /*
+ * The destination node address is v-coded. v-code starts either
+ * at the next byte (label indexed) or after the next byte (label explicit).
+ */
+ return readVInt(arcs, arc + ((arcs[arc] & LABEL_INDEX_MASK) == 0 ? 2 : 1));
+ }
+ }
+
+ /**
+ * Read the arc's layout and skip as many bytes, as needed, to skip it.
+ */
+ private int skipArc(int offset) {
+ int flag = arcs[offset++];
+
+ // Explicit label?
+ if ((flag & LABEL_INDEX_MASK) == 0) {
+ offset++;
+ }
+
+ // Explicit goto?
+ if ((flag & BIT_TARGET_NEXT) == 0) {
+ offset = skipVInt(offset);
+ }
+
+ assert offset < this.arcs.length;
+ return offset;
+ }
+
+ /**
+ * Read a v-int.
+ */
+ static int readVInt(byte[] array, int offset) {
+ byte b = array[offset];
+ int value = b & 0x7F;
+
+ for (int shift = 7; b < 0; shift += 7) {
+ b = array[++offset];
+ value |= (b & 0x7F) << shift;
+ }
+
+ return value;
+ }
+
+ /**
+ * Write a v-int to a byte array.
+ */
+ static int writeVInt(byte[] array, int offset, int value) {
+ assert value >= 0 : "Can't v-code negative ints.";
+
+ while (value > 0x7F) {
+ array[offset++] = (byte) (0x80 | (value & 0x7F));
+ value >>= 7;
+ }
+ array[offset++] = (byte) value;
+
+ return offset;
+ }
+
+ /**
+ * Return the byte-length of a v-coded int.
+ */
+ static int vIntLength(int value) {
+ assert value >= 0 : "Can't v-code negative ints.";
+
+ int bytes;
+ for (bytes = 1; value >= 0x80; bytes++) {
+ value >>= 7;
+ }
+
+ return bytes;
+ }
+
+ /**
+ * Skip a v-int.
+ */
+ private int skipVInt(int offset) {
+ while (arcs[offset++] < 0);
+ return offset;
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java
new file mode 100644
index 0000000..8026f33
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java
@@ -0,0 +1,543 @@
+package morfologik.fsa;
+
+import static morfologik.fsa.CFSA2.BIT_FINAL_ARC;
+import static morfologik.fsa.CFSA2.BIT_LAST_ARC;
+import static morfologik.fsa.CFSA2.BIT_TARGET_NEXT;
+import static morfologik.fsa.FSAFlags.FLEXIBLE;
+import static morfologik.fsa.FSAFlags.NEXTBIT;
+import static morfologik.fsa.FSAFlags.NUMBERS;
+import static morfologik.fsa.FSAFlags.STOPBIT;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayDeque;
+import java.util.Comparator;
+import java.util.EnumSet;
+import java.util.PriorityQueue;
+import java.util.Set;
+import java.util.TreeSet;
+
+import morfologik.fsa.FSAUtils.IntIntHolder;
+import morfologik.util.FileUtils;
+
+import com.carrotsearch.hppc.BitSet;
+import com.carrotsearch.hppc.BoundedProportionalArraySizingStrategy;
+import com.carrotsearch.hppc.IntArrayList;
+import com.carrotsearch.hppc.IntIntOpenHashMap;
+import com.carrotsearch.hppc.IntStack;
+import com.carrotsearch.hppc.cursors.IntCursor;
+import com.carrotsearch.hppc.cursors.IntIntCursor;
+
+/**
+ * Serializes in-memory {@link FSA} graphs to {@link CFSA2}.
+ *
+ * <p>
+ * It is possible to serialize the automaton with numbers required for perfect
+ * hashing. See {@link #withNumbers()} method.
+ * </p>
+ *
+ * @see CFSA2
+ * @see FSA#read(java.io.InputStream)
+ */
+public final class CFSA2Serializer implements FSASerializer {
+ /**
+ * Supported flags.
+ */
+ private final static EnumSet<FSAFlags> flags = EnumSet.of(NUMBERS, FLEXIBLE, STOPBIT, NEXTBIT);
+
+ /**
+ * No-state id.
+ */
+ private final static int NO_STATE = -1;
+
+ /**
+ * <code>true</code> if we should serialize with numbers.
+ *
+ * @see #withNumbers()
+ */
+ private boolean withNumbers;
+
+ /**
+ * A hash map of [state, offset] pairs.
+ */
+ private IntIntOpenHashMap offsets = new IntIntOpenHashMap();
+
+ /**
+ * A hash map of [state, right-language-count] pairs.
+ */
+ private IntIntOpenHashMap numbers = new IntIntOpenHashMap();
+
+ /**
+ * Scratch array for serializing vints.
+ */
+ private final byte [] scratch = new byte [5];
+
+ /**
+ * The most frequent labels for integrating with the flags field.
+ */
+ private byte [] labelsIndex;
+
+ /**
+ * Inverted index of labels to be integrated with flags field. A label
+ * at index <code>i<code> has the index or zero (no integration).
+ */
+ private int [] labelsInvIndex;
+
+ /**
+ * Logger for progress.
+ */
+ private IMessageLogger logger = new NullMessageLogger();
+
+ /**
+ * Serialize the automaton with the number of right-language sequences in
+ * each node. This is required to implement perfect hashing. The numbering
+ * also preserves the order of input sequences.
+ *
+ * @return Returns the same object for easier call chaining.
+ */
+ public CFSA2Serializer withNumbers() {
+ withNumbers = true;
+ return this;
+ }
+
+ /**
+ * Serializes any {@link FSA} to {@link CFSA2} stream.
+ *
+ * @see #withNumbers
+ * @return Returns <code>os</code> for chaining.
+ */
+ @Override
+ public <T extends OutputStream> T serialize(final FSA fsa, T os) throws IOException {
+ /*
+ * Calculate the most frequent labels and build indexed labels dictionary.
+ */
+ computeLabelsIndex(fsa);
+
+ /*
+ * Calculate the number of bytes required for the node data, if
+ * serializing with numbers.
+ */
+ if (withNumbers) {
+ this.numbers = FSAUtils.rightLanguageForAllStates(fsa);
+ }
+
+ /*
+ * Linearize all the states, optimizing their layout.
+ */
+ IntArrayList linearized = linearize(fsa);
+
+ /*
+ * Emit the header.
+ */
+ FileUtils.writeInt(os, FSAHeader.FSA_MAGIC);
+ os.write(CFSA2.VERSION);
+
+ EnumSet<FSAFlags> fsaFlags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
+ if (withNumbers) fsaFlags.add(NUMBERS);
+ FileUtils.writeShort(os, FSAFlags.asShort(fsaFlags));
+
+ /*
+ * Emit labels index.
+ */
+ os.write(labelsIndex.length);
+ os.write(labelsIndex);
+
+ /*
+ * Emit the automaton.
+ */
+ int size = emitNodes(fsa, os, linearized);
+ assert size == 0 : "Size changed in the final pass?";
+
+ return os;
+ }
+
+ /**
+ * Compute a set of labels to be integrated with the flags field.
+ */
+ private void computeLabelsIndex(final FSA fsa) {
+ // Compute labels count.
+ final int [] countByValue = new int [256];
+
+ fsa.visitAllStates(new StateVisitor() {
+ public boolean accept(int state) {
+ for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc))
+ countByValue[fsa.getArcLabel(arc) & 0xff]++;
+ return true;
+ }
+ });
+
+ // Order by descending frequency of counts and increasing label value.
+ Comparator<IntIntHolder> comparator = new Comparator<IntIntHolder>() {
+ public int compare(IntIntHolder o1, IntIntHolder o2) {
+ int countDiff = o2.b - o1.b;
+ if (countDiff == 0) {
+ countDiff = o1.a - o2.a;
+ }
+ return countDiff;
+ }
+ };
+
+ TreeSet<IntIntHolder> labelAndCount = new TreeSet<IntIntHolder>(comparator);
+ for (int label = 0; label < countByValue.length; label++) {
+ if (countByValue[label] > 0) {
+ labelAndCount.add(new IntIntHolder(label, countByValue[label]));
+ }
+ }
+
+ this.logger.startPart("Label distribution");
+ for (IntIntHolder c : labelAndCount) {
+ this.logger.log("0x" + Integer.toHexString(c.a), c.b);
+ }
+ this.logger.endPart();
+
+ labelsIndex = new byte [1 + Math.min(labelAndCount.size(), CFSA2.LABEL_INDEX_SIZE)];
+ labelsInvIndex = new int [256];
+ for (int i = labelsIndex.length - 1; i > 0 && !labelAndCount.isEmpty(); i--) {
+ IntIntHolder p = labelAndCount.first();
+ labelAndCount.remove(p);
+ labelsInvIndex[p.a] = i;
+ labelsIndex[i] = (byte) p.a;
+ }
+ }
+
+ /**
+ * Return supported flags.
+ */
+ @Override
+ public Set<FSAFlags> getFlags() {
+ return flags;
+ }
+
+ /**
+ * Linearization of states.
+ */
+ private IntArrayList linearize(final FSA fsa) throws IOException {
+ /*
+ * Compute the states with most inlinks. These should be placed as close to the
+ * start of the automaton, as possible so that v-coded addresses are tiny.
+ */
+ final IntIntOpenHashMap inlinkCount = computeInlinkCount(fsa);
+
+ /*
+ * An array of ordered states for serialization.
+ */
+ final IntArrayList linearized = new IntArrayList(0,
+ new BoundedProportionalArraySizingStrategy(1000, 10000, 1.5f));
+
+ /*
+ * Determine which states should be linearized first (at fixed positions) so as to
+ * minimize the place occupied by goto fields.
+ */
+ int maxStates = Integer.MAX_VALUE;
+ int minInlinkCount = 2;
+ ArrayDeque<Integer> statesQueue = computeFirstStates(inlinkCount, maxStates, minInlinkCount);
+ IntArrayList states = new IntArrayList();
+ while (!statesQueue.isEmpty())
+ states.add(statesQueue.pop());
+
+ /*
+ * Compute initial addresses, without node rearrangements.
+ */
+ int serializedSize = linearizeAndCalculateOffsets(fsa, new IntArrayList(), linearized, offsets);
+
+ /*
+ * Probe for better node arrangements by selecting between [lower, upper]
+ * nodes from the potential candidate nodes list.
+ */
+ IntArrayList sublist = new IntArrayList();
+ sublist.buffer = states.buffer;
+ sublist.elementsCount = states.elementsCount;
+
+ /*
+ * Probe the initial region a little bit, looking for optimal cut. It can't be binary search
+ * because the result isn't monotonic.
+ */
+ logger.startPart("Compacting");
+ logger.log("Initial output size", serializedSize);
+ int cutAt = 0;
+ for (int cut = Math.min(25, states.size()); cut <= Math.min(150, states.size()); cut += 25) {
+ sublist.elementsCount = cut;
+ int newSize = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);
+ logger.log("Moved " + sublist.size() + " states, output size", newSize);
+ if (newSize >= serializedSize) {
+ break;
+ }
+ cutAt = cut;
+ }
+
+ /*
+ * Cut at the calculated point and repeat linearization.
+ */
+ sublist.elementsCount = cutAt;
+ int size = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);
+
+ logger.log("Will move " + sublist.size() + " states, final size", size);
+ logger.endPart();
+
+ return linearized;
+ }
+
+ /**
+ * Linearize all states, putting <code>states</code> in front of the automaton and
+ * calculating stable state offsets.
+ */
+ private int linearizeAndCalculateOffsets(FSA fsa, IntArrayList states,
+ IntArrayList linearized, IntIntOpenHashMap offsets) throws IOException
+ {
+ final BitSet visited = new BitSet();
+ final IntStack nodes = new IntStack();
+ linearized.clear();
+
+ /*
+ * Linearize states with most inlinks first.
+ */
+ for (int i = 0; i < states.size(); i++) {
+ linearizeState(fsa, nodes, linearized, visited, states.get(i));
+ }
+
+ /*
+ * Linearize the remaining states by chaining them one after another, in depth-order.
+ */
+ nodes.push(fsa.getRootNode());
+ while (!nodes.isEmpty()) {
+ final int node = nodes.pop();
+ if (visited.get(node))
+ continue;
+
+ linearizeState(fsa, nodes, linearized, visited, node);
+ }
+
+ /*
+ * Calculate new state offsets. This is iterative. We start with
+ * maximum potential offsets and recalculate until converged.
+ */
+ int MAX_OFFSET = Integer.MAX_VALUE;
+ for (IntCursor c : linearized) {
+ offsets.put(c.value, MAX_OFFSET);
+ }
+
+ int i, j = 0;
+ while ((i = emitNodes(fsa, null, linearized)) > 0) {
+ j = i;
+ }
+ return j;
+ }
+
+ /**
+ * Add a state to linearized list.
+ */
+ private void linearizeState(final FSA fsa,
+ IntStack nodes,
+ IntArrayList linearized,
+ BitSet visited, int node)
+ {
+ linearized.add(node);
+ visited.set(node);
+ for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+ if (!fsa.isArcTerminal(arc)) {
+ final int target = fsa.getEndNode(arc);
+ if (!visited.get(target))
+ nodes.push(target);
+ }
+ }
+ }
+
+ /**
+ * Compute the set of states that should be linearized first to minimize other
+ * states goto length.
+ */
+ private ArrayDeque<Integer> computeFirstStates(IntIntOpenHashMap inlinkCount,
+ int maxStates,
+ int minInlinkCount)
+ {
+ Comparator<IntIntHolder> comparator = new Comparator<FSAUtils.IntIntHolder>() {
+ public int compare(IntIntHolder o1, IntIntHolder o2) {
+ int v = o1.a - o2.a;
+ return v == 0 ? (o1.b - o2.b) : v;
+ }
+ };
+
+ PriorityQueue<IntIntHolder> stateInlink = new PriorityQueue<IntIntHolder>(1, comparator);
+ IntIntHolder scratch = new IntIntHolder();
+ for (IntIntCursor c : inlinkCount) {
+ if (c.value > minInlinkCount) {
+ scratch.a = c.value;
+ scratch.b = c.key;
+
+ if (stateInlink.size() < maxStates || comparator.compare(scratch, stateInlink.peek()) > 0) {
+ stateInlink.add(new IntIntHolder(c.value, c.key));
+ if (stateInlink.size() > maxStates) stateInlink.remove();
+ }
+ }
+ }
+
+ ArrayDeque<Integer> states = new ArrayDeque<Integer>();
+ while (!stateInlink.isEmpty()) {
+ IntIntHolder i = stateInlink.remove();
+ states.addFirst(i.b);
+ }
+ return states;
+ }
+
+ /**
+ * Compute in-link count for each state.
+ */
+ private IntIntOpenHashMap computeInlinkCount(final FSA fsa) {
+ IntIntOpenHashMap inlinkCount = new IntIntOpenHashMap();
+ BitSet visited = new BitSet();
+ IntStack nodes = new IntStack();
+ nodes.push(fsa.getRootNode());
+
+ while (!nodes.isEmpty()) {
+ final int node = nodes.pop();
+ if (visited.get(node))
+ continue;
+
+ visited.set(node);
+
+ for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+ if (!fsa.isArcTerminal(arc)) {
+ final int target = fsa.getEndNode(arc);
+ inlinkCount.putOrAdd(target, 1, 1);
+ if (!visited.get(target))
+ nodes.push(target);
+ }
+ }
+ }
+
+ return inlinkCount;
+ }
+
+ /**
+ * Update arc offsets assuming the given goto length.
+ */
+ private int emitNodes(FSA fsa, OutputStream os, IntArrayList linearized) throws IOException {
+ int offset = 0;
+
+ // Add epsilon state.
+ offset += emitNodeData(os, 0);
+ if (fsa.getRootNode() != 0)
+ offset += emitArc(os, BIT_LAST_ARC, (byte) '^', offsets.get(fsa.getRootNode()));
+ else
+ offset += emitArc(os, BIT_LAST_ARC, (byte) '^', 0);
+
+ boolean offsetsChanged = false;
+ final int max = linearized.size();
+ for (IntCursor c : linearized) {
+ final int state = c.value;
+ final int nextState = c.index + 1 < max ? linearized.get(c.index + 1) : NO_STATE;
+
+ if (os == null) {
+ offsetsChanged |= (offsets.get(state) != offset);
+ offsets.put(state, offset);
+ } else {
+ assert offsets.get(state) == offset : state + " " + offsets.get(state) + " " + offset;
+ }
+
+ offset += emitNodeData(os, withNumbers ? numbers.get(state) : 0);
+ offset += emitNodeArcs(fsa, os, state, nextState);
+ }
+
+ return offsetsChanged ? offset : 0;
+ }
+
+ /**
+ * Emit all arcs of a single node.
+ */
+ private int emitNodeArcs(FSA fsa, OutputStream os,
+ final int state, final int nextState) throws IOException {
+ int offset = 0;
+ for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
+ int targetOffset;
+ final int target;
+
+ if (fsa.isArcTerminal(arc)) {
+ target = 0;
+ targetOffset = 0;
+ } else {
+ target = fsa.getEndNode(arc);
+ targetOffset = offsets.get(target);
+ }
+
+ int flags = 0;
+
+ if (fsa.isArcFinal(arc)) {
+ flags |= BIT_FINAL_ARC;
+ }
+
+ if (fsa.getNextArc(arc) == 0) {
+ flags |= BIT_LAST_ARC;
+ }
+
+ if (targetOffset != 0 && target == nextState) {
+ flags |= BIT_TARGET_NEXT;
+ targetOffset = 0;
+ }
+
+ offset += emitArc(os, flags, fsa.getArcLabel(arc), targetOffset);
+ }
+
+ return offset;
+ }
+
+ /** */
+ private int emitArc(OutputStream os, int flags, byte label, int targetOffset)
+ throws IOException
+ {
+ int length = 0;
+
+ int labelIndex = labelsInvIndex[label & 0xff];
+ if (labelIndex > 0) {
+ if (os != null) os.write(flags | labelIndex);
+ length++;
+ } else {
+ if (os != null) {
+ os.write(flags);
+ os.write(label);
+ }
+ length += 2;
+ }
+
+ if ((flags & BIT_TARGET_NEXT) == 0) {
+ int len = CFSA2.writeVInt(scratch, 0, targetOffset);
+ if (os != null) {
+ os.write(scratch, 0, len);
+ }
+ length += len;
+ }
+
+ return length;
+ }
+
+ /** */
+ private int emitNodeData(OutputStream os, int number) throws IOException {
+ int size = 0;
+
+ if (withNumbers) {
+ size = CFSA2.writeVInt(scratch, 0, number);
+ if (os != null) {
+ os.write(scratch, 0, size);
+ }
+ }
+
+ return size;
+ }
+
+ /** */
+ @Override
+ public CFSA2Serializer withFiller(byte filler) {
+ throw new UnsupportedOperationException("CFSA2 does not support filler. Use .info file.");
+ }
+
+ /** */
+ @Override
+ public CFSA2Serializer withAnnotationSeparator(byte annotationSeparator) {
+ throw new UnsupportedOperationException("CFSA2 does not support separator. Use .info file.");
+ }
+
+ @Override
+ public CFSA2Serializer withLogger(IMessageLogger logger) {
+ this.logger = logger;
+ return this;
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java b/morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java
new file mode 100644
index 0000000..2f6d412
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java
@@ -0,0 +1,134 @@
+package morfologik.fsa;
+
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * An FSA with constant-size arc representation produced directly
+ * by {@link FSABuilder}.
+ *
+ * @see FSABuilder
+ */
+public final class ConstantArcSizeFSA extends FSA {
+ /** Size of the target address field (constant for the builder). */
+ public final static int TARGET_ADDRESS_SIZE = 4;
+
+ /** Size of the flags field (constant for the builder). */
+ public final static int FLAGS_SIZE = 1;
+
+ /** Size of the label field (constant for the builder). */
+ public final static int LABEL_SIZE = 1;
+
+ /**
+ * Size of a single arc structure.
+ */
+ public final static int ARC_SIZE = FLAGS_SIZE + LABEL_SIZE + TARGET_ADDRESS_SIZE;
+
+ /** Offset of the flags field inside an arc. */
+ public final static int FLAGS_OFFSET = 0;
+
+ /** Offset of the label field inside an arc. */
+ public final static int LABEL_OFFSET = FLAGS_SIZE;
+
+ /** Offset of the address field inside an arc. */
+ public final static int ADDRESS_OFFSET = LABEL_OFFSET + LABEL_SIZE;
+
+ /** A dummy address of the terminal state. */
+ final static int TERMINAL_STATE = 0;
+
+ /**
+ * An arc flag indicating the target node of an arc corresponds to a final
+ * state.
+ */
+ public final static int BIT_ARC_FINAL = 1 << 1;
+
+ /** An arc flag indicating the arc is last within its state. */
+ public final static int BIT_ARC_LAST = 1 << 0;
+
+ /**
+ * An epsilon state. The first and only arc of this state points either
+ * to the root or to the terminal state, indicating an empty automaton.
+ */
+ private final int epsilon;
+
+ /**
+ * FSA data, serialized as a byte array.
+ */
+ private final byte[] data;
+
+ /**
+ * @param data FSA data. There must be no trailing bytes after the last state.
+ */
+ ConstantArcSizeFSA(byte[] data, int epsilon) {
+ assert epsilon == 0 : "Epsilon is not zero?";
+
+ this.epsilon = epsilon;
+ this.data = data;
+ }
+
+ @Override
+ public int getRootNode() {
+ return getEndNode(getFirstArc(epsilon));
+ }
+
+ @Override
+ public int getFirstArc(int node) {
+ return node;
+ }
+
+ @Override
+ public int getArc(int node, byte label) {
+ for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
+ if (getArcLabel(arc) == label)
+ return arc;
+ }
+ return 0;
+ }
+
+ @Override
+ public int getNextArc(int arc) {
+ if (isArcLast(arc))
+ return 0;
+ return arc + ARC_SIZE;
+ }
+
+ @Override
+ public byte getArcLabel(int arc) {
+ return data[arc + LABEL_OFFSET];
+ }
+
+ /**
+ * Fills the target state address of an arc.
+ */
+ private int getArcTarget(int arc) {
+ arc += ADDRESS_OFFSET;
+ return (data[arc]) << 24 |
+ (data[arc + 1] & 0xff) << 16 |
+ (data[arc + 2] & 0xff) << 8 |
+ (data[arc + 3] & 0xff);
+ }
+
+ @Override
+ public boolean isArcFinal(int arc) {
+ return (data[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
+ }
+
+ @Override
+ public boolean isArcTerminal(int arc) {
+ return getArcTarget(arc) == 0;
+ }
+
+ private boolean isArcLast(int arc) {
+ return (data[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
+ }
+
+ @Override
+ public int getEndNode(int arc) {
+ return getArcTarget(arc);
+ }
+
+ @Override
+ public Set<FSAFlags> getFlags() {
+ return Collections.emptySet();
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSA.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSA.java
new file mode 100644
index 0000000..d734b95
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSA.java
@@ -0,0 +1,286 @@
+package morfologik.fsa;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * This is a top abstract class for handling finite state automata. These
+ * automata are arc-based, a design described in Jan Daciuk's <i>Incremental
+ * Construction of Finite-State Automata and Transducers, and Their Use in the
+ * Natural Language Processing</i> (PhD thesis, Technical University of Gdansk).
+ *
+ * <p>
+ * Concrete subclasses (implementations) provide varying tradeoffs and features:
+ * traversal speed vs. memory size, for example.
+ * </p>
+ *
+ * @see FSABuilder
+ */
+public abstract class FSA implements Iterable<ByteBuffer> {
+ /**
+ * @return Returns the identifier of the root node of this automaton.
+ * Returns 0 if the start node is also the end node (the automaton
+ * is empty).
+ */
+ public abstract int getRootNode();
+
+ /**
+ * @return Returns the identifier of the first arc leaving <code>node</code>
+ * or 0 if the node has no outgoing arcs.
+ */
+ public abstract int getFirstArc(int node);
+
+ /**
+ * @return Returns the identifier of the next arc after <code>arc</code> and
+ * leaving <code>node</code>. Zero is returned if no more arcs are
+ * available for the node.
+ */
+ public abstract int getNextArc(int arc);
+
+ /**
+ * @return Returns the identifier of an arc leaving <code>node</code> and
+ * labeled with <code>label</code>. An identifier equal to 0 means
+ * the node has no outgoing arc labeled <code>label</code>.
+ */
+ public abstract int getArc(int node, byte label);
+
+ /**
+ * Return the label associated with a given <code>arc</code>.
+ */
+ public abstract byte getArcLabel(int arc);
+
+ /**
+ * Returns <code>true</code> if the destination node at the end of this
+ * <code>arc</code> corresponds to an input sequence created when building
+ * this automaton.
+ */
+ public abstract boolean isArcFinal(int arc);
+
+ /**
+ * Returns <code>true</code> if this <code>arc</code> does not have a
+ * terminating node (@link {@link #getEndNode(int)} will throw an
+ * exception). Implies {@link #isArcFinal(int)}.
+ */
+ public abstract boolean isArcTerminal(int arc);
+
+ /**
+ * Return the end node pointed to by a given <code>arc</code>. Terminal arcs
+ * (those that point to a terminal state) have no end node representation
+ * and throw a runtime exception.
+ */
+ public abstract int getEndNode(int arc);
+
+ /**
+ * Returns a set of flags for this FSA instance.
+ */
+ public abstract Set<FSAFlags> getFlags();
+
+ /**
+ * Calculates the number of arcs of a given node. Unless really required,
+ * use the following idiom for looping through all arcs:
+ * <pre>
+ * for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+ * }
+ * </pre>
+ */
+ public int getArcCount(int node) {
+ int count = 0;
+ for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
+ count++;
+ }
+ return count;
+ }
+
+ /**
+ * @return Returns the number of sequences reachable from the given state if
+ * the automaton was compiled with {@link FSAFlags#NUMBERS}. The size of
+ * the right language of the state, in other words.
+ *
+ * @throws UnsupportedOperationException If the automaton was not compiled with
+ * {@link FSAFlags#NUMBERS}. The value can then be computed by manual count
+ * of {@link #getSequences(int)}.
+ */
+ public int getRightLanguageCount(int node) {
+ throw new UnsupportedOperationException("Automaton not compiled with " + FSAFlags.NUMBERS);
+ }
+
+ /**
+ * Returns an iterator over all binary sequences starting at the given FSA
+ * state (node) and ending in final nodes. This corresponds to a set of
+ * suffixes of a given prefix from all sequences stored in the automaton.
+ *
+ * <p>
+ * The returned iterator is a {@link ByteBuffer} whose contents changes on
+ * each call to {@link Iterator#next()}. The keep the contents between calls
+ * to {@link Iterator#next()}, one must copy the buffer to some other
+ * location.
+ * </p>
+ *
+ * <p>
+ * <b>Important.</b> It is guaranteed that the returned byte buffer is
+ * backed by a byte array and that the content of the byte buffer starts at
+ * the array's index 0.
+ * </p>
+ *
+ * @see Iterable
+ */
+ public Iterable<ByteBuffer> getSequences(final int node) {
+ if (node == 0) {
+ return Collections.<ByteBuffer> emptyList();
+ }
+
+ return new Iterable<ByteBuffer>() {
+ public Iterator<ByteBuffer> iterator() {
+ return new FSAFinalStatesIterator(FSA.this, node);
+ }
+ };
+ }
+
+ /**
+ * An alias of calling {@link #iterator} directly ({@link FSA} is also
+ * {@link Iterable}).
+ */
+ public final Iterable<ByteBuffer> getSequences() {
+ return getSequences(getRootNode());
+ }
+
+ /**
+ * Returns an iterator over all binary sequences starting from the initial
+ * FSA state (node) and ending in final nodes. The returned iterator is a
+ * {@link ByteBuffer} whose contents changes on each call to
+ * {@link Iterator#next()}. The keep the contents between calls to
+ * {@link Iterator#next()}, one must copy the buffer to some other location.
+ *
+ * <p>
+ * <b>Important.</b> It is guaranteed that the returned byte buffer is
+ * backed by a byte array and that the content of the byte buffer starts at
+ * the array's index 0.
+ * </p>
+ *
+ * @see Iterable
+ */
+ public final Iterator<ByteBuffer> iterator() {
+ return getSequences().iterator();
+ }
+
+ /**
+ * Visit all states. The order of visiting is undefined. This method may be faster
+ * than traversing the automaton in post or preorder since it can scan states
+ * linearly. Returning false from {@link StateVisitor#accept(int)}
+ * immediately terminates the traversal.
+ */
+ public <T extends StateVisitor> T visitAllStates(T v) {
+ return visitInPostOrder(v);
+ }
+
+ /**
+ * Same as {@link #visitInPostOrder(StateVisitor, int)},
+ * starting from root automaton node.
+ */
+ public <T extends StateVisitor> T visitInPostOrder(T v) {
+ return visitInPostOrder(v, getRootNode());
+ }
+
+ /**
+ * Visits all states reachable from <code>node</code> in postorder.
+ * Returning false from {@link StateVisitor#accept(int)}
+ * immediately terminates the traversal.
+ */
+ public <T extends StateVisitor> T visitInPostOrder(T v, int node) {
+ visitInPostOrder(v, node, new BitSet());
+ return v;
+ }
+
+ /** Private recursion. */
+ private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) {
+ if (visited.get(node))
+ return true;
+ visited.set(node);
+
+ for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
+ if (!isArcTerminal(arc)) {
+ if (!visitInPostOrder(v, getEndNode(arc), visited))
+ return false;
+ }
+ }
+
+ return v.accept(node);
+ }
+
+ /**
+ * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from root automaton node.
+ */
+ public <T extends StateVisitor> T visitInPreOrder(T v) {
+ return visitInPreOrder(v, getRootNode());
+ }
+
+ /**
+ * Visits all states in preorder. Returning false from {@link StateVisitor#accept(int)}
+ * skips traversal of all sub-states of a given state.
+ */
+ public <T extends StateVisitor> T visitInPreOrder(T v, int node) {
+ visitInPreOrder(v, node, new BitSet());
+ return v;
+ }
+
+ /** Private recursion. */
+ private void visitInPreOrder(StateVisitor v, int node, BitSet visited) {
+ if (visited.get(node))
+ return;
+ visited.set(node);
+
+ if (v.accept(node)) {
+ for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
+ if (!isArcTerminal(arc)) {
+ visitInPreOrder(v, getEndNode(arc), visited);
+ }
+ }
+ }
+ }
+
+ /**
+ * A factory for reading automata in any of the supported versions. If
+ * possible, explicit constructors should be used.
+ *
+ * @see FSA5#FSA5(InputStream)
+ */
+ @SuppressWarnings("unchecked")
+ public static <T extends FSA> T read(InputStream in) throws IOException {
+ if (!in.markSupported()) {
+ in = new BufferedInputStream(in, Math.max(FSAHeader.MAX_HEADER_LENGTH + 1, 1024));
+ }
+
+ in.mark(FSAHeader.MAX_HEADER_LENGTH);
+ FSAHeader header = FSAHeader.read(in);
+ in.reset();
+
+ if (header.version == FSA5.VERSION)
+ return (T) new FSA5(in);
+
+ if (header.version == CFSA.VERSION)
+ return (T) new CFSA(in);
+
+ if (header.version == CFSA2.VERSION)
+ return (T) new CFSA2(in);
+
+ throw new IOException("Unsupported automaton version: "
+ + header.version);
+ }
+
+ public static FSA read(File fsa) throws IOException {
+ InputStream is = new BufferedInputStream(new FileInputStream(fsa));
+ try {
+ return read(is);
+ } finally {
+ is.close();
+ }
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java
new file mode 100644
index 0000000..d43f4d8
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java
@@ -0,0 +1,323 @@
+package morfologik.fsa;
+
+import static morfologik.fsa.FSAFlags.FLEXIBLE;
+import static morfologik.fsa.FSAFlags.NEXTBIT;
+import static morfologik.fsa.FSAFlags.NUMBERS;
+import static morfologik.fsa.FSAFlags.STOPBIT;
+import static morfologik.util.FileUtils.readFully;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Set;
+
+/**
+ * FSA binary format implementation for version 5.
+ *
+ * <p>
+ * Version 5 indicates the dictionary was built with these flags:
+ * {@link FSAFlags#FLEXIBLE}, {@link FSAFlags#STOPBIT} and
+ * {@link FSAFlags#NEXTBIT}. The internal representation of the FSA must
+ * therefore follow this description (please note this format describes only a
+ * single transition (arc), not the entire dictionary file).
+ *
+ * <pre>
+ * ---- this node header present only if automaton was compiled with NUMBERS option.
+ * Byte
+ * +-+-+-+-+-+-+-+-+\
+ * 0 | | | | | | | | | \ LSB
+ * +-+-+-+-+-+-+-+-+ +
+ * 1 | | | | | | | | | | number of strings recognized
+ * +-+-+-+-+-+-+-+-+ +----- by the automaton starting
+ * : : : : : : : : : | from this node.
+ * +-+-+-+-+-+-+-+-+ +
+ * ctl-1 | | | | | | | | | / MSB
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * ---- remaining part of the node
+ *
+ * Byte
+ * +-+-+-+-+-+-+-+-+\
+ * 0 | | | | | | | | | +------ label
+ * +-+-+-+-+-+-+-+-+/
+ *
+ * +------------- node pointed to is next
+ * | +----------- the last arc of the node
+ * | | +--------- the arc is final
+ * | | |
+ * +-----------+
+ * | | | | |
+ * ___+___ | | | |
+ * / \ | | | |
+ * MSB LSB |
+ * 7 6 5 4 3 2 1 0 |
+ * +-+-+-+-+-+-+-+-+ |
+ * 1 | | | | | | | | | \ \
+ * +-+-+-+-+-+-+-+-+ \ \ LSB
+ * +-+-+-+-+-+-+-+-+ +
+ * 2 | | | | | | | | | |
+ * +-+-+-+-+-+-+-+-+ |
+ * 3 | | | | | | | | | +----- target node address (in bytes)
+ * +-+-+-+-+-+-+-+-+ | (not present except for the byte
+ * : : : : : : : : : | with flags if the node pointed to
+ * +-+-+-+-+-+-+-+-+ + is next)
+ * gtl | | | | | | | | | / MSB
+ * +-+-+-+-+-+-+-+-+ /
+ * gtl+1 (gtl = gotoLength)
+ * </pre>
+ */
+public final class FSA5 extends FSA {
+ /**
+ * Default filler byte.
+ */
+ public final static byte DEFAULT_FILLER = '_';
+
+ /**
+ * Default annotation byte.
+ */
+ public final static byte DEFAULT_ANNOTATION = '+';
+
+ /**
+ * Automaton version as in the file header.
+ */
+ public static final byte VERSION = 5;
+
+ /**
+ * Bit indicating that an arc corresponds to the last character of a
+ * sequence available when building the automaton.
+ */
+ public static final int BIT_FINAL_ARC = 1 << 0;
+
+ /**
+ * Bit indicating that an arc is the last one of the node's list and the
+ * following one belongs to another node.
+ */
+ public static final int BIT_LAST_ARC = 1 << 1;
+
+ /**
+ * Bit indicating that the target node of this arc follows it in the
+ * compressed automaton structure (no goto field).
+ */
+ public static final int BIT_TARGET_NEXT = 1 << 2;
+
+ /**
+ * An offset in the arc structure, where the address and flags field begins.
+ * In version 5 of FSA automata, this value is constant (1, skip label).
+ */
+ public final static int ADDRESS_OFFSET = 1;
+
+ /**
+ * An array of bytes with the internal representation of the automaton.
+ * Please see the documentation of this class for more information on how
+ * this structure is organized.
+ */
+ public final byte[] arcs;
+
+ /**
+ * The length of the node header structure (if the automaton was compiled with
+ * <code>NUMBERS</code> option). Otherwise zero.
+ */
+ public final int nodeDataLength;
+
+ /**
+ * Flags for this automaton version.
+ */
+ private final Set<FSAFlags> flags;
+
+ /**
+ * Number of bytes each address takes in full, expanded form (goto length).
+ */
+ public final int gtl;
+
+ /** Filler character. */
+ public final byte filler;
+
+ /** Annotation character. */
+ public final byte annotation;
+
+ /**
+ * Read and wrap a binary automaton in FSA version 5.
+ */
+ public FSA5(InputStream fsaStream) throws IOException {
+ // Read the header first.
+ final FSAHeader header = FSAHeader.read(fsaStream);
+
+ // Ensure we have version 5.
+ if (header.version != VERSION) {
+ throw new IOException("This class can read FSA version 5 only: " + header.version);
+ }
+
+ /*
+ * Determine if the automaton was compiled with NUMBERS. If so, modify
+ * ctl and goto fields accordingly.
+ */
+ flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
+ if ((header.gtl & 0xf0) != 0) {
+ flags.add(NUMBERS);
+ }
+
+ this.nodeDataLength = (header.gtl >>> 4) & 0x0f;
+ this.gtl = header.gtl & 0x0f;
+
+ this.filler = header.filler;
+ this.annotation = header.annotation;
+
+ arcs = readFully(fsaStream);
+ }
+
+ /**
+ * Returns the start node of this automaton.
+ */
+ @Override
+ public int getRootNode() {
+ // Skip dummy node marking terminating state.
+ final int epsilonNode = skipArc(getFirstArc(0));
+
+ // And follow the epsilon node's first (and only) arc.
+ return getDestinationNodeOffset(getFirstArc(epsilonNode));
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public final int getFirstArc(int node) {
+ return nodeDataLength + node;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public final int getNextArc(int arc) {
+ if (isArcLast(arc))
+ return 0;
+ else
+ return skipArc(arc);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getArc(int node, byte label) {
+ for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
+ if (getArcLabel(arc) == label)
+ return arc;
+ }
+
+ // An arc labeled with "label" not found.
+ return 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getEndNode(int arc) {
+ final int nodeOffset = getDestinationNodeOffset(arc);
+ assert nodeOffset != 0 : "No target node for terminal arcs.";
+ return nodeOffset;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public byte getArcLabel(int arc) {
+ return arcs[arc];
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isArcFinal(int arc) {
+ return (arcs[arc + ADDRESS_OFFSET] & BIT_FINAL_ARC) != 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isArcTerminal(int arc) {
+ return (0 == getDestinationNodeOffset(arc));
+ }
+
+ /**
+ * Returns the number encoded at the given node. The number equals the count
+ * of the set of suffixes reachable from <code>node</code> (called its right
+ * language).
+ */
+ @Override
+ public int getRightLanguageCount(int node) {
+ assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
+ return decodeFromBytes(arcs, node, nodeDataLength);
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * <p>For this automaton version, an additional {@link FSAFlags#NUMBERS} flag
+ * may be set to indicate the automaton contains extra fields for each node.</p>
+ */
+ @Override
+ public Set<FSAFlags> getFlags() {
+ return Collections.unmodifiableSet(flags);
+ }
+
+ /**
+ * Returns <code>true</code> if this arc has <code>LAST</code> bit set.
+ *
+ * @see #BIT_LAST_ARC
+ */
+ public boolean isArcLast(int arc) {
+ return (arcs[arc + ADDRESS_OFFSET] & BIT_LAST_ARC) != 0;
+ }
+
+ /**
+ * @see #BIT_TARGET_NEXT
+ */
+ public boolean isNextSet(int arc) {
+ return (arcs[arc + ADDRESS_OFFSET] & BIT_TARGET_NEXT) != 0;
+ }
+
+ /**
+ * Returns an n-byte integer encoded in byte-packed representation.
+ */
+ static final int decodeFromBytes(
+ final byte[] arcs, final int start, final int n)
+ {
+ int r = 0;
+ for (int i = n; --i >= 0;) {
+ r = r << 8 | (arcs[start + i] & 0xff);
+ }
+ return r;
+ }
+
+ /**
+ * Returns the address of the node pointed to by this arc.
+ */
+ final int getDestinationNodeOffset(int arc) {
+ if (isNextSet(arc)) {
+ /* The destination node follows this arc in the array. */
+ return skipArc(arc);
+ } else {
+ /*
+ * The destination node address has to be extracted from the arc's
+ * goto field.
+ */
+ return decodeFromBytes(arcs, arc + ADDRESS_OFFSET, gtl) >>> 3;
+ }
+ }
+
+ /**
+ * Read the arc's layout and skip as many bytes, as needed.
+ */
+ private int skipArc(int offset) {
+ return offset + (isNextSet(offset)
+ ? 1 + 1 /* label + flags */
+ : 1 + gtl /* label + flags/address */);
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java
new file mode 100644
index 0000000..21627a9
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java
@@ -0,0 +1,332 @@
+package morfologik.fsa;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.carrotsearch.hppc.*;
+import com.carrotsearch.hppc.BitSet;
+
+import static morfologik.fsa.FSAFlags.*;
+
+/**
+ * Serializes in-memory {@link FSA} graphs to a binary format compatible with
+ * Jan Daciuk's <code>fsa</code>'s package <code>FSA5</code> format.
+ *
+ * <p>
+ * It is possible to serialize the automaton with numbers required for perfect
+ * hashing. See {@link #withNumbers()} method.
+ * </p>
+ *
+ * @see FSA5
+ * @see FSA#read(java.io.InputStream)
+ */
+public final class FSA5Serializer implements FSASerializer {
+ /**
+ * Maximum number of bytes for a serialized arc.
+ */
+ private final static int MAX_ARC_SIZE = 1 + 5;
+
+ /**
+ * Maximum number of bytes for per-node data.
+ */
+ private final static int MAX_NODE_DATA_SIZE = 16;
+
+ /**
+ * Number of bytes for the arc's flags header (arc representation without
+ * the goto address).
+ */
+ private final static int SIZEOF_FLAGS = 1;
+
+ /**
+ * Supported flags.
+ */
+ private final static EnumSet<FSAFlags> flags = EnumSet.of(NUMBERS, SEPARATORS, FLEXIBLE, STOPBIT, NEXTBIT);
+
+ /**
+ * @see FSA5#filler
+ */
+ public byte fillerByte = FSA5.DEFAULT_FILLER;
+
+ /**
+ * @see FSA5#annotation
+ */
+ public byte annotationByte = FSA5.DEFAULT_ANNOTATION;
+
+ /**
+ * <code>true</code> if we should serialize with numbers.
+ *
+ * @see #withNumbers()
+ */
+ private boolean withNumbers;
+
+ /**
+ * A hash map of [state, offset] pairs.
+ */
+ private IntIntOpenHashMap offsets = new IntIntOpenHashMap();
+
+ /**
+ * A hash map of [state, right-language-count] pairs.
+ */
+ private IntIntOpenHashMap numbers = new IntIntOpenHashMap();
+
+ /**
+ * Serialize the automaton with the number of right-language sequences in
+ * each node. This is required to implement perfect hashing. The numbering
+ * also preserves the order of input sequences.
+ *
+ * @return Returns the same object for easier call chaining.
+ */
+ public FSA5Serializer withNumbers() {
+ withNumbers = true;
+ return this;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public FSA5Serializer withFiller(byte filler) {
+ this.fillerByte = filler;
+ return this;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public FSA5Serializer withAnnotationSeparator(byte annotationSeparator) {
+ this.annotationByte = annotationSeparator;
+ return this;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public FSASerializer withLogger(IMessageLogger logger) {
+ return this;
+ }
+
+ /**
+ * Serialize root state <code>s</code> to an output stream in
+ * <code>FSA5</code> format.
+ *
+ * @see #withNumbers
+ * @return Returns <code>os</code> for chaining.
+ */
+ @Override
+ public <T extends OutputStream> T serialize(final FSA fsa, T os)
+ throws IOException {
+
+ // Prepare space for arc offsets and linearize all the states.
+ int[] linearized = linearize(fsa);
+
+ /*
+ * Calculate the number of bytes required for the node data, if
+ * serializing with numbers.
+ */
+ int nodeDataLength = 0;
+ if (withNumbers) {
+ this.numbers = FSAUtils.rightLanguageForAllStates(fsa);
+ int maxNumber = numbers.get(fsa.getRootNode());
+ while (maxNumber > 0) {
+ nodeDataLength++;
+ maxNumber >>>= 8;
+ }
+ }
+
+ // Calculate minimal goto length.
+ int gtl = 1;
+ while (true) {
+ // First pass: calculate offsets of states.
+ if (!emitArcs(fsa, null, linearized, gtl, nodeDataLength)) {
+ gtl++;
+ continue;
+ }
+
+ // Second pass: check if goto overflows anywhere.
+ if (emitArcs(fsa, null, linearized, gtl, nodeDataLength))
+ break;
+
+ gtl++;
+ }
+
+ /*
+ * Emit the header.
+ */
+ os.write(new byte[] { '\\', 'f', 's', 'a' });
+ os.write(FSA5.VERSION);
+ os.write(fillerByte);
+ os.write(annotationByte);
+ os.write((nodeDataLength << 4) | gtl);
+
+ /*
+ * Emit the automaton.
+ */
+ boolean gtlUnchanged = emitArcs(fsa, os, linearized, gtl, nodeDataLength);
+ assert gtlUnchanged : "gtl changed in the final pass.";
+
+ return os;
+ }
+
+ /**
+ * Return supported flags.
+ */
+ @Override
+ public Set<FSAFlags> getFlags() {
+ return flags;
+ }
+
+ /**
+ * Linearization of states.
+ */
+ private int[] linearize(final FSA fsa) {
+ int[] linearized = new int[0];
+ int last = 0;
+
+ BitSet visited = new BitSet();
+ IntStack nodes = new IntStack();
+ nodes.push(fsa.getRootNode());
+
+ while (!nodes.isEmpty()) {
+ final int node = nodes.pop();
+ if (visited.get(node))
+ continue;
+
+ if (last >= linearized.length) {
+ linearized = Arrays.copyOf(linearized, linearized.length + 100000);
+ }
+
+ visited.set(node);
+ linearized[last++] = node;
+
+ for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+ if (!fsa.isArcTerminal(arc)) {
+ int target = fsa.getEndNode(arc);
+ if (!visited.get(target))
+ nodes.push(target);
+ }
+ }
+ }
+
+ return Arrays.copyOf(linearized, last);
+ }
+
+ /**
+ * Update arc offsets assuming the given goto length.
+ */
+ private boolean emitArcs(FSA fsa, OutputStream os, int[] linearized,
+ int gtl, int nodeDataLength) throws IOException {
+ final ByteBuffer bb = ByteBuffer.allocate(Math.max(MAX_NODE_DATA_SIZE,
+ MAX_ARC_SIZE));
+
+ int offset = 0;
+
+ // Add dummy terminal state.
+ offset += emitNodeData(bb, os, nodeDataLength, 0);
+ offset += emitArc(bb, os, gtl, 0, (byte) 0, 0);
+
+ // Add epsilon state.
+ offset += emitNodeData(bb, os, nodeDataLength, 0);
+ if (fsa.getRootNode() != 0)
+ offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC | FSA5.BIT_TARGET_NEXT, (byte) '^', 0);
+ else
+ offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC , (byte) '^', 0);
+
+ int maxStates = linearized.length;
+ for (int j = 0; j < maxStates; j++) {
+ final int s = linearized[j];
+
+ if (os == null) {
+ offsets.put(s, offset);
+ } else {
+ assert offsets.get(s) == offset : s + " " + offsets.get(s) + " " + offset;
+ }
+
+ offset += emitNodeData(bb, os, nodeDataLength, withNumbers ? numbers.get(s) : 0);
+
+ for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
+ int targetOffset;
+ final int target;
+ if (fsa.isArcTerminal(arc)) {
+ targetOffset = 0;
+ target = 0;
+ } else {
+ target = fsa.getEndNode(arc);
+ targetOffset = offsets.get(target);
+ }
+
+ int flags = 0;
+ if (fsa.isArcFinal(arc)) {
+ flags |= FSA5.BIT_FINAL_ARC;
+ }
+
+ if (fsa.getNextArc(arc) == 0) {
+ flags |= FSA5.BIT_LAST_ARC;
+
+ if (j + 1 < maxStates && target == linearized[j + 1]
+ && targetOffset != 0) {
+ flags |= FSA5.BIT_TARGET_NEXT;
+ targetOffset = 0;
+ }
+ }
+
+ int bytes = emitArc(bb, os, gtl, flags, fsa.getArcLabel(arc), targetOffset);
+ if (bytes < 0)
+ // gtl too small. interrupt eagerly.
+ return false;
+
+ offset += bytes;
+ }
+ }
+
+ return true;
+ }
+
+ /** */
+ private int emitArc(ByteBuffer bb, OutputStream os, int gtl, int flags, byte label, int targetOffset)
+ throws IOException
+ {
+ int arcBytes = (flags & FSA5.BIT_TARGET_NEXT) != 0 ? SIZEOF_FLAGS : gtl;
+
+ flags |= (targetOffset << 3);
+ bb.put(label);
+ for (int b = 0; b < arcBytes; b++) {
+ bb.put((byte) flags);
+ flags >>>= 8;
+ }
+
+ if (flags != 0) {
+ // gtl too small. interrupt eagerly.
+ return -1;
+ }
+
+ bb.flip();
+ int bytes = bb.remaining();
+ if (os != null) {
+ os.write(bb.array(), bb.position(), bb.remaining());
+ }
+ bb.clear();
+
+ return bytes;
+ }
+
+ /** */
+ private int emitNodeData(ByteBuffer bb, OutputStream os,
+ int nodeDataLength, int number) throws IOException {
+ if (nodeDataLength > 0 && os != null) {
+ for (int i = 0; i < nodeDataLength; i++) {
+ bb.put((byte) number);
+ number >>>= 8;
+ }
+
+ bb.flip();
+ os.write(bb.array(), bb.position(), bb.remaining());
+ bb.clear();
+ }
+
+ return nodeDataLength;
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java
new file mode 100644
index 0000000..0cf7cc0
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java
@@ -0,0 +1,486 @@
+package morfologik.fsa;
+
+import java.util.*;
+
+import morfologik.util.Arrays;
+
+import static morfologik.fsa.ConstantArcSizeFSA.*;
+
+/**
+ * Fast, memory-conservative finite state automaton builder, returning a
+ * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction
+ * speed and memory consumption).
+ */
+public final class FSABuilder {
+ /**
+ * Debug and information constants.
+ *
+ * @see FSABuilder#getInfo()
+ */
+ public enum InfoEntry {
+ SERIALIZATION_BUFFER_SIZE("Serialization buffer size"),
+ SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"),
+ CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"),
+ MAX_ACTIVE_PATH_LENGTH("Max active path"),
+ STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"),
+ STATE_REGISTRY_SIZE("Registry hash entries"),
+ ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)");
+
+ private final String stringified;
+
+ InfoEntry(String stringified) {
+ this.stringified = stringified;
+ }
+
+ @Override
+ public String toString() {
+ return stringified;
+ }
+ }
+
+ /** A megabyte. */
+ private final static int MB = 1024 * 1024;
+
+ /**
+ * Internal serialized FSA buffer expand ratio.
+ */
+ private final static int BUFFER_GROWTH_SIZE = 5 * MB;
+
+ /**
+ * Maximum number of labels from a single state.
+ */
+ private final static int MAX_LABELS = 256;
+
+ /**
+ * Comparator comparing full byte arrays consistently with
+ * {@link #compare(byte[], int, int, byte[], int, int)}.
+ */
+ public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() {
+ public int compare(byte[] o1, byte[] o2) {
+ return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length);
+ }
+ };
+
+ /**
+ * Internal serialized FSA buffer expand ratio.
+ */
+ private final int bufferGrowthSize;
+
+ /**
+ * Holds serialized and mutable states. Each state is a sequential list of
+ * arcs, the last arc is marked with {@link #BIT_ARC_LAST}.
+ */
+ private byte[] serialized = new byte[0];
+
+ /**
+ * Number of bytes already taken in {@link #serialized}. Start from 1 to
+ * keep 0 a sentinel value (for the hash set and final state).
+ */
+ private int size;
+
+ /**
+ * States on the "active path" (still mutable). Values are addresses of each
+ * state's first arc.
+ */
+ private int[] activePath = new int[0];
+
+ /**
+ * Current length of the active path.
+ */
+ private int activePathLen;
+
+ /**
+ * The next offset at which an arc will be added to the given state on
+ * {@link #activePath}.
+ */
+ private int[] nextArcOffset = new int[0];
+
+ /**
+ * Root state. If negative, the automaton has been built already and cannot be extended.
+ */
+ private int root;
+
+ /**
+ * An epsilon state. The first and only arc of this state points either
+ * to the root or to the terminal state, indicating an empty automaton.
+ */
+ private int epsilon;
+
+ /**
+ * Hash set of state addresses in {@link #serialized}, hashed by
+ * {@link #hash(int, int)}. Zero reserved for an unoccupied slot.
+ */
+ private int[] hashSet = new int[2];
+
+ /**
+ * Number of entries currently stored in {@link #hashSet}.
+ */
+ private int hashSize = 0;
+
+ /**
+ * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions only.
+ */
+ private byte [] previous;
+
+ /**
+ * Information about the automaton and its compilation.
+ */
+ private TreeMap<InfoEntry, Object> info;
+
+ /**
+ * {@link #previous} sequence's length, used in assertions only.
+ */
+ private int previousLength;
+
+ /** */
+ public FSABuilder() {
+ this(BUFFER_GROWTH_SIZE);
+ }
+
+ /** */
+ public FSABuilder(int bufferGrowthSize) {
+ this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS);
+
+ // Allocate epsilon state.
+ epsilon = allocateState(1);
+ serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST;
+
+ // Allocate root, with an initial empty set of output arcs.
+ expandActivePath(1);
+ root = activePath[0];
+ }
+
+ /**
+ * Add a single sequence of bytes to the FSA. The input must be lexicographically greater
+ * than any previously added sequence.
+ */
+ public void add(byte[] sequence, int start, int len) {
+ assert serialized != null : "Automaton already built.";
+ assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 :
+ "Input must be sorted: "
+ + Arrays.toString(previous, 0, previousLength) + " >= "
+ + Arrays.toString(sequence, start, len);
+ assert setPrevious(sequence, start, len);
+
+ // Determine common prefix length.
+ final int commonPrefix = commonPrefix(sequence, start, len);
+
+ // Make room for extra states on active path, if needed.
+ expandActivePath(len);
+
+ // Freeze all the states after the common prefix.
+ for (int i = activePathLen - 1; i > commonPrefix; i--) {
+ final int frozenState = freezeState(i);
+ setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState);
+ nextArcOffset[i] = activePath[i];
+ }
+
+ // Create arcs to new suffix states.
+ for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) {
+ final int p = nextArcOffset[i - 1];
+
+ serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0);
+ serialized[p + LABEL_OFFSET] = sequence[j++];
+ setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]);
+
+ nextArcOffset[i - 1] = p + ARC_SIZE;
+ }
+
+ // Save last sequence's length so that we don't need to calculate it again.
+ this.activePathLen = len;
+ }
+
+ /** Number of serialization buffer reallocations. */
+ private int serializationBufferReallocations;
+
+ /**
+ * Complete the automaton.
+ */
+ public FSA complete() {
+ add(new byte[0], 0, 0);
+
+ if (nextArcOffset[0] - activePath[0] == 0) {
+ // An empty FSA.
+ setArcTarget(epsilon, TERMINAL_STATE);
+ } else {
+ // An automaton with at least a single arc from root.
+ root = freezeState(0);
+ setArcTarget(epsilon, root);
+ }
+
+ info = new TreeMap<InfoEntry, Object>();
+ info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length);
+ info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations);
+ info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size);
+ info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length);
+ info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length);
+ info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize);
+ info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB,
+ (this.serialized.length + this.hashSet.length * 4) / (double) MB);
+
+ final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon);
+ this.serialized = null;
+ this.hashSet = null;
+ return fsa;
+ }
+
+ /**
+ * Build a minimal, deterministic automaton from a sorted list of byte sequences.
+ */
+ public static FSA build(byte[][] input) {
+ final FSABuilder builder = new FSABuilder();
+
+ for (byte [] chs : input)
+ builder.add(chs, 0, chs.length);
+
+ return builder.complete();
+ }
+
+ /**
+ * Build a minimal, deterministic automaton from an iterable list of byte sequences.
+ */
+ public static FSA build(Iterable<byte[]> input) {
+ final FSABuilder builder = new FSABuilder();
+
+ for (byte [] chs : input)
+ builder.add(chs, 0, chs.length);
+
+ return builder.complete();
+ }
+
+ /**
+ * Return various statistics concerning the FSA and its compilation.
+ */
+ public Map<InfoEntry, Object> getInfo() {
+ return info;
+ }
+
+ /** Is this arc the state's last? */
+ private boolean isArcLast(int arc) {
+ return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
+ }
+
+ /** Is this arc final? */
+ private boolean isArcFinal(int arc) {
+ return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
+ }
+
+ /** Get label's arc. */
+ private byte getArcLabel(int arc) {
+ return serialized[arc + LABEL_OFFSET];
+ }
+
+ /**
+ * Fills the target state address of an arc.
+ */
+ private void setArcTarget(int arc, int state) {
+ arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE;
+ for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) {
+ serialized[--arc] = (byte) state;
+ state >>>= 8;
+ }
+ }
+
+ /**
+ * Returns the address of an arc.
+ */
+ private int getArcTarget(int arc) {
+ arc += ADDRESS_OFFSET;
+ return (serialized[arc]) << 24 |
+ (serialized[arc + 1] & 0xff) << 16 |
+ (serialized[arc + 2] & 0xff) << 8 |
+ (serialized[arc + 3] & 0xff);
+ }
+
+ /**
+ * @return The number of common prefix characters with the previous
+ * sequence.
+ */
+ private int commonPrefix(byte[] sequence, int start, int len) {
+ // Empty root state case.
+ final int max = Math.min(len, activePathLen);
+ int i;
+ for (i = 0; i < max; i++) {
+ final int lastArc = nextArcOffset[i] - ARC_SIZE;
+ if (sequence[start++] != getArcLabel(lastArc)) {
+ break;
+ }
+ }
+
+ return i;
+ }
+
+ /**
+ * Freeze a state: try to find an equivalent state in the interned states
+ * dictionary first, if found, return it, otherwise, serialize the mutable
+ * state at <code>activePathIndex</code> and return it.
+ */
+ private int freezeState(final int activePathIndex) {
+ final int start = activePath[activePathIndex];
+ final int end = nextArcOffset[activePathIndex];
+ final int len = end - start;
+
+ // Set the last arc flag on the current active path's state.
+ serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST;
+
+ // Try to locate a state with an identical content in the hash set.
+ final int bucketMask = (hashSet.length - 1);
+ int slot = hash(start, len) & bucketMask;
+ for (int i = 0;;) {
+ int state = hashSet[slot];
+ if (state == 0) {
+ state = hashSet[slot] = serialize(activePathIndex);
+ if (++hashSize > hashSet.length / 2)
+ expandAndRehash();
+ return state;
+ } else if (equivalent(state, start, len)) {
+ return state;
+ }
+
+ slot = (slot + (++i)) & bucketMask;
+ }
+ }
+
+ /**
+ * Reallocate and rehash the hash set.
+ */
+ private void expandAndRehash() {
+ final int[] newHashSet = new int[hashSet.length * 2];
+ final int bucketMask = (newHashSet.length - 1);
+
+ for (int j = 0; j < hashSet.length; j++) {
+ final int state = hashSet[j];
+ if (state > 0) {
+ int slot = hash(state, stateLength(state)) & bucketMask;
+ for (int i = 0; newHashSet[slot] > 0;) {
+ slot = (slot + (++i)) & bucketMask;
+ }
+ newHashSet[slot] = state;
+ }
+ }
+ this.hashSet = newHashSet;
+ }
+
+ /**
+ * The total length of the serialized state data (all arcs).
+ */
+ private int stateLength(int state) {
+ int arc = state;
+ while (!isArcLast(arc)) {
+ arc += ARC_SIZE;
+ }
+ return arc - state + ARC_SIZE;
+ }
+
+ /** Return <code>true</code> if two regions in {@link #serialized} are identical. */
+ private boolean equivalent(int start1, int start2, int len) {
+ if (start1 + len > size || start2 + len > size)
+ return false;
+
+ while (len-- > 0)
+ if (serialized[start1++] != serialized[start2++])
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Serialize a given state on the active path.
+ */
+ private int serialize(final int activePathIndex) {
+ expandBuffers();
+
+ final int newState = size;
+ final int start = activePath[activePathIndex];
+ final int len = nextArcOffset[activePathIndex] - start;
+ System.arraycopy(serialized, start, serialized, newState, len);
+
+ size += len;
+ return newState;
+ }
+
+ /**
+ * Hash code of a fragment of {@link #serialized} array.
+ */
+ private int hash(int start, int byteCount) {
+ assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?";
+
+ int h = 0;
+ for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) {
+ h = 17 * h + getArcLabel(start);
+ h = 17 * h + getArcTarget(start);
+ if (isArcFinal(start)) h += 17;
+ }
+
+ return h;
+ }
+
+ /**
+ * Append a new mutable state to the active path.
+ */
+ private void expandActivePath(int size) {
+ if (activePath.length < size) {
+ final int p = activePath.length;
+ activePath = java.util.Arrays.copyOf(activePath, size);
+ nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size);
+
+ for (int i = p; i < size; i++) {
+ nextArcOffset[i] = activePath[i] =
+ allocateState(/* assume max labels count */ MAX_LABELS);
+ }
+ }
+ }
+
+ /**
+ * Expand internal buffers for the next state.
+ */
+ private void expandBuffers() {
+ if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) {
+ serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
+ serializationBufferReallocations++;
+ }
+ }
+
+ /**
+ * Allocate space for a state with the given number of outgoing labels.
+ *
+ * @return state offset
+ */
+ private int allocateState(int labels) {
+ expandBuffers();
+ final int state = size;
+ size += labels * ARC_SIZE;
+ return state;
+ }
+
+ /**
+ * Copy <code>current</code> into an internal buffer.
+ */
+ private boolean setPrevious(byte [] sequence, int start, int length) {
+ if (previous == null || previous.length < length) {
+ previous = new byte [length];
+ }
+
+ System.arraycopy(sequence, start, previous, 0, length);
+ previousLength = length;
+ return true;
+ }
+
+ /**
+ * Lexicographic order of input sequences. By default, consistent with the "C" sort
+ * (absolute value of bytes, 0-255).
+ */
+ public static int compare(byte [] s1, int start1, int lens1,
+ byte [] s2, int start2, int lens2) {
+ final int max = Math.min(lens1, lens2);
+
+ for (int i = 0; i < max; i++) {
+ final byte c1 = s1[start1++];
+ final byte c2 = s2[start2++];
+ if (c1 != c2)
+ return (c1 & 0xff) - (c2 & 0xff);
+ }
+
+ return lens1 - lens2;
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java
new file mode 100644
index 0000000..9e381f4
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java
@@ -0,0 +1,154 @@
+package morfologik.fsa;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+/**
+ * An iterator that traverses the right language of a given node (all sequences
+ * reachable from a given node).
+ */
+public final class FSAFinalStatesIterator implements Iterator<ByteBuffer> {
+ /**
+ * Default expected depth of the recursion stack (estimated longest sequence
+ * in the automaton). Buffers expand by the same value if exceeded.
+ */
+ private final static int EXPECTED_MAX_STATES = 15;
+
+ /** The FSA to which this iterator belongs. */
+ private final FSA fsa;
+
+ /** An internal cache for the next element in the FSA */
+ private ByteBuffer nextElement;
+
+ /**
+ * A buffer for the current sequence of bytes from the current node to the
+ * root.
+ */
+ private byte[] buffer = new byte[EXPECTED_MAX_STATES];
+
+ /** Reusable byte buffer wrapper around {@link #buffer}. */
+ private ByteBuffer bufferWrapper = ByteBuffer.wrap(buffer);
+
+ /** An arc stack for DFS when processing the automaton. */
+ private int[] arcs = new int[EXPECTED_MAX_STATES];
+
+ /** Current processing depth in {@link #arcs}. */
+ private int position;
+
+ /**
+ * Create an instance of the iterator for a given node.
+ */
+ public FSAFinalStatesIterator(FSA fsa, int node) {
+ this.fsa = fsa;
+
+ if (fsa.getFirstArc(node) != 0) {
+ restartFrom(node);
+ }
+ }
+
+ /**
+ * Restart walking from <code>node</code>. Allows iterator reuse.
+ */
+ public void restartFrom(int node) {
+ position = 0;
+ bufferWrapper.clear();
+ nextElement = null;
+
+ pushNode(node);
+ }
+
+ /** Returns <code>true</code> if there are still elements in this iterator. */
+ @Override
+ public boolean hasNext() {
+ if (nextElement == null) {
+ nextElement = advance();
+ }
+
+ return nextElement != null;
+ }
+
+ /**
+ * @return Returns a {@link ByteBuffer} with the sequence corresponding to
+ * the next final state in the automaton.
+ */
+ @Override
+ public ByteBuffer next() {
+ if (nextElement != null) {
+ final ByteBuffer cache = nextElement;
+ nextElement = null;
+ return cache;
+ } else {
+ final ByteBuffer cache = advance();
+ if (cache == null) {
+ throw new NoSuchElementException();
+ }
+ return cache;
+ }
+ }
+
+ /**
+ * Advances to the next available final state.
+ */
+ private final ByteBuffer advance() {
+ if (position == 0) {
+ return null;
+ }
+
+ while (position > 0) {
+ final int lastIndex = position - 1;
+ final int arc = arcs[lastIndex];
+
+ if (arc == 0) {
+ // Remove the current node from the queue.
+ position--;
+ continue;
+ }
+
+ // Go to the next arc, but leave it on the stack
+ // so that we keep the recursion depth level accurate.
+ arcs[lastIndex] = fsa.getNextArc(arc);
+
+ // Expand buffer if needed.
+ final int bufferLength = this.buffer.length;
+ if (lastIndex >= bufferLength) {
+ this.buffer = Arrays.copyOf(buffer, bufferLength
+ + EXPECTED_MAX_STATES);
+ this.bufferWrapper = ByteBuffer.wrap(buffer);
+ }
+ buffer[lastIndex] = fsa.getArcLabel(arc);
+
+ if (!fsa.isArcTerminal(arc)) {
+ // Recursively descend into the arc's node.
+ pushNode(fsa.getEndNode(arc));
+ }
+
+ if (fsa.isArcFinal(arc)) {
+ bufferWrapper.clear();
+ bufferWrapper.limit(lastIndex + 1);
+ return bufferWrapper;
+ }
+ }
+
+ return null;
+ }
+
+ /**
+ * Not implemented in this iterator.
+ */
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException("Read-only iterator.");
+ }
+
+ /**
+ * Descends to a given node, adds its arcs to the stack to be traversed.
+ */
+ private void pushNode(int node) {
+ // Expand buffers if needed.
+ if (position == arcs.length) {
+ arcs = Arrays.copyOf(arcs, arcs.length + EXPECTED_MAX_STATES);
+ }
+
+ arcs[position++] = fsa.getFirstArc(node);
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAFlags.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAFlags.java
new file mode 100644
index 0000000..7b9a730
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAFlags.java
@@ -0,0 +1,64 @@
+package morfologik.fsa;
+
+import java.util.Set;
+
+/**
+ * FSA automaton flags. Where applicable, flags follow Daciuk's <code>fsa</code> package.
+ */
+public enum FSAFlags {
+ /** Daciuk: flexible FSA encoding. */
+ FLEXIBLE(1 << 0),
+
+ /** Daciuk: stop bit in use. */
+ STOPBIT(1 << 1),
+
+ /** Daciuk: next bit in use. */
+ NEXTBIT(1 << 2),
+
+ /** Daciuk: tails compression. */
+ TAILS(1 << 3),
+
+ /*
+ * These flags are outside of byte range (never occur in Daciuk's FSA).
+ */
+
+ /**
+ * The FSA contains right-language count numbers on states.
+ *
+ * @see FSA#getRightLanguageCount(int)
+ */
+ NUMBERS(1 << 8),
+
+ /**
+ * The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package
+ * compatibility).
+ */
+ SEPARATORS(1 << 9);
+
+ /**
+ * Bit mask for the corresponding flag.
+ */
+ public final int bits;
+
+ /** */
+ private FSAFlags(int bits) {
+ this.bits = bits;
+ }
+
+ /**
+ * Returns <code>true</code> if the corresponding flag is set in the bit set.
+ */
+ public static boolean isSet(int flags, FSAFlags flag) {
+ return (flags & flag.bits) != 0;
+ }
+
+ /**
+ * Returns the set of flags encoded in a single short.
+ */
+ public static short asShort(Set<FSAFlags> flags) {
+ short value = 0;
+ for (FSAFlags f : flags)
+ value |= f.bits;
+ return value;
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAHeader.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAHeader.java
new file mode 100644
index 0000000..76fd6ff
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAHeader.java
@@ -0,0 +1,52 @@
+package morfologik.fsa;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import morfologik.util.FileUtils;
+import static morfologik.util.FileUtils.*;
+
+/**
+ * Standard FSA file header, as described in <code>fsa</code> package documentation.
+ */
+final class FSAHeader {
+ /**
+ * FSA magic (4 bytes).
+ */
+ public final static int FSA_MAGIC = ('\\' << 24) | ('f' << 16) | ('s' << 8) | ('a');
+
+ /**
+ * Maximum length of the header block.
+ */
+ public static final int MAX_HEADER_LENGTH = 4 + 8;
+
+ /** FSA version number. */
+ public byte version;
+
+ /** Filler character. */
+ public byte filler;
+
+ /** Annotation character. */
+ public byte annotation;
+
+ /** Goto field (may be a compound, depending on the automaton version). */
+ public byte gtl;
+
+ /**
+ * Read FSA header from a stream, consuming its bytes.
+ *
+ * @throws IOException If the stream ends prematurely or if it contains invalid data.
+ */
+ public static FSAHeader read(InputStream in) throws IOException {
+ if (FSA_MAGIC != FileUtils.readInt(in))
+ throw new IOException("Invalid file header magic bytes.");
+
+ final FSAHeader h = new FSAHeader();
+ h.version = readByte(in);
+ h.filler = readByte(in);
+ h.annotation = readByte(in);
+ h.gtl = readByte(in);
+
+ return h;
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java
new file mode 100644
index 0000000..4015dce
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java
@@ -0,0 +1,156 @@
+package morfologik.fsa;
+
+import java.util.BitSet;
+
+import com.carrotsearch.hppc.IntIntOpenHashMap;
+
+/**
+ * Compute additional information about an FSA: number of arcs, nodes, etc.
+ */
+public final class FSAInfo {
+ /**
+ * Computes the exact number of states and nodes by recursively traversing
+ * the FSA.
+ */
+ private static class NodeVisitor {
+ final BitSet visitedArcs = new BitSet();
+ final BitSet visitedNodes = new BitSet();
+
+ int nodes;
+ int arcs;
+ int totalArcs;
+
+ private final FSA fsa;
+
+ NodeVisitor(FSA fsa) {
+ this.fsa = fsa;
+ }
+
+ public void visitNode(final int node) {
+ if (visitedNodes.get(node)) {
+ return;
+ }
+ visitedNodes.set(node);
+
+ nodes++;
+ for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa
+ .getNextArc(arc)) {
+ if (!visitedArcs.get(arc)) {
+ arcs++;
+ }
+ totalArcs++;
+ visitedArcs.set(arc);
+
+ if (!fsa.isArcTerminal(arc)) {
+ visitNode(fsa.getEndNode(arc));
+ }
+ }
+ }
+ }
+
+ /**
+ * Computes the exact number of final states.
+ */
+ private static class FinalStateVisitor {
+ final IntIntOpenHashMap visitedNodes = new IntIntOpenHashMap();
+
+ private final FSA fsa;
+
+ FinalStateVisitor(FSA fsa) {
+ this.fsa = fsa;
+ }
+
+ public int visitNode(int node) {
+ if (visitedNodes.containsKey(node))
+ return visitedNodes.lget();
+
+ int fromHere = 0;
+ for (int arc = fsa.getFirstArc(node);
+ arc != 0; arc = fsa.getNextArc(arc))
+ {
+ if (fsa.isArcFinal(arc))
+ fromHere++;
+
+ if (!fsa.isArcTerminal(arc)) {
+ fromHere += visitNode(fsa.getEndNode(arc));
+ }
+ }
+ visitedNodes.put(node, fromHere);
+ return fromHere;
+ }
+ }
+
+ /**
+ * Number of nodes in the automaton.
+ */
+ public final int nodeCount;
+
+ /**
+ * Number of arcs in the automaton, excluding an arcs from the zero node
+ * (initial) and an arc from the start node to the root node.
+ */
+ public final int arcsCount;
+
+ /**
+ * Total number of arcs, counting arcs that physically overlap due to
+ * merging.
+ */
+ public final int arcsCountTotal;
+
+ /**
+ * Number of final states (number of input sequences stored in the automaton).
+ */
+ public final int finalStatesCount;
+
+ /**
+ * Arcs size (in serialized form).
+ */
+ public final int size;
+
+ /*
+ *
+ */
+ public FSAInfo(FSA fsa) {
+ final NodeVisitor w = new NodeVisitor(fsa);
+ int root = fsa.getRootNode();
+ if (root > 0) {
+ w.visitNode(root);
+ }
+
+ this.nodeCount = 1 + w.nodes;
+ this.arcsCount = 1 + w.arcs;
+ this.arcsCountTotal = 1 + w.totalArcs;
+
+ final FinalStateVisitor fsv = new FinalStateVisitor(fsa);
+ this.finalStatesCount = fsv.visitNode(fsa.getRootNode());
+
+ if (fsa instanceof FSA5) {
+ this.size = ((FSA5) fsa).arcs.length;
+ } else {
+ this.size = 0;
+ }
+ }
+
+ /*
+ *
+ */
+ public FSAInfo(int nodeCount, int arcsCount, int arcsCountTotal, int finalStatesCount) {
+ this.nodeCount = nodeCount;
+ this.arcsCount = arcsCount;
+ this.arcsCountTotal = arcsCountTotal;
+ this.finalStatesCount = finalStatesCount;
+ this.size = 0;
+ }
+
+ /*
+ *
+ */
+ @Override
+ public String toString() {
+ return "Nodes: " + nodeCount
+ + ", arcs visited: " + arcsCount
+ + ", arcs total: " + arcsCountTotal
+ + ", final states: " + finalStatesCount
+ + ", size: " + size;
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java
new file mode 100644
index 0000000..fc52eeb
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java
@@ -0,0 +1,43 @@
+package morfologik.fsa;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Set;
+
+/**
+ * All FSA serializers to binary formats will implement this interface.
+ */
+public interface FSASerializer {
+ /**
+ * Serialize a finite state automaton to an output stream.
+ */
+ public <T extends OutputStream> T serialize(FSA fsa, T os) throws IOException;
+
+ /**
+ * Returns the set of flags supported by the serializer (and the output automaton).
+ */
+ public Set<FSAFlags> getFlags();
+
+ /**
+ * Log extra messages during construction.
+ */
+ public FSASerializer withLogger(IMessageLogger logger);
+
+ /**
+ * Supports built-in filler separator. Only if {@link #getFlags()} returns
+ * {@link FSAFlags#SEPARATORS}.
+ */
+ public FSASerializer withFiller(byte filler);
+
+ /**
+ * Supports built-in annotation separator. Only if {@link #getFlags()} returns
+ * {@link FSAFlags#SEPARATORS}.
+ */
+ public FSASerializer withAnnotationSeparator(byte annotationSeparator);
+
+ /**
+ * Supports built-in right language count on nodes, speeding up perfect hash counts.
+ * Only if {@link #getFlags()} returns {@link FSAFlags#NUMBERS}.
+ */
+ public FSASerializer withNumbers();
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java
new file mode 100644
index 0000000..9e59003
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java
@@ -0,0 +1,169 @@
+package morfologik.fsa;
+
+import static morfologik.fsa.MatchResult.*;
+
+/**
+ * This class implements some common matching and scanning operations on a
+ * generic FSA.
+ */
+public final class FSATraversal {
+ /**
+ * Target automaton.
+ */
+ private final FSA fsa;
+
+ /**
+ * Traversals of the given FSA.
+ */
+ public FSATraversal(FSA fsa) {
+ this.fsa = fsa;
+ }
+
+ /**
+ * Calculate perfect hash for a given input sequence of bytes. The perfect hash requires
+ * that {@link FSA} is built with {@link FSAFlags#NUMBERS} and corresponds to the sequential
+ * order of input sequences used at automaton construction time.
+ *
+ * @param start Start index in the sequence array.
+ * @param length Length of the byte sequence, must be at least 1.
+ *
+ * @return Returns a unique integer assigned to the input sequence in the automaton (reflecting
+ * the number of that sequence in the input used to build the automaton). Returns a negative
+ * integer if the input sequence was not part of the input from which the automaton was created.
+ * The type of mismatch is a constant defined in {@link MatchResult}.
+ */
+ public int perfectHash(byte[] sequence, int start, int length, int node) {
+ assert fsa.getFlags().contains(FSAFlags.NUMBERS) : "FSA not built with NUMBERS option.";
+ assert length > 0 : "Must be a non-empty sequence.";
+
+ int hash = 0;
+ final int end = start + length - 1;
+
+ int seqIndex = start;
+ byte label = sequence[seqIndex];
+
+ // Seek through the current node's labels, looking for 'label', update hash.
+ for (int arc = fsa.getFirstArc(node); arc != 0;) {
+ if (fsa.getArcLabel(arc) == label) {
+ if (fsa.isArcFinal(arc)) {
+ if (seqIndex == end)
+ return hash;
+
+ hash++;
+ }
+
+ if (fsa.isArcTerminal(arc)) {
+ /* The automaton contains a prefix of the input sequence. */
+ return AUTOMATON_HAS_PREFIX;
+ }
+
+ // The sequence is a prefix of one of the sequences stored in the automaton.
+ if (seqIndex == end) {
+ return SEQUENCE_IS_A_PREFIX;
+ }
+
+ // Make a transition along the arc, go the target node's first arc.
+ arc = fsa.getFirstArc(fsa.getEndNode(arc));
+ label = sequence[++seqIndex];
+ continue;
+ } else {
+ if (fsa.isArcFinal(arc))
+ hash++;
+ if (!fsa.isArcTerminal(arc))
+ hash += fsa.getRightLanguageCount(fsa.getEndNode(arc));
+ }
+
+ arc = fsa.getNextArc(arc);
+ }
+
+ // Labels of this node ended without a match on the sequence.
+ // Perfect hash does not exist.
+ return NO_MATCH;
+ }
+
+ /**
+ * @see #perfectHash(byte[], int, int, int)
+ */
+ public int perfectHash(byte[] sequence) {
+ return perfectHash(sequence, 0, sequence.length, fsa.getRootNode());
+ }
+
+ /**
+ * Same as {@link #match(byte[], int, int, int)}, but allows passing
+ * a reusable {@link MatchResult} object so that no intermediate garbage is
+ * produced.
+ *
+ * @return The same object as <code>result</code>, but with reset internal
+ * type and other fields.
+ */
+ public MatchResult match(MatchResult result,
+ byte[] sequence, int start, int length, int node)
+ {
+ if (node == 0) {
+ result.reset(NO_MATCH, start, node);
+ return result;
+ }
+
+ final FSA fsa = this.fsa;
+ final int end = start + length;
+ for (int i = start; i < end; i++) {
+ final int arc = fsa.getArc(node, sequence[i]);
+ if (arc != 0) {
+ if (fsa.isArcFinal(arc) && i + 1 == end) {
+ /* The automaton has an exact match of the input sequence. */
+ result.reset(EXACT_MATCH, i, node);
+ return result;
+ }
+
+ if (fsa.isArcTerminal(arc)) {
+ /* The automaton contains a prefix of the input sequence. */
+ result.reset(AUTOMATON_HAS_PREFIX, i + 1, 0);
+ return result;
+ }
+
+ // Make a transition along the arc.
+ node = fsa.getEndNode(arc);
+ } else {
+ result.reset(NO_MATCH, i, node);
+ return result;
+ }
+ }
+
+ /* The sequence is a prefix of at least one sequence in the automaton. */
+ result.reset(SEQUENCE_IS_A_PREFIX, 0, node);
+ return result;
+ }
+
+ /**
+ * Finds a matching path in the dictionary for a given sequence of labels
+ * from <code>sequence</code> and starting at node <code>node</code>.
+ *
+ * @param sequence
+ * An array of labels to follow in the FSA.
+ * @param start
+ * Starting index in <code>sequence</code>.
+ * @param length
+ * How many symbols to consider from <code>sequence</code>?
+ * @param node
+ * Start node identifier in the FSA.
+ *
+ * @see #match(byte [], int)
+ */
+ public MatchResult match(byte[] sequence, int start, int length, int node) {
+ return match(new MatchResult(), sequence, start, length, node);
+ }
+
+ /**
+ * @see #match(byte[], int, int, int)
+ */
+ public MatchResult match(byte[] sequence, int node) {
+ return match(sequence, 0, sequence.length, node);
+ }
+
+ /**
+ * @see #match(byte[], int, int, int)
+ */
+ public MatchResult match(byte[] sequence) {
+ return match(sequence, fsa.getRootNode());
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java
new file mode 100644
index 0000000..cad611e
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java
@@ -0,0 +1,202 @@
+package morfologik.fsa;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.TreeMap;
+
+import com.carrotsearch.hppc.IntIntOpenHashMap;
+
+/**
+ * Other FSA-related utilities not directly associated with the class hierarchy.
+ */
+public final class FSAUtils {
+ public final static class IntIntHolder {
+ public int a;
+ public int b;
+
+ public IntIntHolder(int a, int b) {
+ this.a = a;
+ this.b = b;
+ }
+
+ public IntIntHolder() {
+ }
+ }
+
+ /**
+ * Returns the right-language reachable from a given FSA node, formatted
+ * as an input for the graphviz package (expressed in the <code>dot</code>
+ * language).
+ */
+ public static String toDot(FSA fsa, int node) {
+ try {
+ StringWriter w = new StringWriter();
+ toDot(w, fsa, node);
+ return w.toString();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Saves the right-language reachable from a given FSA node, formatted
+ * as an input for the graphviz package (expressed in the <code>dot</code>
+ * language), to the given writer.
+ */
+ public static void toDot(Writer w, FSA fsa, int node) throws IOException {
+ w.write("digraph Automaton {\n");
+ w.write(" rankdir = LR;\n");
+
+ final BitSet visited = new BitSet();
+
+ w.write(" stop [shape=doublecircle,label=\"\"];\n");
+ w.write(" initial [shape=plaintext,label=\"\"];\n");
+ w.write(" initial -> " + node + "\n\n");
+
+ visitNode(w, 0, fsa, node, visited);
+ w.write("}\n");
+ }
+
+ private static void visitNode(Writer w, int d, FSA fsa, int s, BitSet visited) throws IOException {
+ visited.set(s);
+ w.write(" "); w.write(Integer.toString(s));
+
+ if (fsa.getFlags().contains(FSAFlags.NUMBERS)) {
+ int nodeNumber = fsa.getRightLanguageCount(s);
+ w.write(" [shape=circle,label=\"" + nodeNumber + "\"];\n");
+ } else {
+ w.write(" [shape=circle,label=\"\"];\n");
+ }
+
+ for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
+ w.write(" ");
+ w.write(Integer.toString(s));
+ w.write(" -> ");
+ if (fsa.isArcTerminal(arc)) {
+ w.write("stop");
+ } else {
+ w.write(Integer.toString(fsa.getEndNode(arc)));
+ }
+
+ final byte label = fsa.getArcLabel(arc);
+ w.write(" [label=\"");
+ if (Character.isLetterOrDigit(label))
+ w.write((char) label);
+ else {
+ w.write("0x");
+ w.write(Integer.toHexString(label & 0xFF));
+ }
+ w.write("\"");
+ if (fsa.isArcFinal(arc)) w.write(" arrowhead=\"tee\"");
+ if (fsa instanceof FSA5) {
+ if (((FSA5) fsa).isNextSet(arc)) {
+ w.write(" color=\"blue\"");
+ }
+ }
+
+ w.write("]\n");
+ }
+
+ for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
+ if (!fsa.isArcTerminal(arc)) {
+ int endNode = fsa.getEndNode(arc);
+ if (!visited.get(endNode)) {
+ visitNode(w, d + 1, fsa, endNode, visited);
+ }
+ }
+ }
+ }
+
+ /**
+ * All byte sequences generated as the right language of <code>state</code>.
+ */
+ public static ArrayList<byte[]> rightLanguage(FSA fsa, int state) {
+ final ArrayList<byte[]> rl = new ArrayList<byte[]>();
+ final byte [] buffer = new byte [0];
+
+ descend(fsa, state, buffer, 0, rl);
+
+ return rl;
+ }
+
+ /**
+ * Recursive descend and collection of the right language.
+ */
+ private static byte [] descend(FSA fsa, int state, byte [] b, int position, ArrayList<byte[]> rl) {
+
+ if (b.length <= position) {
+ b = Arrays.copyOf(b, position + 1);
+ }
+
+ for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
+ b[position] = fsa.getArcLabel(arc);
+
+ if (fsa.isArcFinal(arc)) {
+ rl.add(Arrays.copyOf(b, position + 1));
+ }
+
+ if (!fsa.isArcTerminal(arc))
+ b = descend(fsa, fsa.getEndNode(arc), b, position + 1, rl);
+ }
+
+ return b;
+ }
+
+ /**
+ * Calculate fan-out ratio.
+ * @return The returned array: result[outgoing-arcs]
+ */
+ public static TreeMap<Integer, Integer> calculateFanOuts(final FSA fsa, int root) {
+ final int [] result = new int [256];
+ fsa.visitInPreOrder(new StateVisitor() {
+ public boolean accept(int state) {
+ int count = 0;
+ for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc))
+ count++;
+ result[count]++;
+ return true;
+ }
+ });
+
+ TreeMap<Integer, Integer> output = new TreeMap<Integer, Integer>();
+
+ int low = 1; // Omit #0, there is always a single node like that (dummy).
+ while (low < result.length && result[low] == 0) low++;
+
+ int high = result.length - 1;
+ while (high >= 0 && result[high] == 0) high--;
+
+ for (int i = low; i <= high; i++) {
+ output.put(i, result[i]);
+ }
+
+ return output;
+ }
+
+ /**
+ * Calculate the size of right language for each state in an FSA.
+ */
+ public static IntIntOpenHashMap rightLanguageForAllStates(final FSA fsa) {
+ final IntIntOpenHashMap numbers = new IntIntOpenHashMap();
+
+ fsa.visitInPostOrder(new StateVisitor() {
+ public boolean accept(int state) {
+ int thisNodeNumber = 0;
+ for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
+ thisNodeNumber +=
+ (fsa.isArcFinal(arc) ? 1 : 0) +
+ (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc)));
+ }
+ numbers.put(state, thisNodeNumber);
+
+ return true;
+ }
+ });
+
+ return numbers;
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java b/morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java
new file mode 100644
index 0000000..4d86c1b
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java
@@ -0,0 +1,25 @@
+package morfologik.fsa;
+
+public interface IMessageLogger {
+
+ /**
+ * Log progress to the console.
+ */
+ public void log(String msg);
+
+ /**
+ * Log message header and save current time.
+ */
+ public void startPart(String header);
+
+ /**
+ *
+ */
+ public void endPart();
+
+ /**
+ * Log a two-part message.
+ */
+ public void log(String header, Object v);
+
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java b/morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java
new file mode 100644
index 0000000..2f5cbd7
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java
@@ -0,0 +1,86 @@
+package morfologik.fsa;
+
+/**
+ * A matching result returned from {@link FSATraversal}.
+ *
+ * @see FSATraversal
+ */
+public final class MatchResult {
+ /**
+ * The automaton has exactly one match for the input sequence.
+ */
+ public static final int EXACT_MATCH = 0;
+
+ /**
+ * The automaton has no match for the input sequence.
+ */
+ public static final int NO_MATCH = -1;
+
+ /**
+ * The automaton contains a prefix of the input sequence. That is:
+ * one of the input sequences used to build the automaton is a
+ * prefix of the input sequence that is shorter than the sequence.
+ *
+ * <p>{@link MatchResult#index} will contain an index of the
+ * first character of the input sequence not present in the
+ * dictionary.</p>
+ */
+ public static final int AUTOMATON_HAS_PREFIX = -3;
+
+ /**
+ * The sequence is a prefix of at least one sequence in the automaton.
+ * {@link MatchResult#node} returns the node from which all sequences
+ * with the given prefix start in the automaton.
+ */
+ public static final int SEQUENCE_IS_A_PREFIX = -4;
+
+ /**
+ * One of the match kind constants defined in this class.
+ *
+ * @see #NO_MATCH
+ * @see #EXACT_MATCH
+ * @see #AUTOMATON_HAS_PREFIX
+ * @see #SEQUENCE_IS_A_PREFIX
+ */
+ public int kind;
+
+ /**
+ * Input sequence's index, interpretation depends on {@link #kind}.
+ */
+ public int index;
+
+ /**
+ * Automaton node, interpretation depends on the {@link #kind}.
+ */
+ public int node;
+
+ /*
+ *
+ */
+ MatchResult(int kind, int index, int node) {
+ reset(kind, index, node);
+ }
+
+ /*
+ *
+ */
+ MatchResult(int kind) {
+ reset(kind, 0, 0);
+ }
+
+ /*
+ *
+ */
+ public MatchResult() {
+ reset(NO_MATCH, 0, 0);
+ }
+
+ /*
+ *
+ */
+ final void reset(int kind, int index, int node) {
+ this.kind = kind;
+ this.index = index;
+ this.node = node;
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/NullMessageLogger.java b/morfologik-fsa/src/main/java/morfologik/fsa/NullMessageLogger.java
new file mode 100644
index 0000000..c5134b6
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/NullMessageLogger.java
@@ -0,0 +1,22 @@
+package morfologik.fsa;
+
+/*
+ * Do-nothing logger.
+ */
+final class NullMessageLogger implements IMessageLogger {
+ @Override
+ public void log(String msg) {
+ }
+
+ @Override
+ public void startPart(String header) {
+ }
+
+ @Override
+ public void endPart() {
+ }
+
+ @Override
+ public void log(String header, Object v) {
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/StateVisitor.java b/morfologik-fsa/src/main/java/morfologik/fsa/StateVisitor.java
new file mode 100644
index 0000000..8ced239
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/fsa/StateVisitor.java
@@ -0,0 +1,11 @@
+package morfologik.fsa;
+
+/**
+ * State visitor.
+ *
+ * @see FSA#visitInPostOrder(StateVisitor)
+ * @see FSA#visitInPreOrder(StateVisitor)
+ */
+public interface StateVisitor {
+ public boolean accept(int state);
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/util/Arrays.java b/morfologik-fsa/src/main/java/morfologik/util/Arrays.java
new file mode 100644
index 0000000..4d1d840
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/util/Arrays.java
@@ -0,0 +1,68 @@
+package morfologik.util;
+
+/**
+ * Compatibility layer for JVM 1.5.
+ */
+public final class Arrays {
+ private Arrays() {
+ // No instances.
+ }
+
+ /**
+ * Compare two lists of objects for reference-equality.
+ */
+ public static boolean referenceEquals(Object[] a1, int a1s, Object[] a2, int a2s, int length) {
+ for (int i = 0; i < length; i++)
+ if (a1[a1s++] != a2[a2s++])
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Compare two arrays for equality.
+ */
+ public static boolean equals(byte[] a1, int a1s, byte [] a2, int a2s, int length) {
+ for (int i = 0; i < length; i++)
+ if (a1[a1s++] != a2[a2s++])
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Compare two arrays for equality.
+ */
+ public static boolean equals(boolean[] a1, int a1s, boolean[] a2, int a2s, int length) {
+ for (int i = 0; i < length; i++)
+ if (a1[a1s++] != a2[a2s++])
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Compare two arrays for equality.
+ */
+ public static boolean equals(int[] a1, int a1s, int[] a2, int a2s, int length) {
+ for (int i = 0; i < length; i++)
+ if (a1[a1s++] != a2[a2s++])
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Convert an array of strings to bytes.
+ */
+ public static String toString(byte [] bytes, int start, int length)
+ {
+ if (bytes.length != length)
+ {
+ final byte [] sub = new byte [length];
+ System.arraycopy(bytes, start, sub, 0, length);
+ bytes = sub;
+ }
+ return java.util.Arrays.toString(bytes);
+ }
+}
diff --git a/morfologik-fsa/src/main/java/morfologik/util/BufferUtils.java b/morfologik-fsa/src/main/java/morfologik/util/BufferUtils.java
new file mode 100644
index 0000000..6ccfbc6
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/util/BufferUtils.java
@@ -0,0 +1,54 @@
+package morfologik.util;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+
+/**
+ * Utility functions for buffers.
+ */
+public final class BufferUtils {
+
+ /**
+ * No instances.
+ */
+ private BufferUtils() {
+ // empty
+ }
+
+ /**
+ * Ensure the byte buffer's capacity. If a new buffer is allocated, its
+ * content is empty (the old buffer's contents is not copied).
+ *
+ * @param buffer
+ * The buffer to check or <code>null</code> if a new buffer
+ * should be allocated.
+ */
+ public static ByteBuffer ensureCapacity(ByteBuffer buffer, int capacity) {
+ if (buffer == null || buffer.capacity() < capacity) {
+ buffer = ByteBuffer.allocate(capacity);
+ }
+ return buffer;
+ }
+
+ /**
+ * Ensure the char buffer's capacity. If a new buffer is allocated, its
+ * content is empty (the old buffer's contents is not copied).
+ *
+ * @param buffer
+ * The buffer to check or <code>null</code> if a new buffer
+ * should be allocated.
+ */
+ public static CharBuffer ensureCapacity(CharBuffer buffer, int capacity) {
+ if (buffer == null || buffer.capacity() < capacity) {
+ buffer = CharBuffer.allocate(capacity);
+ }
+ return buffer;
+ }
+
+ /**
+ * Convert a byte buffer to a string in platform default encoding.
+ */
+ public static String toString(ByteBuffer sequence) {
+ return new String(sequence.array(), sequence.position(), sequence.remaining());
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/util/FileUtils.java b/morfologik-fsa/src/main/java/morfologik/util/FileUtils.java
new file mode 100644
index 0000000..5d62212
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/util/FileUtils.java
@@ -0,0 +1,137 @@
+package morfologik.util;
+
+import java.io.*;
+
+/**
+ * Utility functions.
+ */
+public final class FileUtils {
+
+ /**
+ * No instances.
+ */
+ private FileUtils() {
+ // empty
+ }
+
+ /**
+ * Checks if the given file exists.
+ */
+ public static void assertExists(File fsaFile, boolean requireFile,
+ boolean requireDirectory) throws IOException {
+ if (!fsaFile.exists()) {
+ throw new IOException("File does not exist: "
+ + fsaFile.getAbsolutePath());
+ }
+
+ if (requireFile) {
+ if (!fsaFile.isFile() || !fsaFile.canRead()) {
+ throw new IOException("File cannot be read: "
+ + fsaFile.getAbsolutePath());
+ }
+ }
+
+ if (requireDirectory) {
+ if (!fsaFile.isDirectory()) {
+ throw new IOException("Not a directory: "
+ + fsaFile.getAbsolutePath());
+ }
+ }
+ }
+
+ /**
+ * Force any non-null closeables.
+ */
+ public static void close(Closeable... closeables) {
+ for (Closeable c : closeables) {
+ if (c != null) {
+ try {
+ c.close();
+ } catch (IOException e) {
+ // too bad
+ }
+ }
+ }
+ }
+
+ /**
+ * Reads all bytes from an input stream (until EOF).
+ */
+ public static byte[] readFully(InputStream stream) throws IOException {
+ final ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 16);
+ final byte[] buffer = new byte[1024 * 8];
+ int bytesCount;
+ while ((bytesCount = stream.read(buffer)) > 0) {
+ baos.write(buffer, 0, bytesCount);
+ }
+ return baos.toByteArray();
+ }
+
+ /**
+ * Read enough bytes to fill <code>array</code> If there are not enough
+ * bytes, throw an exception.
+ */
+ public static void readFully(InputStream in, byte[] array)
+ throws IOException {
+ int offset = 0;
+ int cnt;
+ while ((cnt = in.read(array, offset, array.length - offset)) > 0) {
+ offset += cnt;
+
+ if (offset == array.length)
+ break;
+ }
+
+ if (cnt < 0)
+ throw new EOFException();
+ }
+
+ /**
+ * Read exactly 4 bytes from the input stream.
+ */
+ public static int readInt(InputStream in) throws IOException {
+ int v = 0;
+ for (int i = 0; i < 4; i++) {
+ v = (v << 8) | (readByte(in) & 0xff);
+ }
+ return v;
+ }
+
+ /**
+ *
+ */
+ public static void writeInt(OutputStream os, int v) throws IOException {
+ os.write( v >>> 24);
+ os.write((v >>> 16) & 0xff);
+ os.write((v >>> 8) & 0xff);
+ os.write( v & 0xff);
+ }
+
+ /**
+ * Read exactly 2 bytes from the input stream.
+ */
+ public static short readShort(InputStream in) throws IOException {
+ return (short) (readByte(in) << 8 |
+ readByte(in) & 0xff);
+ }
+
+ /**
+ * Read exactly one byte from the input stream.
+ *
+ * @throws EOFException if EOF is reached.
+ */
+ public static byte readByte(InputStream in) throws IOException {
+ int b = in.read();
+ if (b == -1)
+ throw new EOFException();
+ return (byte) b;
+ }
+
+ /**
+ *
+ */
+ public static void writeShort(OutputStream os, short v) throws IOException {
+ os.write((v >>> 8) & 0xff);
+ os.write( v & 0xff);
+ }
+} \ No newline at end of file
diff --git a/morfologik-fsa/src/main/java/morfologik/util/ResourceUtils.java b/morfologik-fsa/src/main/java/morfologik/util/ResourceUtils.java
new file mode 100644
index 0000000..2c7bd23
--- /dev/null
+++ b/morfologik-fsa/src/main/java/morfologik/util/ResourceUtils.java
@@ -0,0 +1,58 @@
+package morfologik.util;
+
+import java.io.*;
+import java.net.*;
+
+/**
+ * Resource management utilities.
+ */
+public final class ResourceUtils {
+ /**
+ * No instances.
+ */
+ private ResourceUtils() {
+ }
+
+ /**
+ * Returns an input stream to the resource.
+ *
+ * @param resource
+ * The path leading to the resource. Can be an URL, a path
+ * leading to a class resource or a {@link File}.
+ *
+ * @return InputStream instance.
+ * @throws IOException
+ * If the resource could not be found or opened.
+ */
+ public static InputStream openInputStream(String resource)
+ throws IOException {
+ try {
+ // See if the resource is an URL first.
+ final URL url = new URL(resource);
+ // success, load the resource.
+ return url.openStream();
+ } catch (MalformedURLException e) {
+ // No luck. Fallback to class loader paths.
+ }
+
+ // Try current thread's class loader first.
+ final ClassLoader ldr = Thread.currentThread().getContextClassLoader();
+
+ InputStream is;
+ if (ldr != null && (is = ldr.getResourceAsStream(resource)) != null) {
+ return is;
+ } else if ((is = ResourceUtils.class.getResourceAsStream(resource)) != null) {
+ return is;
+ } else if ((is = ClassLoader.getSystemResourceAsStream(resource)) != null) {
+ return is;
+ }
+
+ // Try file path
+ final File f = new File(resource);
+ if (f.exists() && f.isFile() && f.canRead()) {
+ return new FileInputStream(f);
+ }
+
+ throw new IOException("Could not locate resource: " + resource);
+ }
+}