summaryrefslogtreecommitdiff
path: root/morfologik-fsa/src/test/java
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-fsa/src/test/java')
-rw-r--r--morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java27
-rw-r--r--morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java10
-rw-r--r--morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java105
-rw-r--r--morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java112
-rw-r--r--morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java179
-rw-r--r--morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java160
-rw-r--r--morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java256
-rw-r--r--morfologik-fsa/src/test/java/morfologik/util/MinMax.java21
8 files changed, 870 insertions, 0 deletions
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java
new file mode 100644
index 0000000..332bbcc
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java
@@ -0,0 +1,27 @@
+package morfologik.fsa;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+/**
+ *
+ */
+public class CFSA2SerializerTest extends SerializerTestBase {
+ protected CFSA2Serializer createSerializer() {
+ return new CFSA2Serializer();
+ }
+
+ @Test
+ public void testVIntCoding() {
+ byte [] scratch = new byte [5];
+
+ int [] values = {0, 1, 128, 256, 0x1000, Integer.MAX_VALUE };
+
+ for (int v : values) {
+ int len = CFSA2.writeVInt(scratch, 0, v);
+ assertEquals(v, CFSA2.readVInt(scratch, 0));
+ assertEquals(len, CFSA2.vIntLength(v));
+ }
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java
new file mode 100644
index 0000000..1d05cfc
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java
@@ -0,0 +1,10 @@
+package morfologik.fsa;
+
+/**
+ *
+ */
+public class FSA5SerializerTest extends SerializerTestBase {
+ protected FSA5Serializer createSerializer() {
+ return new FSA5Serializer();
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java
new file mode 100644
index 0000000..97b2f05
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java
@@ -0,0 +1,105 @@
+package morfologik.fsa;
+
+import static morfologik.fsa.FSAFlags.NEXTBIT;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.Test;
+
+/**
+ * Additional tests for {@link FSA5}.
+ */
+public final class FSA5Test {
+ public ArrayList<String> expected = new ArrayList<String>(Arrays.asList(
+ "a", "aba", "ac", "b", "ba", "c"));
+
+ @Test
+ public void testVersion5() throws IOException {
+ final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa"));
+ assertFalse(fsa.getFlags().contains(FSAFlags.NUMBERS));
+ verifyContent(expected, fsa);
+ }
+
+ @Test
+ public void testVersion5WithNumbers() throws IOException {
+ final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa"));
+
+ verifyContent(expected, fsa);
+ assertTrue(fsa.getFlags().contains(FSAFlags.NUMBERS));
+ }
+
+ @Test
+ public void testArcsAndNodes() throws IOException {
+ final FSA fsa1 = FSA.read(this.getClass().getResourceAsStream(
+ "abc.fsa"));
+ final FSA fsa2 = FSA.read(this.getClass().getResourceAsStream(
+ "abc-numbers.fsa"));
+
+ FSAInfo info1 = new FSAInfo(fsa1);
+ FSAInfo info2 = new FSAInfo(fsa2);
+
+ assertEquals(info1.arcsCount, info2.arcsCount);
+ assertEquals(info1.nodeCount, info2.nodeCount);
+
+ assertEquals(4, info2.nodeCount);
+ assertEquals(7, info2.arcsCount);
+ }
+
+ @Test
+ public void testNumbers() throws IOException {
+ final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa"));
+
+ assertTrue(fsa.getFlags().contains(NEXTBIT));
+
+ // Get all numbers for nodes.
+ byte[] buffer = new byte[128];
+ final ArrayList<String> result = new ArrayList<String>();
+ walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result);
+
+ Collections.sort(result);
+ assertEquals(Arrays
+ .asList("0 c", "1 b", "2 ba", "3 a", "4 ac", "5 aba"), result);
+ }
+
+ public static void walkNode(byte[] buffer, int depth, FSA fsa, int node,
+ int cnt, List<String> result) throws IOException {
+ for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+ buffer[depth] = fsa.getArcLabel(arc);
+
+ if (fsa.isArcFinal(arc) || fsa.isArcTerminal(arc)) {
+ result.add(cnt + " " + new String(buffer, 0, depth + 1, "UTF-8"));
+ }
+
+ if (fsa.isArcFinal(arc))
+ cnt++;
+
+ if (!fsa.isArcTerminal(arc)) {
+ walkNode(buffer, depth + 1, fsa, fsa.getEndNode(arc), cnt, result);
+ cnt += fsa.getRightLanguageCount(fsa.getEndNode(arc));
+ }
+ }
+ }
+
+ private static void verifyContent(List<String> expected, FSA fsa) throws IOException {
+ final ArrayList<String> actual = new ArrayList<String>();
+
+ int count = 0;
+ for (ByteBuffer bb : fsa.getSequences()) {
+ assertEquals(0, bb.arrayOffset());
+ assertEquals(0, bb.position());
+ actual.add(new String(bb.array(), 0, bb.remaining(), "UTF-8"));
+ count++;
+ }
+ assertEquals(expected.size(), count);
+ Collections.sort(actual);
+ assertEquals(expected, actual);
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java
new file mode 100644
index 0000000..d2e1bad
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java
@@ -0,0 +1,112 @@
+package morfologik.fsa;
+
+import static morfologik.fsa.FSATestUtils.*;
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSABuilder;
+import morfologik.util.MinMax;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class FSABuilderTest {
+ private static byte[][] input;
+ private static byte[][] input2;
+
+ @BeforeClass
+ public static void prepareByteInput() {
+ input = generateRandom(25000, new MinMax(1, 20), new MinMax(0, 255));
+ input2 = generateRandom(40, new MinMax(1, 20), new MinMax(0, 3));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testEmptyInput() {
+ byte[][] input = {};
+ checkCorrect(input, FSABuilder.build(input));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testHashResizeBug() throws Exception {
+ byte[][] input = {
+ {0, 1 },
+ {0, 2 },
+ {1, 1 },
+ {2, 1 },
+ };
+
+ FSA fsa = FSABuilder.build(input);
+ checkCorrect(input, FSABuilder.build(input));
+ checkMinimal(fsa);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testSmallInput() throws Exception {
+ byte[][] input = {
+ "abc".getBytes("UTF-8"),
+ "bbc".getBytes("UTF-8"),
+ "d".getBytes("UTF-8"),
+ };
+ checkCorrect(input, FSABuilder.build(input));
+ }
+
+ /**
+ * Verify absolute byte-value ordering in the comparators and serialized automaton.
+ */
+ @Test
+ public void testLexicographicOrder() throws IOException {
+ byte[][] input = {
+ {0},
+ {1},
+ {(byte) 0xff},
+ };
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+
+ // Check if lexical ordering is consistent with absolute byte value.
+ assertEquals(0, input[0][0]);
+ assertEquals(1, input[1][0]);
+ assertEquals((byte) 0xff, input[2][0]);
+
+ final FSA fsa;
+ checkCorrect(input, fsa = FSABuilder.build(input));
+
+ int arc = fsa.getFirstArc(fsa.getRootNode());
+ assertEquals(0, fsa.getArcLabel(arc));
+ arc = fsa.getNextArc(arc);
+ assertEquals(1, fsa.getArcLabel(arc));
+ arc = fsa.getNextArc(arc);
+ assertEquals((byte) 0xff, fsa.getArcLabel(arc));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testRandom25000_largerAlphabet() {
+ FSA fsa = FSABuilder.build(input);
+ checkCorrect(input, fsa);
+ checkMinimal(fsa);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testRandom25000_smallAlphabet() throws IOException {
+ FSA fsa = FSABuilder.build(input2);
+ checkCorrect(input2, fsa);
+ checkMinimal(fsa);
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java
new file mode 100644
index 0000000..d6cfeee
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java
@@ -0,0 +1,179 @@
+package morfologik.fsa;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import morfologik.util.BufferUtils;
+import morfologik.util.MinMax;
+
+import org.junit.Assert;
+
+public class FSATestUtils {
+ /**
+ * Generate a sorted list of random sequences.
+ */
+ public static byte[][] generateRandom(int count, MinMax length,
+ MinMax alphabet) {
+ final byte[][] input = new byte[count][];
+ final Random rnd = new Random(0x11223344);
+ for (int i = 0; i < count; i++) {
+ input[i] = randomByteSequence(rnd, length, alphabet);
+ }
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ return input;
+ }
+
+ /**
+ * Generate a random string.
+ */
+ private static byte[] randomByteSequence(Random rnd, MinMax length,
+ MinMax alphabet) {
+ byte[] bytes = new byte[length.min + rnd.nextInt(length.range())];
+ for (int i = 0; i < bytes.length; i++) {
+ bytes[i] = (byte) (alphabet.min + rnd.nextInt(alphabet.range()));
+ }
+ return bytes;
+ }
+
+ /**
+ * Check if the DFSA is correct with respect to the given input.
+ */
+ public static void checkCorrect(byte[][] input, FSA fsa) {
+ // (1) All input sequences are in the right language.
+ HashSet<ByteBuffer> rl = new HashSet<ByteBuffer>();
+ for (ByteBuffer bb : fsa) {
+ rl.add(ByteBuffer.wrap(Arrays.copyOf(bb.array(), bb.remaining())));
+ }
+
+ HashSet<ByteBuffer> uniqueInput = new HashSet<ByteBuffer>();
+ for (byte[] sequence : input) {
+ uniqueInput.add(ByteBuffer.wrap(sequence));
+ }
+
+ for (ByteBuffer sequence : uniqueInput) {
+ Assert.assertTrue("Not present in the right language: "
+ + BufferUtils.toString(sequence), rl.remove(sequence));
+ }
+
+ // (2) No other sequence _other_ than the input is in the right language.
+ Assert.assertEquals(0, rl.size());
+ }
+
+ /**
+ * Check if the DFSA reachable from a given state is minimal. This means no
+ * two states have the same right language.
+ */
+ public static void checkMinimal(final FSA fsa) {
+ final HashMap<String, Integer> stateLanguages = new HashMap<String, Integer>();
+
+ fsa.visitInPostOrder(new StateVisitor() {
+ private StringBuilder b = new StringBuilder();
+
+ public boolean accept(int state) {
+ List<byte[]> rightLanguage = allSequences(fsa, state);
+ Collections.sort(rightLanguage, FSABuilder.LEXICAL_ORDERING);
+
+ b.setLength(0);
+ for (byte[] seq : rightLanguage) {
+ b.append(Arrays.toString(seq));
+ b.append(',');
+ }
+
+ String full = b.toString();
+ Assert.assertFalse("State exists: " + state + " "
+ + full + " " + stateLanguages.get(full), stateLanguages.containsKey(full));
+ stateLanguages.put(full, state);
+
+ return true;
+ }
+ });
+ }
+
+ static List<byte[]> allSequences(FSA fsa, int state) {
+ ArrayList<byte[]> seq = new ArrayList<byte[]>();
+ for (ByteBuffer bb : fsa.getSequences(state)) {
+ seq.add(Arrays.copyOf(bb.array(), bb.remaining()));
+ }
+ return seq;
+ }
+
+ /**
+ * Check if two FSAs are identical.
+ */
+ public static void checkIdentical(FSA fsa1, FSA fsa2) {
+ ArrayDeque<String> fromRoot = new ArrayDeque<String>();
+ checkIdentical(fromRoot,
+ fsa1, fsa1.getRootNode(), new BitSet(),
+ fsa2, fsa2.getRootNode(), new BitSet());
+ }
+
+ /*
+ *
+ */
+ static void checkIdentical(ArrayDeque<String> fromRoot,
+ FSA fsa1, int node1, BitSet visited1,
+ FSA fsa2, int node2, BitSet visited2) {
+ int arc1 = fsa1.getFirstArc(node1);
+ int arc2 = fsa2.getFirstArc(node2);
+
+ if (visited1.get(node1) != visited2.get(node2)) {
+ throw new RuntimeException("Two nodes should either be visited or not visited: "
+ + Arrays.toString(fromRoot.toArray()) + " "
+ + " node1: " + node1 + " "
+ + " node2: " + node2);
+ }
+ visited1.set(node1);
+ visited2.set(node2);
+
+ TreeSet<Character> labels1 = new TreeSet<Character>();
+ TreeSet<Character> labels2 = new TreeSet<Character>();
+ while (true) {
+ labels1.add((char) fsa1.getArcLabel(arc1));
+ labels2.add((char) fsa2.getArcLabel(arc2));
+
+ arc1 = fsa1.getNextArc(arc1);
+ arc2 = fsa2.getNextArc(arc2);
+
+ if (arc1 == 0 || arc2 == 0) {
+ if (arc1 != arc2) {
+ throw new RuntimeException("Different number of labels at path: "
+ + Arrays.toString(fromRoot.toArray()));
+ }
+ break;
+ }
+ }
+
+ if (!labels1.equals(labels2)) {
+ throw new RuntimeException("Different sets of labels at path: "
+ + Arrays.toString(fromRoot.toArray()) + ":\n"
+ + labels1 + "\n" + labels2);
+ }
+
+ // recurse.
+ for (char chr : labels1) {
+ byte label = (byte) chr;
+ fromRoot.push(Character.isLetterOrDigit(chr) ? Character.toString(chr) : Integer.toString(chr));
+
+ arc1 = fsa1.getArc(node1, label);
+ arc2 = fsa2.getArc(node2, label);
+
+ if (fsa1.isArcFinal(arc1) != fsa2.isArcFinal(arc2)) {
+ throw new RuntimeException("Different final flag on arcs at: "
+ + Arrays.toString(fromRoot.toArray()) + ", label: " + label);
+ }
+
+ if (fsa1.isArcTerminal(arc1) != fsa2.isArcTerminal(arc2)) {
+ throw new RuntimeException("Different terminal flag on arcs at: "
+ + Arrays.toString(fromRoot.toArray()) + ", label: " + label);
+ }
+
+ if (!fsa1.isArcTerminal(arc1)) {
+ checkIdentical(fromRoot,
+ fsa1, fsa1.getEndNode(arc1), visited1,
+ fsa2, fsa2.getEndNode(arc2), visited2);
+ }
+
+ fromRoot.pop();
+ }
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java
new file mode 100644
index 0000000..ddafb6d
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java
@@ -0,0 +1,160 @@
+package morfologik.fsa;
+
+import static org.junit.Assert.*;
+import static morfologik.fsa.MatchResult.*;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.HashSet;
+
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests {@link FSATraversal}.
+ */
+public final class FSATraversalTest {
+ private FSA fsa;
+
+ /**
+ *
+ */
+ @Before
+ public void setUp() throws Exception {
+ fsa = FSA.read(this.getClass().getResourceAsStream("en_tst.dict"));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testTraversalWithIterable() {
+ int count = 0;
+ for (ByteBuffer bb : fsa.getSequences()) {
+ assertEquals(0, bb.arrayOffset());
+ assertEquals(0, bb.position());
+ count++;
+ }
+ assertEquals(346773, count);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testPerfectHash() throws IOException {
+ byte[][] input = new byte[][] {
+ { 'a' },
+ { 'a', 'b', 'a' },
+ { 'a', 'c' },
+ { 'b' },
+ { 'b', 'a' },
+ { 'c' },
+ };
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ final byte[] fsaData =
+ new FSA5Serializer()
+ .withNumbers()
+ .serialize(s, new ByteArrayOutputStream())
+ .toByteArray();
+
+ final FSA5 fsa = (FSA5) FSA.read(new ByteArrayInputStream(fsaData));
+ final FSATraversal traversal = new FSATraversal(fsa);
+
+ int i = 0;
+ for (byte [] seq : input)
+ {
+ assertEquals(new String(seq), i++, traversal.perfectHash(seq));
+ }
+
+ // Check if the total number of sequences is encoded at the root node.
+ assertEquals(6, fsa.getRightLanguageCount(fsa.getRootNode()));
+
+ // Check sub/super sequence scenarios.
+ assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abax".getBytes("UTF-8")));
+ assertEquals(SEQUENCE_IS_A_PREFIX, traversal.perfectHash("ab".getBytes("UTF-8")));
+ assertEquals(NO_MATCH, traversal.perfectHash("d".getBytes("UTF-8")));
+ assertEquals(NO_MATCH, traversal.perfectHash(new byte [] {0}));
+
+ assertTrue(AUTOMATON_HAS_PREFIX < 0);
+ assertTrue(SEQUENCE_IS_A_PREFIX < 0);
+ assertTrue(NO_MATCH < 0);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testRecursiveTraversal() {
+ final int[] counter = new int[] { 0 };
+
+ class Recursion {
+ public void dumpNode(final int node) {
+ int arc = fsa.getFirstArc(node);
+ do {
+ if (fsa.isArcFinal(arc)) {
+ counter[0]++;
+ }
+
+ if (!fsa.isArcTerminal(arc)) {
+ dumpNode(fsa.getEndNode(arc));
+ }
+
+ arc = fsa.getNextArc(arc);
+ } while (arc != 0);
+ }
+ }
+
+ new Recursion().dumpNode(fsa.getRootNode());
+
+ assertEquals(346773, counter[0]);
+ }
+
+ /**
+ * Test {@link FSATraversal} and matching results.
+ */
+ @Test
+ public void testMatch() throws IOException {
+ final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa"));
+ final FSATraversal traversalHelper = new FSATraversal(fsa);
+
+ MatchResult m = traversalHelper.match("ax".getBytes());
+ assertEquals(NO_MATCH, m.kind);
+ assertEquals(1, m.index);
+ assertEquals(new HashSet<String>(Arrays.asList("ba", "c")),
+ suffixes(fsa, m.node));
+
+ assertEquals(EXACT_MATCH,
+ traversalHelper.match("aba".getBytes()).kind);
+
+ m = traversalHelper.match("abalonger".getBytes());
+ assertEquals(AUTOMATON_HAS_PREFIX, m.kind);
+ assertEquals("longer", "abalonger".substring(m.index));
+
+ m = traversalHelper.match("ab".getBytes());
+ assertEquals(SEQUENCE_IS_A_PREFIX, m.kind);
+ assertEquals(new HashSet<String>(Arrays.asList("a")),
+ suffixes(fsa, m.node));
+ }
+
+ /**
+ * Return all sequences reachable from a given node, as strings.
+ */
+ private HashSet<String> suffixes(FSA fsa, int node) {
+ HashSet<String> result = new HashSet<String>();
+ for (ByteBuffer bb : fsa.getSequences(node))
+ {
+ try {
+ result.add(new String(bb.array(), bb.position(), bb.remaining(), "UTF-8"));
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return result;
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java b/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java
new file mode 100644
index 0000000..ce373ba
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java
@@ -0,0 +1,256 @@
+package morfologik.fsa;
+
+import static morfologik.fsa.FSAFlags.NUMBERS;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import morfologik.util.BufferUtils;
+
+import org.junit.*;
+
+public abstract class SerializerTestBase {
+ @Test
+ public void testA() throws IOException {
+ byte[][] input = new byte[][] {
+ { 'a' },
+ };
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ checkSerialization(input, s);
+ }
+
+ @Test
+ public void testArcsSharing() throws IOException {
+ byte[][] input = new byte[][] {
+ { 'a', 'c', 'f' },
+ { 'a', 'd', 'g' },
+ { 'a', 'e', 'h' },
+ { 'b', 'd', 'g' },
+ { 'b', 'e', 'h' },
+ };
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ checkSerialization(input, s);
+ }
+
+ @Test
+ public void testFSA5SerializerSimple() throws IOException {
+ byte[][] input = new byte[][] {
+ { 'a' },
+ { 'a', 'b', 'a' },
+ { 'a', 'c' },
+ { 'b' },
+ { 'b', 'a' },
+ { 'c' },
+ };
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ checkSerialization(input, s);
+ }
+
+ @Test
+ public void testNotMinimal() throws IOException {
+ byte[][] input = new byte[][] {
+ { 'a', 'b', 'a' },
+ { 'b' },
+ { 'b', 'a' }
+ };
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ checkSerialization(input, s);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testFSA5Bug0() throws IOException {
+ checkCorrect(new String[] {
+ "3-D+A+JJ",
+ "3-D+A+NN",
+ "4-F+A+NN",
+ "z+A+NN", });
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testFSA5Bug1() throws IOException {
+ checkCorrect(new String[] { "+NP", "n+N", "n+NP", });
+ }
+
+ private void checkCorrect(String[] strings) throws IOException {
+ byte[][] input = new byte[strings.length][];
+ for (int i = 0; i < strings.length; i++) {
+ input[i] = strings[i].getBytes("ISO8859-1");
+ }
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ checkSerialization(input, s);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testEmptyInput() throws IOException {
+ byte[][] input = new byte[][] {};
+ FSA s = FSABuilder.build(input);
+
+ checkSerialization(input, s);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void test_abc() throws IOException {
+ testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("abc.fsa")));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void test_minimal() throws IOException {
+ testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal.fsa")));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void test_minimal2() throws IOException {
+ testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal2.fsa")));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void test_en_tst() throws IOException {
+ testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("en_tst.dict")));
+ }
+
+ private void testBuiltIn(FSA fsa) throws IOException {
+ final ArrayList<byte[]> sequences = new ArrayList<byte[]>();
+
+ sequences.clear();
+ for (ByteBuffer bb : fsa) {
+ sequences.add(Arrays.copyOf(bb.array(), bb.remaining()));
+ }
+
+ Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING);
+
+ final byte[][] in = sequences.toArray(new byte[sequences.size()][]);
+ FSA root = FSABuilder.build(in);
+
+ // Check if the DFSA is correct first.
+ FSATestUtils.checkCorrect(in, root);
+
+ // Check serialization.
+ checkSerialization(in, root);
+ }
+
+ /** */
+ private void checkSerialization(byte[][] input, FSA root)
+ throws IOException {
+ checkSerialization0(createSerializer(), input, root);
+ if (createSerializer().getFlags().contains(FSAFlags.NUMBERS)) {
+ checkSerialization0(createSerializer().withNumbers(), input, root);
+ }
+ }
+
+ /** */
+ private void checkSerialization0(FSASerializer serializer,
+ final byte[][] in, FSA root) throws IOException {
+ final byte[] fsaData = serializer.serialize(root,
+ new ByteArrayOutputStream()).toByteArray();
+
+ FSA fsa = FSA.read(new ByteArrayInputStream(fsaData));
+ checkCorrect(in, fsa);
+ }
+
+ /**
+ * Check if the FSA is correct with respect to the given input.
+ */
+ protected void checkCorrect(byte[][] input, FSA fsa) {
+ // (1) All input sequences are in the right language.
+ HashSet<ByteBuffer> rl = new HashSet<ByteBuffer>();
+ for (ByteBuffer bb : fsa) {
+ byte[] array = bb.array();
+ int length = bb.remaining();
+ rl.add(ByteBuffer.wrap(Arrays.copyOf(array, length)));
+ }
+
+ HashSet<ByteBuffer> uniqueInput = new HashSet<ByteBuffer>();
+ for (byte[] sequence : input) {
+ uniqueInput.add(ByteBuffer.wrap(sequence));
+ }
+
+ for (ByteBuffer sequence : uniqueInput) {
+ Assert.assertTrue("Not present in the right language: "
+ + BufferUtils.toString(sequence), rl.remove(sequence));
+ }
+
+ // (2) No other sequence _other_ than the input is in the right
+ // language.
+ Assert.assertEquals(0, rl.size());
+ }
+
+ @Test
+ public void testAutomatonWithNodeNumbers() throws IOException {
+ Assume.assumeTrue(createSerializer().getFlags().contains(FSAFlags.NUMBERS));
+
+ byte[][] input = new byte[][] {
+ { 'a' },
+ { 'a', 'b', 'a' },
+ { 'a', 'c' },
+ { 'b' },
+ { 'b', 'a' },
+ { 'c' }, };
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ final byte[] fsaData =
+ createSerializer()
+ .withNumbers()
+ .serialize(s, new ByteArrayOutputStream()).toByteArray();
+
+ FSA fsa = FSA.read(new ByteArrayInputStream(fsaData));
+
+ // Ensure we have the NUMBERS flag set.
+ assertTrue(fsa.getFlags().contains(NUMBERS));
+
+ // Get all numbers from nodes.
+ byte[] buffer = new byte[128];
+ final ArrayList<String> result = new ArrayList<String>();
+ FSA5Test.walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result);
+
+ Collections.sort(result);
+ assertEquals(
+ Arrays.asList("0 a", "1 aba", "2 ac", "3 b", "4 ba", "5 c"),
+ result);
+ }
+
+ /**
+ *
+ */
+ protected abstract FSASerializer createSerializer();
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/util/MinMax.java b/morfologik-fsa/src/test/java/morfologik/util/MinMax.java
new file mode 100644
index 0000000..4af6118
--- /dev/null
+++ b/morfologik-fsa/src/test/java/morfologik/util/MinMax.java
@@ -0,0 +1,21 @@
+package morfologik.util;
+
+/**
+ * Minimum/maximum and range.
+ */
+public final class MinMax
+{
+ public final int min;
+ public final int max;
+
+ public MinMax(int min, int max)
+ {
+ this.min = Math.min(min, max);
+ this.max = Math.max(min, max);
+ }
+
+ public int range()
+ {
+ return max - min;
+ }
+} \ No newline at end of file