diff options
Diffstat (limited to 'morfologik-fsa/src/test/java')
8 files changed, 870 insertions, 0 deletions
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java new file mode 100644 index 0000000..332bbcc --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java @@ -0,0 +1,27 @@ +package morfologik.fsa;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+/**
+ *
+ */
+public class CFSA2SerializerTest extends SerializerTestBase {
+ protected CFSA2Serializer createSerializer() {
+ return new CFSA2Serializer();
+ }
+
+ @Test
+ public void testVIntCoding() {
+ byte [] scratch = new byte [5];
+
+ int [] values = {0, 1, 128, 256, 0x1000, Integer.MAX_VALUE };
+
+ for (int v : values) {
+ int len = CFSA2.writeVInt(scratch, 0, v);
+ assertEquals(v, CFSA2.readVInt(scratch, 0));
+ assertEquals(len, CFSA2.vIntLength(v));
+ }
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java new file mode 100644 index 0000000..1d05cfc --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java @@ -0,0 +1,10 @@ +package morfologik.fsa;
+
+/**
+ *
+ */
+public class FSA5SerializerTest extends SerializerTestBase {
+ protected FSA5Serializer createSerializer() {
+ return new FSA5Serializer();
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java new file mode 100644 index 0000000..97b2f05 --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java @@ -0,0 +1,105 @@ +package morfologik.fsa;
+
+import static morfologik.fsa.FSAFlags.NEXTBIT;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.Test;
+
+/**
+ * Additional tests for {@link FSA5}.
+ */
+public final class FSA5Test {
+ public ArrayList<String> expected = new ArrayList<String>(Arrays.asList(
+ "a", "aba", "ac", "b", "ba", "c"));
+
+ @Test
+ public void testVersion5() throws IOException {
+ final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa"));
+ assertFalse(fsa.getFlags().contains(FSAFlags.NUMBERS));
+ verifyContent(expected, fsa);
+ }
+
+ @Test
+ public void testVersion5WithNumbers() throws IOException {
+ final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa"));
+
+ verifyContent(expected, fsa);
+ assertTrue(fsa.getFlags().contains(FSAFlags.NUMBERS));
+ }
+
+ @Test
+ public void testArcsAndNodes() throws IOException {
+ final FSA fsa1 = FSA.read(this.getClass().getResourceAsStream(
+ "abc.fsa"));
+ final FSA fsa2 = FSA.read(this.getClass().getResourceAsStream(
+ "abc-numbers.fsa"));
+
+ FSAInfo info1 = new FSAInfo(fsa1);
+ FSAInfo info2 = new FSAInfo(fsa2);
+
+ assertEquals(info1.arcsCount, info2.arcsCount);
+ assertEquals(info1.nodeCount, info2.nodeCount);
+
+ assertEquals(4, info2.nodeCount);
+ assertEquals(7, info2.arcsCount);
+ }
+
+ @Test
+ public void testNumbers() throws IOException {
+ final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa"));
+
+ assertTrue(fsa.getFlags().contains(NEXTBIT));
+
+ // Get all numbers for nodes.
+ byte[] buffer = new byte[128];
+ final ArrayList<String> result = new ArrayList<String>();
+ walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result);
+
+ Collections.sort(result);
+ assertEquals(Arrays
+ .asList("0 c", "1 b", "2 ba", "3 a", "4 ac", "5 aba"), result);
+ }
+
+ public static void walkNode(byte[] buffer, int depth, FSA fsa, int node,
+ int cnt, List<String> result) throws IOException {
+ for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+ buffer[depth] = fsa.getArcLabel(arc);
+
+ if (fsa.isArcFinal(arc) || fsa.isArcTerminal(arc)) {
+ result.add(cnt + " " + new String(buffer, 0, depth + 1, "UTF-8"));
+ }
+
+ if (fsa.isArcFinal(arc))
+ cnt++;
+
+ if (!fsa.isArcTerminal(arc)) {
+ walkNode(buffer, depth + 1, fsa, fsa.getEndNode(arc), cnt, result);
+ cnt += fsa.getRightLanguageCount(fsa.getEndNode(arc));
+ }
+ }
+ }
+
+ private static void verifyContent(List<String> expected, FSA fsa) throws IOException {
+ final ArrayList<String> actual = new ArrayList<String>();
+
+ int count = 0;
+ for (ByteBuffer bb : fsa.getSequences()) {
+ assertEquals(0, bb.arrayOffset());
+ assertEquals(0, bb.position());
+ actual.add(new String(bb.array(), 0, bb.remaining(), "UTF-8"));
+ count++;
+ }
+ assertEquals(expected.size(), count);
+ Collections.sort(actual);
+ assertEquals(expected, actual);
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java new file mode 100644 index 0000000..d2e1bad --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java @@ -0,0 +1,112 @@ +package morfologik.fsa; + +import static morfologik.fsa.FSATestUtils.*; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.Arrays; + +import morfologik.fsa.FSA; +import morfologik.fsa.FSABuilder; +import morfologik.util.MinMax; + +import org.junit.BeforeClass; +import org.junit.Test; + +public class FSABuilderTest { + private static byte[][] input; + private static byte[][] input2; + + @BeforeClass + public static void prepareByteInput() { + input = generateRandom(25000, new MinMax(1, 20), new MinMax(0, 255)); + input2 = generateRandom(40, new MinMax(1, 20), new MinMax(0, 3)); + } + + /** + * + */ + @Test + public void testEmptyInput() { + byte[][] input = {}; + checkCorrect(input, FSABuilder.build(input)); + } + + /** + * + */ + @Test + public void testHashResizeBug() throws Exception { + byte[][] input = { + {0, 1 }, + {0, 2 }, + {1, 1 }, + {2, 1 }, + }; + + FSA fsa = FSABuilder.build(input); + checkCorrect(input, FSABuilder.build(input)); + checkMinimal(fsa); + } + + /** + * + */ + @Test + public void testSmallInput() throws Exception { + byte[][] input = { + "abc".getBytes("UTF-8"), + "bbc".getBytes("UTF-8"), + "d".getBytes("UTF-8"), + }; + checkCorrect(input, FSABuilder.build(input)); + } + + /** + * Verify absolute byte-value ordering in the comparators and serialized automaton. + */ + @Test + public void testLexicographicOrder() throws IOException { + byte[][] input = { + {0}, + {1}, + {(byte) 0xff}, + }; + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + + // Check if lexical ordering is consistent with absolute byte value. + assertEquals(0, input[0][0]); + assertEquals(1, input[1][0]); + assertEquals((byte) 0xff, input[2][0]); + + final FSA fsa; + checkCorrect(input, fsa = FSABuilder.build(input)); + + int arc = fsa.getFirstArc(fsa.getRootNode()); + assertEquals(0, fsa.getArcLabel(arc)); + arc = fsa.getNextArc(arc); + assertEquals(1, fsa.getArcLabel(arc)); + arc = fsa.getNextArc(arc); + assertEquals((byte) 0xff, fsa.getArcLabel(arc)); + } + + /** + * + */ + @Test + public void testRandom25000_largerAlphabet() { + FSA fsa = FSABuilder.build(input); + checkCorrect(input, fsa); + checkMinimal(fsa); + } + + /** + * + */ + @Test + public void testRandom25000_smallAlphabet() throws IOException { + FSA fsa = FSABuilder.build(input2); + checkCorrect(input2, fsa); + checkMinimal(fsa); + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java new file mode 100644 index 0000000..d6cfeee --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java @@ -0,0 +1,179 @@ +package morfologik.fsa; + +import java.nio.ByteBuffer; +import java.util.*; + +import morfologik.util.BufferUtils; +import morfologik.util.MinMax; + +import org.junit.Assert; + +public class FSATestUtils { + /** + * Generate a sorted list of random sequences. + */ + public static byte[][] generateRandom(int count, MinMax length, + MinMax alphabet) { + final byte[][] input = new byte[count][]; + final Random rnd = new Random(0x11223344); + for (int i = 0; i < count; i++) { + input[i] = randomByteSequence(rnd, length, alphabet); + } + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + return input; + } + + /** + * Generate a random string. + */ + private static byte[] randomByteSequence(Random rnd, MinMax length, + MinMax alphabet) { + byte[] bytes = new byte[length.min + rnd.nextInt(length.range())]; + for (int i = 0; i < bytes.length; i++) { + bytes[i] = (byte) (alphabet.min + rnd.nextInt(alphabet.range())); + } + return bytes; + } + + /** + * Check if the DFSA is correct with respect to the given input. + */ + public static void checkCorrect(byte[][] input, FSA fsa) { + // (1) All input sequences are in the right language. + HashSet<ByteBuffer> rl = new HashSet<ByteBuffer>(); + for (ByteBuffer bb : fsa) { + rl.add(ByteBuffer.wrap(Arrays.copyOf(bb.array(), bb.remaining()))); + } + + HashSet<ByteBuffer> uniqueInput = new HashSet<ByteBuffer>(); + for (byte[] sequence : input) { + uniqueInput.add(ByteBuffer.wrap(sequence)); + } + + for (ByteBuffer sequence : uniqueInput) { + Assert.assertTrue("Not present in the right language: " + + BufferUtils.toString(sequence), rl.remove(sequence)); + } + + // (2) No other sequence _other_ than the input is in the right language. + Assert.assertEquals(0, rl.size()); + } + + /** + * Check if the DFSA reachable from a given state is minimal. This means no + * two states have the same right language. + */ + public static void checkMinimal(final FSA fsa) { + final HashMap<String, Integer> stateLanguages = new HashMap<String, Integer>(); + + fsa.visitInPostOrder(new StateVisitor() { + private StringBuilder b = new StringBuilder(); + + public boolean accept(int state) { + List<byte[]> rightLanguage = allSequences(fsa, state); + Collections.sort(rightLanguage, FSABuilder.LEXICAL_ORDERING); + + b.setLength(0); + for (byte[] seq : rightLanguage) { + b.append(Arrays.toString(seq)); + b.append(','); + } + + String full = b.toString(); + Assert.assertFalse("State exists: " + state + " " + + full + " " + stateLanguages.get(full), stateLanguages.containsKey(full)); + stateLanguages.put(full, state); + + return true; + } + }); + } + + static List<byte[]> allSequences(FSA fsa, int state) { + ArrayList<byte[]> seq = new ArrayList<byte[]>(); + for (ByteBuffer bb : fsa.getSequences(state)) { + seq.add(Arrays.copyOf(bb.array(), bb.remaining())); + } + return seq; + } + + /** + * Check if two FSAs are identical. + */ + public static void checkIdentical(FSA fsa1, FSA fsa2) { + ArrayDeque<String> fromRoot = new ArrayDeque<String>(); + checkIdentical(fromRoot, + fsa1, fsa1.getRootNode(), new BitSet(), + fsa2, fsa2.getRootNode(), new BitSet()); + } + + /* + * + */ + static void checkIdentical(ArrayDeque<String> fromRoot, + FSA fsa1, int node1, BitSet visited1, + FSA fsa2, int node2, BitSet visited2) { + int arc1 = fsa1.getFirstArc(node1); + int arc2 = fsa2.getFirstArc(node2); + + if (visited1.get(node1) != visited2.get(node2)) { + throw new RuntimeException("Two nodes should either be visited or not visited: " + + Arrays.toString(fromRoot.toArray()) + " " + + " node1: " + node1 + " " + + " node2: " + node2); + } + visited1.set(node1); + visited2.set(node2); + + TreeSet<Character> labels1 = new TreeSet<Character>(); + TreeSet<Character> labels2 = new TreeSet<Character>(); + while (true) { + labels1.add((char) fsa1.getArcLabel(arc1)); + labels2.add((char) fsa2.getArcLabel(arc2)); + + arc1 = fsa1.getNextArc(arc1); + arc2 = fsa2.getNextArc(arc2); + + if (arc1 == 0 || arc2 == 0) { + if (arc1 != arc2) { + throw new RuntimeException("Different number of labels at path: " + + Arrays.toString(fromRoot.toArray())); + } + break; + } + } + + if (!labels1.equals(labels2)) { + throw new RuntimeException("Different sets of labels at path: " + + Arrays.toString(fromRoot.toArray()) + ":\n" + + labels1 + "\n" + labels2); + } + + // recurse. + for (char chr : labels1) { + byte label = (byte) chr; + fromRoot.push(Character.isLetterOrDigit(chr) ? Character.toString(chr) : Integer.toString(chr)); + + arc1 = fsa1.getArc(node1, label); + arc2 = fsa2.getArc(node2, label); + + if (fsa1.isArcFinal(arc1) != fsa2.isArcFinal(arc2)) { + throw new RuntimeException("Different final flag on arcs at: " + + Arrays.toString(fromRoot.toArray()) + ", label: " + label); + } + + if (fsa1.isArcTerminal(arc1) != fsa2.isArcTerminal(arc2)) { + throw new RuntimeException("Different terminal flag on arcs at: " + + Arrays.toString(fromRoot.toArray()) + ", label: " + label); + } + + if (!fsa1.isArcTerminal(arc1)) { + checkIdentical(fromRoot, + fsa1, fsa1.getEndNode(arc1), visited1, + fsa2, fsa2.getEndNode(arc2), visited2); + } + + fromRoot.pop(); + } + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java new file mode 100644 index 0000000..ddafb6d --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java @@ -0,0 +1,160 @@ +package morfologik.fsa;
+
+import static org.junit.Assert.*;
+import static morfologik.fsa.MatchResult.*;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.HashSet;
+
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests {@link FSATraversal}.
+ */
+public final class FSATraversalTest {
+ private FSA fsa;
+
+ /**
+ *
+ */
+ @Before
+ public void setUp() throws Exception {
+ fsa = FSA.read(this.getClass().getResourceAsStream("en_tst.dict"));
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testTraversalWithIterable() {
+ int count = 0;
+ for (ByteBuffer bb : fsa.getSequences()) {
+ assertEquals(0, bb.arrayOffset());
+ assertEquals(0, bb.position());
+ count++;
+ }
+ assertEquals(346773, count);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testPerfectHash() throws IOException {
+ byte[][] input = new byte[][] {
+ { 'a' },
+ { 'a', 'b', 'a' },
+ { 'a', 'c' },
+ { 'b' },
+ { 'b', 'a' },
+ { 'c' },
+ };
+
+ Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
+ FSA s = FSABuilder.build(input);
+
+ final byte[] fsaData =
+ new FSA5Serializer()
+ .withNumbers()
+ .serialize(s, new ByteArrayOutputStream())
+ .toByteArray();
+
+ final FSA5 fsa = (FSA5) FSA.read(new ByteArrayInputStream(fsaData));
+ final FSATraversal traversal = new FSATraversal(fsa);
+
+ int i = 0;
+ for (byte [] seq : input)
+ {
+ assertEquals(new String(seq), i++, traversal.perfectHash(seq));
+ }
+
+ // Check if the total number of sequences is encoded at the root node.
+ assertEquals(6, fsa.getRightLanguageCount(fsa.getRootNode()));
+
+ // Check sub/super sequence scenarios.
+ assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abax".getBytes("UTF-8")));
+ assertEquals(SEQUENCE_IS_A_PREFIX, traversal.perfectHash("ab".getBytes("UTF-8")));
+ assertEquals(NO_MATCH, traversal.perfectHash("d".getBytes("UTF-8")));
+ assertEquals(NO_MATCH, traversal.perfectHash(new byte [] {0}));
+
+ assertTrue(AUTOMATON_HAS_PREFIX < 0);
+ assertTrue(SEQUENCE_IS_A_PREFIX < 0);
+ assertTrue(NO_MATCH < 0);
+ }
+
+ /**
+ *
+ */
+ @Test
+ public void testRecursiveTraversal() {
+ final int[] counter = new int[] { 0 };
+
+ class Recursion {
+ public void dumpNode(final int node) {
+ int arc = fsa.getFirstArc(node);
+ do {
+ if (fsa.isArcFinal(arc)) {
+ counter[0]++;
+ }
+
+ if (!fsa.isArcTerminal(arc)) {
+ dumpNode(fsa.getEndNode(arc));
+ }
+
+ arc = fsa.getNextArc(arc);
+ } while (arc != 0);
+ }
+ }
+
+ new Recursion().dumpNode(fsa.getRootNode());
+
+ assertEquals(346773, counter[0]);
+ }
+
+ /**
+ * Test {@link FSATraversal} and matching results.
+ */
+ @Test
+ public void testMatch() throws IOException {
+ final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa"));
+ final FSATraversal traversalHelper = new FSATraversal(fsa);
+
+ MatchResult m = traversalHelper.match("ax".getBytes());
+ assertEquals(NO_MATCH, m.kind);
+ assertEquals(1, m.index);
+ assertEquals(new HashSet<String>(Arrays.asList("ba", "c")),
+ suffixes(fsa, m.node));
+
+ assertEquals(EXACT_MATCH,
+ traversalHelper.match("aba".getBytes()).kind);
+
+ m = traversalHelper.match("abalonger".getBytes());
+ assertEquals(AUTOMATON_HAS_PREFIX, m.kind);
+ assertEquals("longer", "abalonger".substring(m.index));
+
+ m = traversalHelper.match("ab".getBytes());
+ assertEquals(SEQUENCE_IS_A_PREFIX, m.kind);
+ assertEquals(new HashSet<String>(Arrays.asList("a")),
+ suffixes(fsa, m.node));
+ }
+
+ /**
+ * Return all sequences reachable from a given node, as strings.
+ */
+ private HashSet<String> suffixes(FSA fsa, int node) {
+ HashSet<String> result = new HashSet<String>();
+ for (ByteBuffer bb : fsa.getSequences(node))
+ {
+ try {
+ result.add(new String(bb.array(), bb.position(), bb.remaining(), "UTF-8"));
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return result;
+ }
+}
diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java b/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java new file mode 100644 index 0000000..ce373ba --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java @@ -0,0 +1,256 @@ +package morfologik.fsa; + +import static morfologik.fsa.FSAFlags.NUMBERS; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.*; +import java.nio.ByteBuffer; +import java.util.*; + +import morfologik.util.BufferUtils; + +import org.junit.*; + +public abstract class SerializerTestBase { + @Test + public void testA() throws IOException { + byte[][] input = new byte[][] { + { 'a' }, + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + @Test + public void testArcsSharing() throws IOException { + byte[][] input = new byte[][] { + { 'a', 'c', 'f' }, + { 'a', 'd', 'g' }, + { 'a', 'e', 'h' }, + { 'b', 'd', 'g' }, + { 'b', 'e', 'h' }, + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + @Test + public void testFSA5SerializerSimple() throws IOException { + byte[][] input = new byte[][] { + { 'a' }, + { 'a', 'b', 'a' }, + { 'a', 'c' }, + { 'b' }, + { 'b', 'a' }, + { 'c' }, + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + @Test + public void testNotMinimal() throws IOException { + byte[][] input = new byte[][] { + { 'a', 'b', 'a' }, + { 'b' }, + { 'b', 'a' } + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + /** + * + */ + @Test + public void testFSA5Bug0() throws IOException { + checkCorrect(new String[] { + "3-D+A+JJ", + "3-D+A+NN", + "4-F+A+NN", + "z+A+NN", }); + } + + /** + * + */ + @Test + public void testFSA5Bug1() throws IOException { + checkCorrect(new String[] { "+NP", "n+N", "n+NP", }); + } + + private void checkCorrect(String[] strings) throws IOException { + byte[][] input = new byte[strings.length][]; + for (int i = 0; i < strings.length; i++) { + input[i] = strings[i].getBytes("ISO8859-1"); + } + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + /** + * + */ + @Test + public void testEmptyInput() throws IOException { + byte[][] input = new byte[][] {}; + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + /** + * + */ + @Test + public void test_abc() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("abc.fsa"))); + } + + /** + * + */ + @Test + public void test_minimal() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal.fsa"))); + } + + /** + * + */ + @Test + public void test_minimal2() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal2.fsa"))); + } + + /** + * + */ + @Test + public void test_en_tst() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("en_tst.dict"))); + } + + private void testBuiltIn(FSA fsa) throws IOException { + final ArrayList<byte[]> sequences = new ArrayList<byte[]>(); + + sequences.clear(); + for (ByteBuffer bb : fsa) { + sequences.add(Arrays.copyOf(bb.array(), bb.remaining())); + } + + Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING); + + final byte[][] in = sequences.toArray(new byte[sequences.size()][]); + FSA root = FSABuilder.build(in); + + // Check if the DFSA is correct first. + FSATestUtils.checkCorrect(in, root); + + // Check serialization. + checkSerialization(in, root); + } + + /** */ + private void checkSerialization(byte[][] input, FSA root) + throws IOException { + checkSerialization0(createSerializer(), input, root); + if (createSerializer().getFlags().contains(FSAFlags.NUMBERS)) { + checkSerialization0(createSerializer().withNumbers(), input, root); + } + } + + /** */ + private void checkSerialization0(FSASerializer serializer, + final byte[][] in, FSA root) throws IOException { + final byte[] fsaData = serializer.serialize(root, + new ByteArrayOutputStream()).toByteArray(); + + FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); + checkCorrect(in, fsa); + } + + /** + * Check if the FSA is correct with respect to the given input. + */ + protected void checkCorrect(byte[][] input, FSA fsa) { + // (1) All input sequences are in the right language. + HashSet<ByteBuffer> rl = new HashSet<ByteBuffer>(); + for (ByteBuffer bb : fsa) { + byte[] array = bb.array(); + int length = bb.remaining(); + rl.add(ByteBuffer.wrap(Arrays.copyOf(array, length))); + } + + HashSet<ByteBuffer> uniqueInput = new HashSet<ByteBuffer>(); + for (byte[] sequence : input) { + uniqueInput.add(ByteBuffer.wrap(sequence)); + } + + for (ByteBuffer sequence : uniqueInput) { + Assert.assertTrue("Not present in the right language: " + + BufferUtils.toString(sequence), rl.remove(sequence)); + } + + // (2) No other sequence _other_ than the input is in the right + // language. + Assert.assertEquals(0, rl.size()); + } + + @Test + public void testAutomatonWithNodeNumbers() throws IOException { + Assume.assumeTrue(createSerializer().getFlags().contains(FSAFlags.NUMBERS)); + + byte[][] input = new byte[][] { + { 'a' }, + { 'a', 'b', 'a' }, + { 'a', 'c' }, + { 'b' }, + { 'b', 'a' }, + { 'c' }, }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + final byte[] fsaData = + createSerializer() + .withNumbers() + .serialize(s, new ByteArrayOutputStream()).toByteArray(); + + FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); + + // Ensure we have the NUMBERS flag set. + assertTrue(fsa.getFlags().contains(NUMBERS)); + + // Get all numbers from nodes. + byte[] buffer = new byte[128]; + final ArrayList<String> result = new ArrayList<String>(); + FSA5Test.walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result); + + Collections.sort(result); + assertEquals( + Arrays.asList("0 a", "1 aba", "2 ac", "3 b", "4 ba", "5 c"), + result); + } + + /** + * + */ + protected abstract FSASerializer createSerializer(); +} diff --git a/morfologik-fsa/src/test/java/morfologik/util/MinMax.java b/morfologik-fsa/src/test/java/morfologik/util/MinMax.java new file mode 100644 index 0000000..4af6118 --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/util/MinMax.java @@ -0,0 +1,21 @@ +package morfologik.util;
+
+/**
+ * Minimum/maximum and range.
+ */
+public final class MinMax
+{
+ public final int min;
+ public final int max;
+
+ public MinMax(int min, int max)
+ {
+ this.min = Math.min(min, max);
+ this.max = Math.max(min, max);
+ }
+
+ public int range()
+ {
+ return max - min;
+ }
+}
\ No newline at end of file |