summaryrefslogtreecommitdiff
path: root/src-test/morfologik/fsa/FSATraversalTest.java
blob: ddafb6d6f38f83a00ac347d00cd07e3d6d8f051e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
package morfologik.fsa;

import static org.junit.Assert.*;
import static morfologik.fsa.MatchResult.*;

import java.io.*;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashSet;


import org.junit.Before;
import org.junit.Test;

/**
 * Tests {@link FSATraversal}.
 */
public final class FSATraversalTest {
	private FSA fsa;

	/**
     * 
     */
	@Before
	public void setUp() throws Exception {
		fsa = FSA.read(this.getClass().getResourceAsStream("en_tst.dict"));
	}

	/**
     * 
     */
	@Test
	public void testTraversalWithIterable() {
		int count = 0;
		for (ByteBuffer bb : fsa.getSequences()) {
			assertEquals(0, bb.arrayOffset());
			assertEquals(0, bb.position());
			count++;
		}
		assertEquals(346773, count);
	}

	/**
     * 
     */
	@Test
	public void testPerfectHash() throws IOException {
		byte[][] input = new byte[][] {
				{ 'a' },
				{ 'a', 'b', 'a' },
				{ 'a', 'c' },
		        { 'b' }, 
		        { 'b', 'a' },
		        { 'c' },
		};

		Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
		FSA s = FSABuilder.build(input);

		final byte[] fsaData = 
				new FSA5Serializer()
				.withNumbers()
				.serialize(s, new ByteArrayOutputStream())
				.toByteArray();

		final FSA5 fsa = (FSA5) FSA.read(new ByteArrayInputStream(fsaData));
		final FSATraversal traversal = new FSATraversal(fsa);

		int i = 0;
		for (byte [] seq : input)
		{
			assertEquals(new String(seq), i++, traversal.perfectHash(seq));
		}

		// Check if the total number of sequences is encoded at the root node.
		assertEquals(6, fsa.getRightLanguageCount(fsa.getRootNode()));

		// Check sub/super sequence scenarios.
		assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abax".getBytes("UTF-8")));
		assertEquals(SEQUENCE_IS_A_PREFIX, traversal.perfectHash("ab".getBytes("UTF-8")));
		assertEquals(NO_MATCH, traversal.perfectHash("d".getBytes("UTF-8")));
		assertEquals(NO_MATCH, traversal.perfectHash(new byte [] {0}));
		
		assertTrue(AUTOMATON_HAS_PREFIX < 0);
		assertTrue(SEQUENCE_IS_A_PREFIX < 0);
		assertTrue(NO_MATCH < 0);
	}

	/**
     * 
     */
	@Test
	public void testRecursiveTraversal() {
		final int[] counter = new int[] { 0 };

		class Recursion {
			public void dumpNode(final int node) {
				int arc = fsa.getFirstArc(node);
				do {
					if (fsa.isArcFinal(arc)) {
						counter[0]++;
					}

					if (!fsa.isArcTerminal(arc)) {
						dumpNode(fsa.getEndNode(arc));
					}

					arc = fsa.getNextArc(arc);
				} while (arc != 0);
			}
		}

		new Recursion().dumpNode(fsa.getRootNode());

		assertEquals(346773, counter[0]);
	}

	/**
     * Test {@link FSATraversal} and matching results.
     */
	@Test
	public void testMatch() throws IOException {
		final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa"));
		final FSATraversal traversalHelper = new FSATraversal(fsa);

		MatchResult m = traversalHelper.match("ax".getBytes());
		assertEquals(NO_MATCH, m.kind);
		assertEquals(1, m.index);
		assertEquals(new HashSet<String>(Arrays.asList("ba", "c")), 
				suffixes(fsa, m.node));

		assertEquals(EXACT_MATCH, 
				traversalHelper.match("aba".getBytes()).kind);

		m = traversalHelper.match("abalonger".getBytes());
		assertEquals(AUTOMATON_HAS_PREFIX, m.kind);
		assertEquals("longer", "abalonger".substring(m.index));
		
		m = traversalHelper.match("ab".getBytes());
		assertEquals(SEQUENCE_IS_A_PREFIX, m.kind);
		assertEquals(new HashSet<String>(Arrays.asList("a")), 
				suffixes(fsa, m.node));
	}

	/**
	 * Return all sequences reachable from a given node, as strings.  
	 */
	private HashSet<String> suffixes(FSA fsa, int node) {
		HashSet<String> result = new HashSet<String>(); 
		for (ByteBuffer bb : fsa.getSequences(node))
		{
			try {
	            result.add(new String(bb.array(), bb.position(), bb.remaining(), "UTF-8"));
            } catch (UnsupportedEncodingException e) {
	            throw new RuntimeException(e);
            }
		}
	    return result;
    }
}