1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
package morfologik.tools;
import java.nio.ByteBuffer;
import java.util.List;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.DictionaryMetadataBuilder;
import morfologik.stemming.EncoderType;
import org.junit.Test;
import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
/**
 * Round-trip tests for sequence encoders. The test class is instantiated once
 * for every {@link EncoderType} value (see {@link #testFactory()}), and each
 * instance checks that what the encoder produces can be decoded back both by
 * the encoder itself and by {@link DictionaryLookup#decodeBaseForm}.
 */
public class SequenceEncodersRandomizedTest extends RandomizedTest {
  /** The encoder under test, injected by the parameters factory. */
  private final SequenceEncoders.IEncoder coder;

  /**
   * @param coder the encoder instance this test run exercises
   */
  public SequenceEncodersRandomizedTest(@Name("coder") SequenceEncoders.IEncoder coder) {
    this.coder = coder;
  }

  /**
   * Builds the constructor argument sets: one single-element array per
   * {@link EncoderType}, holding the encoder returned by
   * {@code SequenceEncoders.forType}.
   *
   * @return the list of constructor argument arrays
   */
  @ParametersFactory
  public static List<Object[]> testFactory() {
    List<Object[]> parameterSets = Lists.newArrayList();
    for (EncoderType type : EncoderType.values()) {
      Object[] constructorArgs = { SequenceEncoders.forType(type) };
      parameterSets.add(constructorArgs);
    }
    return parameterSets;
  }

  /**
   * Runs 10000 round trips on pairs of random ASCII strings, each between
   * 0 and 500 characters long.
   */
  @Test
  public void testEncodeSuffixOnRandomSequences() {
    for (int remaining = 10000; remaining > 0; remaining--) {
      // The source string is drawn first, then the target string.
      String randomSrc = randomAsciiOfLengthBetween(0, 500);
      String randomDst = randomAsciiOfLengthBetween(0, 500);
      assertRoundtripEncode(randomSrc, randomDst);
    }
  }

  /**
   * Runs round trips on a fixed set of hand-picked source/target pairs.
   */
  @Test
  public void testEncodeSamples() {
    assertRoundtripEncode("", "");
    assertRoundtripEncode("abc", "ab");
    assertRoundtripEncode("abc", "abx");
    assertRoundtripEncode("ab", "abc");
    assertRoundtripEncode("xabc", "abc");
    assertRoundtripEncode("axbc", "abc");
    assertRoundtripEncode("axybc", "abc");
    assertRoundtripEncode("axybc", "abc");
    assertRoundtripEncode("azbc", "abcxy");
    assertRoundtripEncode("Niemcami", "Niemiec");
    assertRoundtripEncode("Niemiec", "Niemcami");
  }

  /**
   * Encodes {@code dstString} relative to {@code srcString} with the coder
   * under test and asserts that decoding yields {@code dstString} again,
   * first through the coder's own {@code decode} and then through
   * {@link DictionaryLookup#decodeBaseForm}. On a mismatch the involved
   * byte sequences are printed to standard output before the assertion fails.
   *
   * @param srcString the source sequence
   * @param dstString the target sequence expected after decoding
   */
  private void assertRoundtripEncode(String srcString, String dstString) {
    ByteArrayList src = ByteArrayList.from(srcString.getBytes(UTF8));
    ByteArrayList dst = ByteArrayList.from(dstString.getBytes(UTF8));
    ByteArrayList encoded = new ByteArrayList();
    ByteArrayList decoded = new ByteArrayList();

    // Round trip through the encoder itself.
    coder.encode(src, dst, encoded);
    coder.decode(src, encoded, decoded);

    boolean coderRoundtripOk = dst.equals(decoded);
    if (!coderRoundtripOk) {
      System.out.println("src: " + asUtf8(src));
      System.out.println("dst: " + asUtf8(dst));
      System.out.println("enc: " + asUtf8(encoded));
      System.out.println("dec: " + asUtf8(decoded));
    }
    assertEquals(dst, decoded);

    // DictionaryLookup.decodeBaseForm decoding testing
    DictionaryMetadataBuilder builder = new DictionaryMetadataBuilder();
    builder.encoding(Charsets.UTF_8);
    builder.encoder(coder.type());

    byte[] encodedBytes = encoded.toArray();
    int encodedLength = encoded.size();
    ByteBuffer inflected = ByteBuffer.wrap(src.toArray());
    ByteBuffer bb = DictionaryLookup.decodeBaseForm(
        ByteBuffer.allocate(0),
        encodedBytes,
        encodedLength,
        inflected,
        builder.build());

    // Flip the returned buffer and copy whatever it then exposes for reading.
    ByteArrayList decoded2 = new ByteArrayList();
    bb.flip();
    while (bb.hasRemaining()) {
      decoded2.add(bb.get());
    }

    boolean lookupRoundtripOk = dst.equals(decoded2);
    if (!lookupRoundtripOk) {
      System.out.println("DictionaryLookup.decodeBaseForm incorrect, coder: " + coder);
      System.out.println("src : " + asUtf8(src));
      System.out.println("dst : " + asUtf8(dst));
      System.out.println("enc : " + asUtf8(encoded));
      System.out.println("dec : " + asUtf8(decoded));
      System.out.println("dec2: " + asUtf8(decoded2));
    }
    assertEquals(dst, decoded2);
  }

  /**
   * Renders the bytes of {@code bytes} as a UTF-8 string for diagnostic output.
   */
  private static String asUtf8(ByteArrayList bytes) {
    return new String(bytes.toArray(), Charsets.UTF_8);
  }
}
|