summaryrefslogtreecommitdiff
path: root/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java
blob: 40be4a852227b10437bf44e253193a92897b91a7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package morfologik.tools;

import java.nio.ByteBuffer;
import java.util.List;

import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.DictionaryMetadataBuilder;
import morfologik.stemming.EncoderType;

import org.junit.Test;

import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;

/**
 * Round-trip tests for sequence encoders.
 *
 * <p>The test is parameterized: {@link #testFactory()} supplies one encoder
 * instance per {@link EncoderType} constant, so every test method below runs
 * once for each encoder type.</p>
 *
 * <p>For each (source, target) pair the test checks that the encoded form can
 * be turned back into the target both by the encoder's own {@code decode}
 * method and by {@code DictionaryLookup.decodeBaseForm}.</p>
 */
public class SequenceEncodersRandomizedTest extends RandomizedTest {
    /** Encoder under test; one instance per parameterized run. */
    private final SequenceEncoders.IEncoder coder;

    /**
     * @param coder the encoder this test instance exercises
     */
    public SequenceEncodersRandomizedTest(@Name("coder") SequenceEncoders.IEncoder coder)
    {
        this.coder = coder;
    }

    /**
     * Builds the constructor argument lists for the parameterized runs.
     *
     * @return one single-element argument array per {@link EncoderType} value,
     *         holding the encoder returned by {@code SequenceEncoders.forType}
     */
    @ParametersFactory
    public static List<Object[]> testFactory() {
        List<Object[]> encoders = Lists.newArrayList();
        for (EncoderType t : EncoderType.values()) {
            encoders.add(new Object [] {SequenceEncoders.forType(t)});
        }
        return encoders;
    }

    /**
     * Round-trips 10000 pairs of random ASCII strings, each between 0 and 500
     * characters long.
     */
    @Test
    public void testEncodeSuffixOnRandomSequences() {
        for (int i = 0; i < 10000; i++) {
            assertRoundtripEncode(
                randomAsciiOfLengthBetween(0, 500),
                randomAsciiOfLengthBetween(0, 500));
        }
    }

    /**
     * Round-trips a fixed set of hand-picked pairs: empty input, shared
     * prefixes, shared suffixes, infix differences and two real word forms.
     */
    @Test
    public void testEncodeSamples() {
        assertRoundtripEncode("", "");
        assertRoundtripEncode("abc", "ab");
        assertRoundtripEncode("abc", "abx");
        assertRoundtripEncode("ab", "abc");
        assertRoundtripEncode("xabc", "abc");
        assertRoundtripEncode("axbc", "abc");
        assertRoundtripEncode("axybc", "abc");
        assertRoundtripEncode("azbc", "abcxy");

        assertRoundtripEncode("Niemcami", "Niemiec");
        assertRoundtripEncode("Niemiec", "Niemcami");
    }

    /**
     * Encodes {@code dstString} relative to {@code srcString} and asserts that
     * both decoding paths reproduce {@code dstString} exactly.
     *
     * <p>On a mismatch the involved sequences are printed to standard output
     * before the assertion fails, so that the failing input is visible in the
     * test log.</p>
     *
     * @param srcString the source sequence the encoding is relative to
     * @param dstString the target sequence that decoding must reproduce
     */
    private void assertRoundtripEncode(String srcString, String dstString)
    {
        ByteArrayList src = ByteArrayList.from(srcString.getBytes(Charsets.UTF_8));
        ByteArrayList dst = ByteArrayList.from(dstString.getBytes(Charsets.UTF_8));
        ByteArrayList encoded = new ByteArrayList();
        ByteArrayList decoded = new ByteArrayList();

        // Path 1: the encoder's own encode/decode pair.
        coder.encode(src, dst, encoded);
        coder.decode(src, encoded, decoded);

        if (!dst.equals(decoded)) {
            System.out.println("src: " + asUtf8String(src));
            System.out.println("dst: " + asUtf8String(dst));
            System.out.println("enc: " + asUtf8String(encoded));
            System.out.println("dec: " + asUtf8String(decoded));
        }

        assertEquals(dst, decoded);

        // Path 2: DictionaryLookup.decodeBaseForm decoding testing
        ByteArrayList decoded2 = decodeWithDictionaryLookup(src, encoded);

        if (!dst.equals(decoded2)) {
            System.out.println("DictionaryLookup.decodeBaseForm incorrect, coder: " + coder);
            System.out.println("src : " + asUtf8String(src));
            System.out.println("dst : " + asUtf8String(dst));
            System.out.println("enc : " + asUtf8String(encoded));
            System.out.println("dec : " + asUtf8String(decoded));
            System.out.println("dec2: " + asUtf8String(decoded2));
        }

        assertEquals(dst, decoded2);
    }

    /**
     * Decodes {@code encoded} against {@code src} using
     * {@code DictionaryLookup.decodeBaseForm}, configured with UTF-8 and the
     * type of the encoder under test.
     *
     * @param src     the source sequence the encoding is relative to
     * @param encoded the encoded form produced by the encoder under test
     * @return the decoded bytes
     */
    private ByteArrayList decodeWithDictionaryLookup(ByteArrayList src, ByteArrayList encoded)
    {
        DictionaryMetadataBuilder builder = new DictionaryMetadataBuilder();
        builder.encoding(Charsets.UTF_8);
        builder.encoder(coder.type());

        // An empty buffer is passed in; the buffer actually holding the result
        // is the one returned by the call.
        ByteBuffer bb = DictionaryLookup.decodeBaseForm(
            ByteBuffer.allocate(0),
            encoded.toArray(),
            encoded.size(),
            ByteBuffer.wrap(src.toArray()), builder.build());

        // NOTE(review): presumably the returned buffer is left positioned after
        // the last written byte, hence the flip before reading -- confirm
        // against DictionaryLookup.decodeBaseForm.
        bb.flip();

        ByteArrayList result = new ByteArrayList();
        while (bb.hasRemaining()) {
            result.add(bb.get());
        }
        return result;
    }

    /** Renders the given bytes as a UTF-8 string for diagnostic output. */
    private static String asUtf8String(ByteArrayList bytes)
    {
        return new String(bytes.toArray(), Charsets.UTF_8);
    }
}