src-test/morfologik/stemming/PolishStemmerTest.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

package morfologik.stemming;

import static morfologik.stemming.DictionaryLookupTest.assertNoStemFor;
import static morfologik.stemming.DictionaryLookupTest.stem;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.HashSet;
import java.util.TreeSet;

import org.junit.Ignore;
import org.junit.Test;

/**
 * JUnit tests for {@link PolishStemmer}: spot-checks of stem and tag lookups
 * for a few inflected Polish word forms, plus a manually-run helper that
 * prints every distinct tag encountered while iterating over the stemmer.
 */
public class PolishStemmerTest {
	/**
	 * Verifies lookups through {@code DictionaryLookupTest.stem}: the first
	 * element of the returned array is compared against the expected base
	 * form, and the whole array against the expected {base form, tag} pair.
	 * Also verifies that a word absent from the dictionary yields no stem.
	 *
	 * @throws IOException
	 *             declared by the signature; presumably raised when the
	 *             dictionary cannot be read (TODO confirm against
	 *             {@link PolishStemmer}).
	 */
	@Test
	public void testLexemes() throws IOException {
		final PolishStemmer stemmer = new PolishStemmer();

		// Base form only: element 0 of the lookup result.
		final String[] nounLookup = stem(stemmer, "żywotopisarstwie");
		assertEquals("żywotopisarstwo", nounLookup[0]);

		final String[] verbLookup = stem(stemmer, "abradowałoby");
		assertEquals("abradować", verbLookup[0]);

		// Full result: base form followed by its tag.
		final String[] expectedNoun = { "żywotopisarstwo", "subst:sg:loc:n" };
		assertArrayEquals(expectedNoun, stem(stemmer, "żywotopisarstwie"));

		final String[] expectedPlural = { "bazia", "subst:pl:inst:f" };
		assertArrayEquals(expectedPlural, stem(stemmer, "baziami"));

		// This word is not in the dictionary.
		assertNoStemFor(stemmer, "martygalski");
	}

	/**
	 * Diagnostic helper rather than a real test (hence {@code @Ignore}):
	 * iterates over every {@link WordData} entry exposed by a fresh
	 * {@link PolishStemmer}, reports entries whose tag is {@code null} on
	 * standard error, and finally prints the distinct tags in sorted order
	 * on standard output, one per line.
	 *
	 * @throws IOException
	 *             declared by the signature; presumably raised when the
	 *             dictionary cannot be read (TODO confirm against
	 *             {@link PolishStemmer}).
	 */
	@Test
	@Ignore
	public void listUniqueTags() throws IOException {
		// A sorted set both de-duplicates the tags and fixes the print order.
		final TreeSet<String> uniqueTags = new TreeSet<String>();

		for (WordData entry : new PolishStemmer()) {
			final CharSequence tag = entry.getTag();
			if (tag != null) {
				uniqueTags.add(tag.toString());
			} else {
				System.err.println("Missing tag for: " + entry.getWord());
			}
		}

		for (String uniqueTag : uniqueTags) {
			System.out.println(uniqueTag);
		}
	}
}