summaryrefslogtreecommitdiff
path: root/morfologik-stemming/src/test
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-stemming/src/test')
-rw-r--r--morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java247
-rw-r--r--morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java49
-rw-r--r--morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java27
-rw-r--r--morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java62
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dictbin0 -> 136 bytes
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info9
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dictbin0 -> 1859 bytes
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info8
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dictbin0 -> 1776 bytes
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info11
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dictbin0 -> 155 bytes
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info10
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt8
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dictbin0 -> 1354192 bytes
-rw-r--r--morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info6
15 files changed, 437 insertions, 0 deletions
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java
new file mode 100644
index 0000000..1fd4e62
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java
@@ -0,0 +1,247 @@
+package morfologik.stemming;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSABuilder;
+import morfologik.fsa.FSAUtils;
+
+import org.junit.Test;
+
+/*
+ *
+ */
+public class DictionaryLookupTest {
+ /* Looks up inflected forms in test-prefix.dict and checks the returned stem/tag pairs; an unknown word must yield nothing. */
+ @Test
+ public void testPrefixDictionaries() throws IOException {
+ final URL url = this.getClass().getResource("test-prefix.dict"); // resource name is resolved relative to this class
+ final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+
+ assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+ stem(s, "Rzeczypospolitej"));
+ assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+ stem(s, "Rzecząpospolitą"));
+
+ // This word is not in the dictionary.
+ assertNoStemFor(s, "martygalski");
+ }
+
+ @Test // Exercises the input-conversion pairs listed in test-prefix.info (presumably loaded alongside the .dict - confirm).
+ public void testInputConversion() throws IOException {
+ final URL url = this.getClass().getResource("test-prefix.dict");
+ final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+
+ assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+ stem(s, "Rzecz\\apospolit\\a")); // each "\\a" is a literal backslash followed by 'a'; the .info maps that pair to "ą"
+
+ assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+ stem(s, "krowa\\apospolit\\a")); // the .info additionally maps "krowa" to "Rzecz"
+ }
+
+ /* Looks up inflected forms in test-infix.dict (its .info sets fsa.dict.uses-infixes=true) and checks the stem/tag pairs. */
+ @Test
+ public void testInfixDictionaries() throws IOException {
+ final URL url = this.getClass().getResource("test-infix.dict");
+ final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+
+ assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+ stem(s, "Rzeczypospolitej"));
+ assertArrayEquals(new String[] { "Rzeczycki", "adj:pl:nom:m" }, stem(s,
+ "Rzeczyccy"));
+ assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+ stem(s, "Rzecząpospolitą"));
+
+ // These words are not in the dictionary.
+ assertNoStemFor(s, "martygalski");
+ assertNoStemFor(s, "Rzeczyckiõh");
+ }
+
+ /* Iterates over the entries of test-infix.dict and checks that a sample of "word stem tag" strings is among them. */
+ @Test
+ public void testWordDataIterator() throws IOException {
+ final URL url = this.getClass().getResource("test-infix.dict");
+ final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+
+ final HashSet<String> entries = new HashSet<String>();
+ for (WordData wd : s) {
+ entries.add(wd.getWord() + " " + wd.getStem() + " " + wd.getTag()); // space-separated triple, matched literally below
+ }
+
+ // Make sure a sample of the entries is present.
+ assertTrue(entries.contains("Rzekunia Rzekuń subst:sg:gen:m"));
+ assertTrue(entries
+ .contains("Rzeczkowskie Rzeczkowski adj:sg:nom.acc.voc:n+adj:pl:acc.nom.voc:f.n"));
+ assertTrue(entries
+ .contains("Rzecząpospolitą Rzeczpospolita subst:irreg"));
+ assertTrue(entries
+ .contains("Rzeczypospolita Rzeczpospolita subst:irreg"));
+ assertTrue(entries
+ .contains("Rzeczypospolitych Rzeczpospolita subst:irreg"));
+ assertTrue(entries
+ .contains("Rzeczyckiej Rzeczycki adj:sg:gen.dat.loc:f"));
+ }
+
+ /* Clones every WordData during one pass, then checks on a second pass that each clone still matches the entry at the same position. */
+ @Test
+ public void testWordDataCloning() throws IOException {
+ final URL url = this.getClass().getResource("test-infix.dict");
+ final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+
+ ArrayList<WordData> words = new ArrayList<WordData>();
+ for (WordData wd : s) {
+ WordData clone = wd.clone(); // NOTE(review): presumably needed because the iterator reuses its WordData - confirm
+ words.add(clone);
+ }
+
+ // Reiterate and verify that we have the same entries.
+ final DictionaryLookup s2 = new DictionaryLookup(Dictionary.read(url));
+ int i = 0;
+ for (WordData wd : s2) {
+ WordData clone = words.get(i++);
+ assertEqualSequences(clone.getStem(), wd.getStem());
+ assertEqualSequences(clone.getTag(), wd.getTag());
+ assertEqualSequences(clone.getWord(), wd.getWord());
+ assertEqualSequences(clone.wordCharSequence, wd.wordCharSequence);
+ }
+
+ // Check collections contract: adding a WordData to a HashSet is expected to throw a RuntimeException.
+ final HashSet<WordData> entries = new HashSet<WordData>();
+ try {
+ entries.add(words.get(0));
+ fail(); // throws AssertionError, which the RuntimeException catch below does not swallow
+ } catch (RuntimeException e) {
+ // Expected.
+ }
+ }
+
+ private void assertEqualSequences(CharSequence s1, CharSequence s2) { // compares the two sequences by their toString() text
+ assertEquals(s1.toString(), s2.toString());
+ }
+
+ /* Looks up words, one of them containing a non-ASCII character, in test-diacritics-utf8.dict (its .info declares UTF-8 encoding). */
+ @Test
+ public void testMultibyteEncodingUTF8() throws IOException {
+ final URL url = this.getClass().getResource("test-diacritics-utf8.dict");
+ Dictionary read = Dictionary.read(url);
+ final IStemmer s = new DictionaryLookup(read);
+
+ for (byte[] ba : FSAUtils.rightLanguage(read.fsa, read.fsa.getRootNode())) { // every byte sequence returned for the root node
+ System.out.println(new String(ba, "UTF-8")); // diagnostic output only; nothing is asserted on it
+ }
+
+ assertArrayEquals(new String[] { "merge", "001" }, stem(s, "mergeam"));
+ assertArrayEquals(new String[] { "merge", "002" }, stem(s, "merseserăm"));
+ }
+
+ /* Looks up "base|tag" keys in test-synth.dict; a hit is expected to carry a word form as its stem and a null tag. */
+ @Test
+ public void testSynthesis() throws IOException {
+ final URL url = this.getClass().getResource("test-synth.dict");
+ final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+
+ assertArrayEquals(new String[] { "miała", null }, stem(s,
+ "mieć|verb:praet:sg:ter:f:?perf"));
+ assertArrayEquals(new String[] { "a", null }, stem(s, "a|conj"));
+ assertArrayEquals(new String[] {}, stem(s, "dziecko|subst:sg:dat:n")); // this key is expected to produce no results
+
+ // This word is not in the dictionary.
+ assertNoStemFor(s, "martygalski");
+ }
+
+ /* Rebuilds "word stem tag" strings from every entry of test-separators.dict and checks them in sorted order. */
+ @Test
+ public void testInputWithSeparators() throws IOException {
+ final URL url = this.getClass().getResource("test-separators.dict");
+ final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+
+ /*
+ * Attempt to reconstruct input sequences using WordData iterator.
+ */
+ ArrayList<String> sequences = new ArrayList<String>();
+ for (WordData wd : s) {
+ sequences.add("" + wd.getWord() + " " + wd.getStem() + " "
+ + wd.getTag()); // a null stem or tag is concatenated as the text "null"
+ }
+ Collections.sort(sequences); // fixed order for the index-based assertions below
+
+ assertEquals("token1 null null", sequences.get(0));
+ assertEquals("token2 null null", sequences.get(1));
+ assertEquals("token3 null +", sequences.get(2));
+ assertEquals("token4 token2 null", sequences.get(3));
+ assertEquals("token5 token2 null", sequences.get(4));
+ assertEquals("token6 token2 +", sequences.get(5));
+ assertEquals("token7 token2 token3+", sequences.get(6));
+ assertEquals("token8 token2 token3++", sequences.get(7));
+ }
+
+ /* Builds a two-entry dictionary in memory and checks that a lookup term containing the separator returns no results. */
+ @Test
+ public void testSeparatorInLookupTerm() throws IOException {
+ FSA fsa = FSABuilder.build(toBytes("iso8859-1", new String [] {
+ "l+A+LW",
+ "l+A+NN1d",
+ }));
+
+ DictionaryMetadata metadata = new DictionaryMetadataBuilder()
+ .separator('+')
+ .encoding("iso8859-1")
+ .encoder(EncoderType.INFIX)
+ .build();
+
+ final DictionaryLookup s = new DictionaryLookup(new Dictionary(fsa, metadata));
+ assertEquals(0, s.lookup("l+A").size()); // "l+A" includes the '+' separator configured above
+ }
+
+ /* Checks that getSeparatorChar() reports '+', the separator declared in test-separators.info. */
+ @Test
+ public void testGetSeparator() throws IOException {
+ final URL url = this.getClass().getResource("test-separators.dict");
+ final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+ assertEquals('+', s.getSeparatorChar());
+ }
+
+ private static byte[][] toBytes(String charset, String[] strings) { // encodes each string with the named charset, one byte[] per input
+ byte [][] out = new byte [strings.length][];
+ for (int i = 0; i < strings.length; i++) {
+ try {
+ out[i] = strings[i].getBytes(charset);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e); // unsupported charset name: rethrown unchecked with the cause attached
+ }
+ }
+ return out;
+ }
+
+ /* Null-safe toString(): returns null for a null sequence, otherwise s.toString(). */
+ public static String asString(CharSequence s) {
+ if (s == null)
+ return null;
+ return s.toString();
+ }
+
+ /* Looks up word and flattens the results into { stem0, tag0, stem1, tag1, ... }; a null stem or tag stays null. */
+ public static String[] stem(IStemmer s, String word) {
+ ArrayList<String> result = new ArrayList<String>();
+ for (WordData wd : s.lookup(word)) {
+ result.add(asString(wd.getStem()));
+ result.add(asString(wd.getTag()));
+ }
+ return result.toArray(new String[result.size()]);
+ }
+
+ /* Asserts that looking up word produces no stem/tag pairs at all. */
+ public static void assertNoStemFor(IStemmer s, String word) {
+ assertArrayEquals(new String[] {}, stem(s, word));
+ }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java
new file mode 100644
index 0000000..32e7fc7
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java
@@ -0,0 +1,49 @@
+package morfologik.stemming;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.fest.assertions.api.Assertions;
+import org.junit.Test;
+
+/*
+ * Tests for DictionaryMetadataBuilder.
+ */
+public class DictionaryMetadataBuilderTest {
+ /* Calls the builder's methods, then asserts that the resulting key set covers every DictionaryAttribute constant. */
+ @Test
+ public void testAllConstantsHaveBuilderMethods() throws IOException {
+ Set<DictionaryAttribute> keySet = new DictionaryMetadataBuilder()
+ .convertCase()
+ .encoding(Charset.defaultCharset())
+ .encoding("UTF-8")
+ .frequencyIncluded()
+ .ignoreAllUppercase()
+ .ignoreCamelCase()
+ .ignoreDiacritics()
+ .ignoreNumbers()
+ .ignorePunctuation()
+ .separator('+')
+ .supportRunOnWords()
+ .encoder(EncoderType.SUFFIX)
+ .withEquivalentChars(Collections.<Character,List<Character>>emptyMap())
+ .withReplacementPairs(Collections.<String,List<String>>emptyMap())
+ .withInputConversionPairs(Collections.<String,String>emptyMap())
+ .withOutputConversionPairs(Collections.<String,String>emptyMap())
+ .locale(Locale.getDefault())
+ .license("")
+ .author("")
+ .creationDate("")
+ .toMap().keySet();
+
+ Set<DictionaryAttribute> all = EnumSet.allOf(DictionaryAttribute.class);
+ all.removeAll(keySet); // whatever remains was not produced by any builder call above
+
+ Assertions.assertThat(all).isEmpty();
+ }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java
new file mode 100644
index 0000000..13c61d7
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java
@@ -0,0 +1,27 @@
+package morfologik.stemming;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Test;
+
+/*
+ * Tests for Dictionary.convertText.
+ */
+public class DictionaryTest {
+ /* Applies a map of replacement pairs to several inputs and checks the converted text. */
+
+ @Test
+ public void testConvertText() {
+ Map<String, String> conversion = new HashMap<String, String>();
+ conversion.put("'", "`");
+ conversion.put("fi", "fi"); // NOTE(review): identity pair as shown; the key may originally have been a ligature - confirm
+ conversion.put("\\a", "ą"); // key is a literal backslash followed by 'a'
+ conversion.put("Barack", "George");
+ assertEquals("filut", Dictionary.convertText("filut", conversion));
+ assertEquals("fizdrygałką", Dictionary.convertText("fizdrygałk\\a", conversion));
+ assertEquals("George Bush", Dictionary.convertText("Barack Bush", conversion));
+ }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java
new file mode 100644
index 0000000..e8c6c17
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java
@@ -0,0 +1,62 @@
+package morfologik.stemming;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import com.carrotsearch.junitbenchmarks.AbstractBenchmark;
+import com.carrotsearch.junitbenchmarks.BenchmarkOptions;
+
+@BenchmarkOptions(callgc = false, warmupRounds = 5, benchmarkRounds = 20)
+@Ignore // the whole class is skipped by JUnit unless this annotation is removed
+public class StringDecoderBenchmarkTest extends AbstractBenchmark {
+ /* Guard against escape analysis and HotSpot opts. */
+ public volatile int guard;
+
+ private final int sequences = 1000000; // loop iterations per benchmark method
+
+ final String input = "dbaoidbhoei";
+ final CharBuffer chars = CharBuffer.allocate(100); // reused across iterations by charsetEncoder()
+ final ByteBuffer bytes = ByteBuffer.allocate(100); // reused across iterations by charsetEncoder()
+ final CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder();
+
+ /**
+ * This is a simple comparison of performance converting a string to bytes
+ * using String.getBytes and CharsetEncoder (which String.getBytes uses
+ * internally in SUN's JDK).
+ */
+ @Test
+ public void stringGetBytes() throws Exception {
+ int guard = 0; // local accumulator; shadows the volatile field until the final store
+ for (int i = 0; i < sequences; i++) {
+ guard += input.getBytes("UTF-8").length;
+ }
+ this.guard = guard;
+ }
+
+ @Test
+ public void charsetEncoder() throws Exception { // same conversion, using the shared encoder and preallocated buffers
+ int guard = 0;
+ for (int i = 0; i < sequences; i++) {
+ chars.clear();
+ for (int j = 0; j < input.length(); j++) {
+ chars.put(input.charAt(j));
+ }
+ chars.flip();
+
+ bytes.clear();
+ chars.mark(); // remember the start position before encode() advances it
+ encoder.encode(chars, bytes, true);
+ bytes.flip();
+ chars.reset(); // back to the mark, so remaining() below counts the full input again
+
+ guard += chars.remaining();
+ }
+
+ this.guard = guard;
+ }
+}
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict
new file mode 100644
index 0000000..2a62f21
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict
Binary files differ
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info
new file mode 100644
index 0000000..058aed2
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info
@@ -0,0 +1,9 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=UTF-8
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict
new file mode 100644
index 0000000..cc91f70
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict
Binary files differ
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info
new file mode 100644
index 0000000..535fac3
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info
@@ -0,0 +1,8 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-infixes=true
\ No newline at end of file
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict
new file mode 100644
index 0000000..d0bed4c
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict
Binary files differ
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info
new file mode 100644
index 0000000..520739e
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info
@@ -0,0 +1,11 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-prefixes=true
+fsa.dict.uses-infixes=false
+
+fsa.dict.input-conversion=\\a ą, krowa Rzecz
\ No newline at end of file
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict
new file mode 100644
index 0000000..a71b9e7
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict
Binary files differ
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info
new file mode 100644
index 0000000..8ec14c3
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info
@@ -0,0 +1,10 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso8859-1
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
+fsa.dict.uses-suffixes=false
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt
new file mode 100644
index 0000000..cd77945
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt
@@ -0,0 +1,8 @@
+token1+
+token2++
+token3+++
+token4+token2
+token5+token2+
+token6+token2++
+token7+token2+token3+
+token8+token2+token3++
\ No newline at end of file
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict
new file mode 100644
index 0000000..6890253
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict
Binary files differ
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info
new file mode 100644
index 0000000..ffce33e
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info
@@ -0,0 +1,6 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
\ No newline at end of file