15 files changed, 437 insertions, 0 deletions
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java
new file mode 100644
index 0000000..1fd4e62
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java
@@ -0,0 +1,247 @@
+package morfologik.stemming;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSABuilder;
+import morfologik.fsa.FSAUtils;
+
+import org.junit.Test;
+
+/*
+ * Tests for DictionaryLookup: stemming, iteration over entries and separator handling.
+ */
+public class DictionaryLookupTest {
+  /* Stems inflected forms via test-prefix.dict and checks the returned stem/tag pairs. */
+  @Test
+  public void testPrefixDictionaries() throws IOException {
+    final URL url = this.getClass().getResource("test-prefix.dict");
+    final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+
+    assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+        stem(s, "Rzeczypospolitej"));
+    assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+        stem(s, "Rzecząpospolitą"));
+
+    // This word is not in the dictionary.
+    assertNoStemFor(s, "martygalski");
+  }
+
+  @Test
+  public void testInputConversion() throws IOException {
+    final URL url = this.getClass().getResource("test-prefix.dict");
+    final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+    // Inputs below match the fsa.dict.input-conversion pairs in test-prefix.info (\a and krowa).
+    assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+        stem(s, "Rzecz\\apospolit\\a"));
+
+    assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+        stem(s, "krowa\\apospolit\\a"));
+  }
+
+  /* Stems inflected forms via test-infix.dict, including words that must yield no stem. */
+  @Test
+  public void testInfixDictionaries() throws IOException {
+    final URL url = this.getClass().getResource("test-infix.dict");
+    final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+
+    assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+        stem(s, "Rzeczypospolitej"));
+    assertArrayEquals(new String[] { "Rzeczycki", "adj:pl:nom:m" }, stem(s,
+        "Rzeczyccy"));
+    assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" },
+        stem(s, "Rzecząpospolitą"));
+
+    // These words are not in the dictionary.
+    assertNoStemFor(s, "martygalski");
+    assertNoStemFor(s, "Rzeczyckiõh");
+  }
+
+  /* Iterates over all WordData entries of test-infix.dict and checks a sample of them. */
+  @Test
+  public void testWordDataIterator() throws IOException {
+    final URL url = this.getClass().getResource("test-infix.dict");
+    final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+
+    final HashSet<String> entries = new HashSet<String>();
+    for (WordData wd : s) {
+      entries.add(wd.getWord() + " " + wd.getStem() + " " + wd.getTag());
+    }
+
+    // Make sure a sample of the "word stem tag" entries is present.
+    assertTrue(entries.contains("Rzekunia Rzekuń subst:sg:gen:m"));
+    assertTrue(entries
+        .contains("Rzeczkowskie Rzeczkowski adj:sg:nom.acc.voc:n+adj:pl:acc.nom.voc:f.n"));
+    assertTrue(entries
+        .contains("Rzecząpospolitą Rzeczpospolita subst:irreg"));
+    assertTrue(entries
+        .contains("Rzeczypospolita Rzeczpospolita subst:irreg"));
+    assertTrue(entries
+        .contains("Rzeczypospolitych Rzeczpospolita subst:irreg"));
+    assertTrue(entries
+        .contains("Rzeczyckiej Rzeczycki adj:sg:gen.dat.loc:f"));
+  }
+
+  /* Checks that clones taken during iteration match a second iteration, and that a HashSet rejects WordData. */
+  @Test
+  public void testWordDataCloning() throws IOException {
+    final URL url = this.getClass().getResource("test-infix.dict");
+    final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+
+    ArrayList<WordData> words = new ArrayList<WordData>();
+    for (WordData wd : s) {
+      WordData clone = wd.clone();
+      words.add(clone);
+    }
+
+    // Reiterate and verify that we have the same entries, in the same order.
+    final DictionaryLookup s2 = new DictionaryLookup(Dictionary.read(url));
+    int i = 0; // NOTE(review): never compared with words.size() after the loop.
+    for (WordData wd : s2) {
+      WordData clone = words.get(i++);
+      assertEqualSequences(clone.getStem(), wd.getStem());
+      assertEqualSequences(clone.getTag(), wd.getTag());
+      assertEqualSequences(clone.getWord(), wd.getWord());
+      assertEqualSequences(clone.wordCharSequence, wd.wordCharSequence);
+    }
+
+    // Check collections contract: adding a WordData to a HashSet is expected to throw.
+    final HashSet<WordData> entries = new HashSet<WordData>();
+    try {
+      entries.add(words.get(0));
+      fail();
+    } catch (RuntimeException e) {
+      // Expected.
+    }
+  }
+
+  private void assertEqualSequences(CharSequence s1, CharSequence s2) {
+    assertEquals(s1.toString(), s2.toString()); // compare the text, not the CharSequence objects
+  }
+
+  /* Stems words through test-diacritics-utf8.dict, whose metadata declares the UTF-8 encoding. */
+  @Test
+  public void testMultibyteEncodingUTF8() throws IOException {
+    final URL url = this.getClass().getResource("test-diacritics-utf8.dict");
+    Dictionary read = Dictionary.read(url);
+    final IStemmer s = new DictionaryLookup(read);
+    // NOTE(review): the loop below only prints what FSAUtils.rightLanguage returns; nothing is asserted.
+    for (byte[] ba : FSAUtils.rightLanguage(read.fsa, read.fsa.getRootNode())) {
+      System.out.println(new String(ba, "UTF-8"));
+    }
+
+    assertArrayEquals(new String[] { "merge", "001" }, stem(s, "mergeam"));
+    assertArrayEquals(new String[] { "merge", "002" }, stem(s, "merseserăm"));
+  }
+
+  /* Looks up "word|tag" keys in test-synth.dict and checks the returned stem/tag pairs. */
+  @Test
+  public void testSynthesis() throws IOException {
+    final URL url = this.getClass().getResource("test-synth.dict");
+    final IStemmer s = new DictionaryLookup(Dictionary.read(url));
+
+    assertArrayEquals(new String[] { "miała", null }, stem(s,
+        "mieć|verb:praet:sg:ter:f:?perf"));
+    assertArrayEquals(new String[] { "a", null }, stem(s, "a|conj"));
+    assertArrayEquals(new String[] {}, stem(s, "dziecko|subst:sg:dat:n"));
+
+    // This word is not in the dictionary.
+    assertNoStemFor(s, "martygalski");
+  }
+
+  /* Checks word/stem/tag decoding for entries with empty fields and extra separator characters. */
+  @Test
+  public void testInputWithSeparators() throws IOException {
+    final URL url = this.getClass().getResource("test-separators.dict");
+    final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+
+    /*
+     * Attempt to reconstruct input sequences using the WordData iterator.
+     */
+    ArrayList<String> sequences = new ArrayList<String>();
+    for (WordData wd : s) {
+      sequences.add("" + wd.getWord() + " " + wd.getStem() + " "
+          + wd.getTag());
+    }
+    Collections.sort(sequences);
+
+    assertEquals("token1 null null", sequences.get(0));
+    assertEquals("token2 null null", sequences.get(1));
+    assertEquals("token3 null +", sequences.get(2));
+    assertEquals("token4 token2 null", sequences.get(3));
+    assertEquals("token5 token2 null", sequences.get(4));
+    assertEquals("token6 token2 +", sequences.get(5));
+    assertEquals("token7 token2 token3+", sequences.get(6));
+    assertEquals("token8 token2 token3++", sequences.get(7));
+  }
+
+  /* A lookup term that itself contains the separator character must return no results. */
+  @Test
+  public void testSeparatorInLookupTerm() throws IOException {
+    FSA fsa = FSABuilder.build(toBytes("iso8859-1", new String [] {
+        "l+A+LW",
+        "l+A+NN1d",
+    }));
+
+    DictionaryMetadata metadata = new DictionaryMetadataBuilder()
+    .separator('+')
+    .encoding("iso8859-1")
+    .encoder(EncoderType.INFIX)
+    .build();
+
+    final DictionaryLookup s = new DictionaryLookup(new Dictionary(fsa, metadata));
+    assertEquals(0, s.lookup("l+A").size());
+  }
+
+  /* The separator character reported for test-separators.dict is expected to be '+'. */
+  @Test
+  public void testGetSeparator() throws IOException {
+    final URL url = this.getClass().getResource("test-separators.dict");
+    final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+    assertEquals('+', s.getSeparatorChar());
+  }
+
+  private static byte[][] toBytes(String charset, String[] strings) {
+    byte [][] out = new byte [strings.length][];
+    for (int i = 0; i < strings.length; i++) {
+      try {
+        out[i] = strings[i].getBytes(charset); // encode each string with the named charset
+      } catch (UnsupportedEncodingException e) {
+        throw new RuntimeException(e); // unknown charset name: rethrow unchecked, keeping the cause
+      }
+    }
+    return out;
+  }
+
+  /* Null-safe CharSequence to String conversion: returns null for a null argument. */
+  public static String asString(CharSequence s) {
+    if (s == null)
+      return null;
+    return s.toString();
+  }
+
+  /* Looks up a word and flattens the results into an array of [stem, tag, stem, tag, ...]. */
+  public static String[] stem(IStemmer s, String word) {
+    ArrayList<String> result = new ArrayList<String>();
+    for (WordData wd : s.lookup(word)) {
+      result.add(asString(wd.getStem()));
+      result.add(asString(wd.getTag()));
+    }
+    return result.toArray(new String[result.size()]);
+  }
+
+  /* Asserts that looking up the word yields no stem/tag results at all. */
+  public static void assertNoStemFor(IStemmer s, String word) {
+    assertArrayEquals(new String[] {}, stem(s, word));
+  }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java
new file mode 100644
index 0000000..32e7fc7
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java
@@ -0,0 +1,49 @@
+package morfologik.stemming;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.fest.assertions.api.Assertions;
+import org.junit.Test;
+
+/*
+ * Tests for DictionaryMetadataBuilder.
+ */
+public class DictionaryMetadataBuilderTest {
+  /* Calls the builder's methods and checks that every DictionaryAttribute constant ends up in toMap(). */
+  @Test
+  public void testAllConstantsHaveBuilderMethods() throws IOException {
+    Set<DictionaryAttribute> keySet = new DictionaryMetadataBuilder()
+    .convertCase()
+    .encoding(Charset.defaultCharset())
+    .encoding("UTF-8")
+    .frequencyIncluded()
+    .ignoreAllUppercase()
+    .ignoreCamelCase()
+    .ignoreDiacritics()
+    .ignoreNumbers()
+    .ignorePunctuation()
+    .separator('+')
+    .supportRunOnWords()
+    .encoder(EncoderType.SUFFIX)
+    .withEquivalentChars(Collections.<Character,List<Character>>emptyMap())
+    .withReplacementPairs(Collections.<String,List<String>>emptyMap())
+    .withInputConversionPairs(Collections.<String,String>emptyMap())
+    .withOutputConversionPairs(Collections.<String,String>emptyMap())
+    .locale(Locale.getDefault())
+    .license("")
+    .author("")
+    .creationDate("")
+    .toMap().keySet();
+
+    Set<DictionaryAttribute> all = EnumSet.allOf(DictionaryAttribute.class);
+    all.removeAll(keySet); // whatever remains is an attribute no builder call above has set
+
+    Assertions.assertThat(all).isEmpty();
+  }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java
new file mode 100644
index 0000000..13c61d7
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java
@@ -0,0 +1,27 @@
+package morfologik.stemming;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Test;
+
+/*
+ * Tests for static helpers of Dictionary.
+ */
+public class DictionaryTest {
+  /* Checks Dictionary.convertText against a small map of replacement pairs. */
+
+  @Test
+  public void testConvertText() {
+    Map<String, String> conversion = new HashMap<String, String>();
+    conversion.put("'", "`");
+    conversion.put("fi", "ﬁ");
+    conversion.put("\\a", "ą");
+    conversion.put("Barack", "George");
+    assertEquals("ﬁlut", Dictionary.convertText("filut", conversion));
+    assertEquals("ﬁzdrygałką", Dictionary.convertText("fizdrygałk\\a", conversion));
+    assertEquals("George Bush", Dictionary.convertText("Barack Bush", conversion));
+  }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java
new file mode 100644
index 0000000..e8c6c17
--- /dev/null
+++ b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java
@@ -0,0 +1,62 @@
+package morfologik.stemming;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import com.carrotsearch.junitbenchmarks.AbstractBenchmark;
+import com.carrotsearch.junitbenchmarks.BenchmarkOptions;
+
+@BenchmarkOptions(callgc = false, warmupRounds = 5, benchmarkRounds = 20)
+@Ignore
+public class StringDecoderBenchmarkTest extends AbstractBenchmark {
+	/* Guard against escape analysis and HotSpot opts. */
+	public volatile int guard;
+
+	private final int sequences = 1000000; // loop iterations per benchmark method call
+
+	final String input = "dbaoidbhoei";
+	final CharBuffer chars = CharBuffer.allocate(100);
+	final ByteBuffer bytes = ByteBuffer.allocate(100);
+	final CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder();
+
+	/**
+	 * This is a simple comparison of performance converting a string to bytes
+	 * using String.getBytes (this method) and a reused CharsetEncoder (see
+	 * charsetEncoder(); String.getBytes uses an encoder internally in SUN's JDK).
+	 */
+	@Test
+	public void stringGetBytes() throws Exception {
+		int guard = 0;
+		for (int i = 0; i < sequences; i++) {
+			guard += input.getBytes("UTF-8").length;
+		}
+		this.guard = guard;
+	}
+
+	@Test
+	public void charsetEncoder() throws Exception {
+		int guard = 0;
+		for (int i = 0; i < sequences; i++) {
+			chars.clear();
+			for (int j = 0; j < input.length(); j++) {
+				chars.put(input.charAt(j));
+			}
+			chars.flip();
+
+			bytes.clear();
+			encoder.encode(chars, bytes, true);
+			bytes.flip();
+
+			// Fold the encoded byte count into the guard, as stringGetBytes() does,
+			// so that the guard actually depends on the encoder's output.
+			guard += bytes.remaining();
+		}
+
+		this.guard = guard;
+	}
+}
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict
new file mode 100644
index 0000000..2a62f21
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info
new file mode 100644
index 0000000..058aed2
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info
@@ -0,0 +1,9 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=UTF-8
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict
new file mode 100644
index 0000000..cc91f70
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info
new file mode 100644
index 0000000..535fac3
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info
@@ -0,0 +1,8 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-infixes=true
+\ No newline at end of file
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict
new file mode 100644
index 0000000..d0bed4c
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info
new file mode 100644
index 0000000..520739e
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info
@@ -0,0 +1,11 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-prefixes=true
+fsa.dict.uses-infixes=false
+
+fsa.dict.input-conversion=\\a ą, krowa Rzecz
+\ No newline at end of file
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict
new file mode 100644
index 0000000..a71b9e7
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info
new file mode 100644
index 0000000..8ec14c3
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info
@@ -0,0 +1,10 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso8859-1
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
+fsa.dict.uses-suffixes=false
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt
new file mode 100644
index 0000000..cd77945
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt
@@ -0,0 +1,8 @@
+token1+
+token2++
+token3+++
+token4+token2
+token5+token2+
+token6+token2++
+token7+token2+token3+
+token8+token2+token3++
+\ No newline at end of file
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict
new file mode 100644
index 0000000..6890253
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info
new file mode 100644
index 0000000..ffce33e
--- /dev/null
+++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info
@@ -0,0 +1,6 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+\ No newline at end of file