diff options
Diffstat (limited to 'morfologik-stemming/src/test')
15 files changed, 437 insertions, 0 deletions
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java new file mode 100644 index 0000000..1fd4e62 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java @@ -0,0 +1,247 @@ +package morfologik.stemming;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSABuilder;
+import morfologik.fsa.FSAUtils;
+
+import org.junit.Test;
+
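+/*
+ * Tests for DictionaryLookup: stemming against the bundled test dictionaries,
+ * iteration over dictionary entries and handling of the separator character.
+ */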
+public class DictionaryLookupTest {
+ /* Stemming against the prefix-encoded test dictionary (test-prefix.dict). */
+ @Test
+ public void testPrefixDictionaries() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-prefix.dict");
+ final IStemmer stemmer = new DictionaryLookup(Dictionary.read(dictionaryUrl));
+
+ // Both inflected forms are expected to map to the same stem/tag pair.
+ final String[] expected = { "Rzeczpospolita", "subst:irreg" };
+ assertArrayEquals(expected, stem(stemmer, "Rzeczypospolitej"));
+ assertArrayEquals(expected, stem(stemmer, "Rzecząpospolitą"));
+
+ // A word that is not in the dictionary must produce no stems.
+ assertNoStemFor(stemmer, "martygalski");
+ }
+
+ /*
+ * Lookup terms written with the replaceable fragments ("\a", "krowa") are
+ * expected to resolve like their converted forms; the conversion pairs are
+ * declared in test-prefix.info (fsa.dict.input-conversion).
+ */
+ @Test
+ public void testInputConversion() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-prefix.dict");
+ final IStemmer stemmer = new DictionaryLookup(Dictionary.read(dictionaryUrl));
+ final String[] expected = { "Rzeczpospolita", "subst:irreg" };
+
+ assertArrayEquals(expected, stem(stemmer, "Rzecz\\apospolit\\a"));
+
+ assertArrayEquals(expected, stem(stemmer, "krowa\\apospolit\\a"));
+ }
+
+ /* Stemming against the infix-encoded test dictionary (test-infix.dict). */
+ @Test
+ public void testInfixDictionaries() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-infix.dict");
+ final IStemmer stemmer = new DictionaryLookup(Dictionary.read(dictionaryUrl));
+
+ final String[] nounEntry = { "Rzeczpospolita", "subst:irreg" };
+ final String[] adjectiveEntry = { "Rzeczycki", "adj:pl:nom:m" };
+
+ assertArrayEquals(nounEntry, stem(stemmer, "Rzeczypospolitej"));
+ assertArrayEquals(adjectiveEntry, stem(stemmer, "Rzeczyccy"));
+ assertArrayEquals(nounEntry, stem(stemmer, "Rzecząpospolitą"));
+
+ // Neither of these words is expected to produce a stem.
+ assertNoStemFor(stemmer, "martygalski");
+ assertNoStemFor(stemmer, "Rzeczyckiõh");
+ }
+
+ /* Iterating a DictionaryLookup must expose the dictionary's (word, stem, tag) entries. */
+ @Test
+ public void testWordDataIterator() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-infix.dict");
+ final DictionaryLookup lookup = new DictionaryLookup(Dictionary.read(dictionaryUrl));
+
+ // Flatten every entry to "word stem tag" so membership checks stay simple.
+ final HashSet<String> seen = new HashSet<String>();
+ for (WordData entry : lookup) {
+ seen.add(entry.getWord() + " " + entry.getStem() + " " + entry.getTag());
+ }
+
+ // Spot-check a sample of entries rather than the whole dictionary.
+ final String[] sample = {
+ "Rzekunia Rzekuń subst:sg:gen:m",
+ "Rzeczkowskie Rzeczkowski adj:sg:nom.acc.voc:n+adj:pl:acc.nom.voc:f.n",
+ "Rzecząpospolitą Rzeczpospolita subst:irreg",
+ "Rzeczypospolita Rzeczpospolita subst:irreg",
+ "Rzeczypospolitych Rzeczpospolita subst:irreg",
+ "Rzeczyckiej Rzeczycki adj:sg:gen.dat.loc:f",
+ };
+ for (String expectedEntry : sample) {
+ assertTrue(seen.contains(expectedEntry));
+ }
+ }
+
+ /*
+ * Clones collected during one pass over the dictionary must match, entry by
+ * entry, what a second, independent pass produces. Also checks that adding a
+ * WordData to a HashSet is rejected with a RuntimeException.
+ */
+ @Test
+ public void testWordDataCloning() throws IOException {
+ final URL url = this.getClass().getResource("test-infix.dict");
+ final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url));
+
+ // First pass: keep a clone of every entry.
+ ArrayList<WordData> words = new ArrayList<WordData>();
+ for (WordData wd : s) {
+ WordData clone = wd.clone();
+ words.add(clone);
+ }
+
+ // Reiterate and verify that we have the same entries.
+ final DictionaryLookup s2 = new DictionaryLookup(Dictionary.read(url));
+ int i = 0;
+ for (WordData wd : s2) {
+ WordData clone = words.get(i++);
+ assertEqualSequences(clone.getStem(), wd.getStem());
+ assertEqualSequences(clone.getTag(), wd.getTag());
+ assertEqualSequences(clone.getWord(), wd.getWord());
+ assertEqualSequences(clone.wordCharSequence, wd.wordCharSequence);
+ }
+
+ // A second pass that ends early would otherwise go unnoticed.
+ assertEquals(words.size(), i);
+
+ // Fetch the sample entry outside the try block: on an empty list get(0)
+ // throws IndexOutOfBoundsException, which the catch below would otherwise
+ // mistake for the expected failure.
+ final WordData first = words.get(0);
+
+ // Check collections contract.
+ final HashSet<WordData> entries = new HashSet<WordData>();
+ try {
+ entries.add(first);
+ // fail() raises an AssertionError, which the catch below does not intercept.
+ fail();
+ } catch (RuntimeException e) {
+ // Expected.
+ }
+ }
+
+ /* Asserts that two character sequences have identical string content. */
+ private void assertEqualSequences(CharSequence s1, CharSequence s2) {
+ final String expected = s1.toString();
+ final String actual = s2.toString();
+ assertEquals(expected, actual);
+ }
+
+ /* Lookups against a UTF-8 encoded dictionary, including a word with a non-ASCII character. */
+ @Test
+ public void testMultibyteEncodingUTF8() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-diacritics-utf8.dict");
+ final Dictionary dictionary = Dictionary.read(dictionaryUrl);
+ final IStemmer stemmer = new DictionaryLookup(dictionary);
+
+ // Print each sequence FSAUtils.rightLanguage returns for the root node, decoded as UTF-8.
+ for (byte[] sequence : FSAUtils.rightLanguage(dictionary.fsa, dictionary.fsa.getRootNode())) {
+ final String decoded = new String(sequence, "UTF-8");
+ System.out.println(decoded);
+ }
+
+ final String[] firstForm = { "merge", "001" };
+ final String[] secondForm = { "merge", "002" };
+ assertArrayEquals(firstForm, stem(stemmer, "mergeam"));
+ assertArrayEquals(secondForm, stem(stemmer, "merseserăm"));
+ }
+
+ /*
+ * Lookups against test-synth.dict use "lemma|tag" keys; a match carries the
+ * surface form as its stem and a null tag.
+ */
+ @Test
+ public void testSynthesis() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-synth.dict");
+ final IStemmer stemmer = new DictionaryLookup(Dictionary.read(dictionaryUrl));
+
+ final String[] feminineForm = { "miała", null };
+ final String[] conjunction = { "a", null };
+
+ assertArrayEquals(feminineForm, stem(stemmer, "mieć|verb:praet:sg:ter:f:?perf"));
+ assertArrayEquals(conjunction, stem(stemmer, "a|conj"));
+ // This key is expected to yield nothing.
+ assertNoStemFor(stemmer, "dziecko|subst:sg:dat:n");
+
+ // A plain word that is not in the dictionary yields nothing either.
+ assertNoStemFor(stemmer, "martygalski");
+ }
+
+ /*
+ * Rebuilds "word stem tag" strings from the iterator over test-separators.dict
+ * and compares them, in sorted order, with the expected values.
+ */
+ @Test
+ public void testInputWithSeparators() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-separators.dict");
+ final DictionaryLookup lookup = new DictionaryLookup(Dictionary.read(dictionaryUrl));
+
+ // Attempt to reconstruct the input sequences using the WordData iterator.
+ final ArrayList<String> rebuilt = new ArrayList<String>();
+ for (WordData entry : lookup) {
+ rebuilt.add(entry.getWord() + " " + entry.getStem() + " " + entry.getTag());
+ }
+ Collections.sort(rebuilt);
+
+ // Missing stems and tags appear as the text "null" after concatenation.
+ final String[] expected = {
+ "token1 null null",
+ "token2 null null",
+ "token3 null +",
+ "token4 token2 null",
+ "token5 token2 null",
+ "token6 token2 +",
+ "token7 token2 token3+",
+ "token8 token2 token3++",
+ };
+ for (int index = 0; index < expected.length; index++) {
+ assertEquals(expected[index], rebuilt.get(index));
+ }
+ }
+
+ /* A lookup term that itself contains the separator character must return no matches. */
+ @Test
+ public void testSeparatorInLookupTerm() throws IOException {
+ // Build a two-entry automaton in memory instead of loading a resource.
+ final String[] rawEntries = { "l+A+LW", "l+A+NN1d" };
+ final FSA automaton = FSABuilder.build(toBytes("iso8859-1", rawEntries));
+
+ final DictionaryMetadata metadata = new DictionaryMetadataBuilder()
+ .separator('+')
+ .encoding("iso8859-1")
+ .encoder(EncoderType.INFIX)
+ .build();
+
+ final Dictionary dictionary = new Dictionary(automaton, metadata);
+ final DictionaryLookup lookup = new DictionaryLookup(dictionary);
+ assertEquals(0, lookup.lookup("l+A").size());
+ }
+
+ /* The lookup must report '+' as the separator for test-separators.dict. */
+ @Test
+ public void testGetSeparator() throws IOException {
+ final URL dictionaryUrl = this.getClass().getResource("test-separators.dict");
+ final Dictionary dictionary = Dictionary.read(dictionaryUrl);
+ final DictionaryLookup lookup = new DictionaryLookup(dictionary);
+ assertEquals('+', lookup.getSeparatorChar());
+ }
+
+ /*
+ * Encodes each string with the named charset, keeping the input order.
+ * An unsupported charset name is rethrown as a RuntimeException wrapping
+ * the original UnsupportedEncodingException.
+ */
+ private static byte[][] toBytes(String charset, String[] strings) {
+ final byte[][] encoded = new byte[strings.length][];
+ try {
+ int index = 0;
+ for (String text : strings) {
+ encoded[index++] = text.getBytes(charset);
+ }
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ return encoded;
+ }
+
+ /* Null-safe toString(): returns null for a null sequence, its string form otherwise. */
+ public static String asString(CharSequence s) {
+ return (s == null) ? null : s.toString();
+ }
+
+ /*
+ * Looks up a word and flattens the matches into one array of alternating
+ * stem and tag values: [stem0, tag0, stem1, tag1, ...]. Either value may be
+ * null; no matches gives an empty array.
+ */
+ public static String[] stem(IStemmer s, String word) {
+ final ArrayList<String> flattened = new ArrayList<String>();
+ for (WordData match : s.lookup(word)) {
+ final String stemText = asString(match.getStem());
+ flattened.add(stemText);
+ final String tagText = asString(match.getTag());
+ flattened.add(tagText);
+ }
+ return flattened.toArray(new String[0]);
+ }
+
+ /* Asserts that looking up the given word yields no stem/tag pairs at all. */
+ public static void assertNoStemFor(IStemmer s, String word) {
+ final String[] nothing = new String[0];
+ final String[] found = stem(s, word);
+ assertArrayEquals(nothing, found);
+ }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java new file mode 100644 index 0000000..32e7fc7 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java @@ -0,0 +1,49 @@ +package morfologik.stemming;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.fest.assertions.api.Assertions;
+import org.junit.Test;
+
+/*
+ * Checks that DictionaryMetadataBuilder offers a method for every
+ * DictionaryAttribute constant.
+ */
+public class DictionaryMetadataBuilderTest {
+ /*
+ * Calls the builder methods and verifies that the resulting attribute map
+ * has a key for every DictionaryAttribute constant.
+ */
+ @Test
+ public void testAllConstantsHaveBuilderMethods() throws IOException {
+ final Set<DictionaryAttribute> covered = new DictionaryMetadataBuilder()
+ .convertCase()
+ .encoding(Charset.defaultCharset())
+ .encoding("UTF-8")
+ .frequencyIncluded()
+ .ignoreAllUppercase()
+ .ignoreCamelCase()
+ .ignoreDiacritics()
+ .ignoreNumbers()
+ .ignorePunctuation()
+ .separator('+')
+ .supportRunOnWords()
+ .encoder(EncoderType.SUFFIX)
+ .withEquivalentChars(Collections.<Character,List<Character>>emptyMap())
+ .withReplacementPairs(Collections.<String,List<String>>emptyMap())
+ .withInputConversionPairs(Collections.<String,String>emptyMap())
+ .withOutputConversionPairs(Collections.<String,String>emptyMap())
+ .locale(Locale.getDefault())
+ .license("")
+ .author("")
+ .creationDate("")
+ .toMap().keySet();
+
+ // Whatever is left after removing the covered keys has no builder method.
+ final Set<DictionaryAttribute> missing = EnumSet.allOf(DictionaryAttribute.class);
+ missing.removeAll(covered);
+
+ Assertions.assertThat(missing).isEmpty();
+ }
+}
diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java new file mode 100644 index 0000000..13c61d7 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java @@ -0,0 +1,27 @@ +package morfologik.stemming; + +import static org.junit.Assert.assertEquals; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.Test; + +/* + * + */ +public class DictionaryTest { + /* */ + + @Test + public void testConvertText() { + Map<String, String> conversion = new HashMap<String, String>(); + conversion.put("'", "`"); + conversion.put("fi", "fi"); + conversion.put("\\a", "ą"); + conversion.put("Barack", "George"); + assertEquals("filut", Dictionary.convertText("filut", conversion)); + assertEquals("fizdrygałką", Dictionary.convertText("fizdrygałk\\a", conversion)); + assertEquals("George Bush", Dictionary.convertText("Barack Bush", conversion)); + } +} diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java new file mode 100644 index 0000000..e8c6c17 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java @@ -0,0 +1,62 @@ +package morfologik.stemming; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; + +import org.junit.Ignore; +import org.junit.Test; + +import com.carrotsearch.junitbenchmarks.AbstractBenchmark; +import com.carrotsearch.junitbenchmarks.BenchmarkOptions; + +@BenchmarkOptions(callgc = false, warmupRounds = 5, benchmarkRounds = 20) +@Ignore +public class StringDecoderBenchmarkTest extends AbstractBenchmark { + /* Guard against escape analysis and HotSpot opts. 
*/ + public volatile int guard; + + private final int sequences = 1000000; + + final String input = "dbaoidbhoei"; + final CharBuffer chars = CharBuffer.allocate(100); + final ByteBuffer bytes = ByteBuffer.allocate(100); + final CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder(); + + /** + * This is a simple comparison of performance converting a string to bytes + * using String.getBytes and CharsetEncoder (which String.getBytes uses + * internally in SUN's JDK). + */ + @Test + public void stringGetBytes() throws Exception { + int guard = 0; + for (int i = 0; i < sequences; i++) { + guard += input.getBytes("UTF-8").length; + } + this.guard = guard; + } + + @Test + public void charsetEncoder() throws Exception { + int guard = 0; + for (int i = 0; i < sequences; i++) { + chars.clear(); + for (int j = 0; j < input.length(); j++) { + chars.put(input.charAt(j)); + } + chars.flip(); + + bytes.clear(); + chars.mark(); + encoder.encode(chars, bytes, true); + bytes.flip(); + chars.reset(); + + guard += chars.remaining(); + } + + this.guard = guard; + } +} diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict Binary files differnew file mode 100644 index 0000000..2a62f21 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info new file mode 100644 index 0000000..058aed2 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info @@ -0,0 +1,9 @@ +#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=UTF-8
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict Binary files differnew file mode 100644 index 0000000..cc91f70 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info new file mode 100644 index 0000000..535fac3 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info @@ -0,0 +1,8 @@ +#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-infixes=true
\ No newline at end of file diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict Binary files differnew file mode 100644 index 0000000..d0bed4c --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info new file mode 100644 index 0000000..520739e --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info @@ -0,0 +1,11 @@ +#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-prefixes=true
+fsa.dict.uses-infixes=false
+
+fsa.dict.input-conversion=\\a ą, krowa Rzecz
\ No newline at end of file diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict Binary files differnew file mode 100644 index 0000000..a71b9e7 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info new file mode 100644 index 0000000..8ec14c3 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info @@ -0,0 +1,10 @@ +#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso8859-1
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
+fsa.dict.uses-suffixes=false
diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt new file mode 100644 index 0000000..cd77945 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt @@ -0,0 +1,8 @@ +token1+ +token2++ +token3+++ +token4+token2 +token5+token2+ +token6+token2++ +token7+token2+token3+ +token8+token2+token3++
\ No newline at end of file diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict Binary files differnew file mode 100644 index 0000000..6890253 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info new file mode 100644 index 0000000..ffce33e --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info @@ -0,0 +1,6 @@ +#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
\ No newline at end of file |