diff options
Diffstat (limited to 'morfologik-tools/src/test/java/morfologik/tools')
6 files changed, 561 insertions, 0 deletions
diff --git a/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java b/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java new file mode 100644 index 0000000..4d45f9c --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java @@ -0,0 +1,53 @@ +package morfologik.tools;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.nio.charset.Charset;
+
+import org.hamcrest.core.StringContains;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Joiner;
+import com.google.common.io.ByteStreams;
+import com.google.common.io.Files;
+
+/**
+ * Runs {@link FSABuildTool} on an input file that starts with a UTF-8 byte
+ * order mark and inspects what the tool prints to <code>System.err</code>.
+ */
+public class FSABuildToolTest {
+    /** The three bytes written in front of the word list. */
+    private static final byte[] UTF8_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
+
+    /**
+     * Writes a BOM-prefixed list of three strings, runs the tool with
+     * <code>System.err</code> redirected to memory and expects the captured
+     * text to contain "UTF-8 BOM".
+     */
+    @Test
+    public void testStemmingFile() throws Exception {
+        File input = File.createTempFile("input", "in");
+        File output = File.createTempFile("output", "fsa.txt");
+        input.deleteOnExit();
+        output.deleteOnExit();
+
+        // Input content: the BOM followed by three newline-separated strings.
+        ByteArrayOutputStream content = new ByteArrayOutputStream();
+        content.write(UTF8_BOM);
+        content.write(Joiner.on('\n').join("abc", "def", "xyz").getBytes(Charsets.UTF_8));
+        Files.copy(ByteStreams.newInputStreamSupplier(content.toByteArray()), input);
+
+        // Swap System.err for an in-memory stream while the tool runs.
+        ByteArrayOutputStream captured = new ByteArrayOutputStream();
+        PrintStream originalErr = System.err;
+        System.setErr(new PrintStream(captured));
+        try {
+            String[] args = {
+                "--input", input.getAbsolutePath(),
+                "--output", output.getAbsolutePath()
+            };
+            FSABuildTool.main(args);
+        } finally {
+            System.setErr(originalErr);
+        }
+
+        String logs = new String(captured.toByteArray(), Charset.defaultCharset());
+        Assert.assertThat(logs, StringContains.containsString("UTF-8 BOM"));
+
+        System.out.println(logs);
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java b/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java new file mode 100644 index 0000000..8e1d0e9 --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java @@ -0,0 +1,26 @@ +package morfologik.tools;
+
+import java.util.Map;
+
+import morfologik.tools.Launcher.ToolInfo;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Smoke test: every tool returned by {@link Launcher#initTools()} is invoked
+ * with <code>--help</code> and must not throw.
+ */
+public class LauncherTest {
+    /**
+     * Invokes each registered tool with <code>--help</code> and fails, naming
+     * the tool, as soon as one of them throws.
+     */
+    @Test
+    public void testTools() throws Exception {
+        for (Map.Entry<String, ToolInfo> e : Launcher.initTools().entrySet()) {
+            try {
+                e.getValue().invoke(new String[] {"--help"});
+            } catch (Throwable t) {
+                // Report the throwable itself (its message may be null) and keep
+                // it as the cause so the original stack trace is not lost.
+                AssertionError failure = new AssertionError(
+                    "Unable to launch " + e.getKey() + ": " + t);
+                failure.initCause(t);
+                throw failure;
+            }
+        }
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java b/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java new file mode 100644 index 0000000..496880f --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java @@ -0,0 +1,243 @@ +package morfologik.tools;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.util.List;
+
+import morfologik.fsa.FSA;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.DictionaryMetadataBuilder;
+import morfologik.stemming.EncoderType;
+import morfologik.stemming.WordData;
+
+import org.fest.assertions.api.Assertions;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.google.common.base.Charsets;
+import com.google.common.io.Closer;
+
+/**
+ * End-to-end tests of {@link MorphEncodingTool}: each test writes a small
+ * tab-separated input file, runs the tool's <code>main</code> method and
+ * inspects what it produced.
+ */
+public class MorphEncodingToolTest extends RandomizedTest {
+    /** Collects the file streams opened by a test; closed in {@link #cleanup()}. */
+    private Closer closer = Closer.create();
+
+    @After
+    public void cleanup() throws IOException {
+        closer.close();
+    }
+
+    /**
+     * Runs the suffix, prefix and infix encoders with the default separator,
+     * then the infix encoder with a tab as the annotation character.
+     */
+    @Test
+    public void testTool() throws Exception {
+        File input = super.newTempFile();
+        File output = super.newTempFile();
+
+        // Three entries; the last one is written without a trailing line break.
+        PrintWriter w = utf8Writer(input);
+        w.println("passagère\tpassager\ttag");
+        w.println("nieduży\tduży\ttest");
+        w.print("abcd\tabc\txyz");
+        w.close();
+
+        // Encoder name followed by the three output lines expected for it.
+        String[][] expectedByEncoder = {
+            { "suffix", "passagère+Eer+tag", "nieduży+Iduży+test", "abcd+B+xyz" },
+            { "prefix", "passagère+AEer+tag", "nieduży+DA+test", "abcd+AB+xyz" },
+            { "infix", "passagère+GDAr+tag", "nieduży+ADA+test", "abcd+AAB+xyz" },
+        };
+        for (String[] row : expectedByEncoder) {
+            MorphEncodingTool.main(new String[] {
+                "--input", input.getAbsolutePath(),
+                "--output", output.getAbsolutePath(),
+                "--encoder", row[0] });
+
+            assertNextLines(utf8Reader(output), row[1], row[2], row[3]);
+        }
+
+        // custom annotation - test tabs
+        MorphEncodingTool.main(new String[] {
+            "--annotation", "\t",
+            "--input", input.getAbsolutePath(),
+            "--output", output.getAbsolutePath(),
+            "--encoder", "infix" });
+
+        assertNextLines(utf8Reader(output),
+            "passagère\tGDAr\ttag",
+            "nieduży\tADA\ttest",
+            "abcd\tAAB\txyz");
+    }
+
+    /**
+     * Input rows carry no tag column and include one empty line; the expected
+     * output has three lines, each ending with the separator.
+     */
+    @Test
+    public void testStemmingFile() throws Exception {
+        File input = super.newTempFile();
+        File output = super.newTempFile();
+
+        PrintWriter w = utf8Writer(input);
+        w.println("passagère\tpassager");
+        w.println("nieduży\tduży");
+        w.println();
+        w.println("abcd\tabc");
+        w.close();
+
+        MorphEncodingTool.main(new String[] {
+            "--input", input.getAbsolutePath(),
+            "--output", output.getAbsolutePath(),
+            "-e", "suffix" });
+
+        BufferedReader produced = utf8Reader(output);
+        assertNextLines(produced,
+            "passagère+Eer+",
+            "nieduży+Iduży+",
+            "abcd+B+");
+
+        produced.close();
+    }
+
+    /**
+     * Uses the zero character as the annotation separator, then builds an
+     * automaton from the encoded file and looks the three words up again.
+     */
+    @Test
+    public void testZeroByteSeparator() throws Exception {
+        File input = newTempFile();
+        File output = newTempFile();
+
+        PrintWriter w = utf8Writer(input);
+        w.println("passagère\tpassager\tTAG1");
+        w.println("nieduży\tduży\tTAG2");
+        w.println("abcd\tabc\tTAG3");
+        w.close();
+
+        MorphEncodingTool.main(new String[] {
+            "--input", input.getAbsolutePath(),
+            "--output", output.getAbsolutePath(),
+            "-e", "suffix",
+            "--annotation", "\u0000"});
+
+        assertNextLines(utf8Reader(output),
+            "passagère\u0000Eer\u0000TAG1",
+            "nieduży\u0000Iduży\u0000TAG2",
+            "abcd\u0000B\u0000TAG3");
+
+        // Feed the encoded file to the automaton builder.
+        File fsaFile = newTempFile();
+        FSABuildTool.main(
+            "--input", output.getAbsolutePath(),
+            "--output", fsaFile.getAbsolutePath());
+
+        FSA fsa = FSA.read(fsaFile);
+        Dictionary dictionary = new Dictionary(
+            fsa,
+            new DictionaryMetadataBuilder()
+                .separator((char) 0)
+                .encoding(Charsets.UTF_8)
+                .encoder(EncoderType.SUFFIX)
+                .build());
+        DictionaryLookup dl = new DictionaryLookup(dictionary);
+
+        checkEntry(dl, "passagère", "passager", "TAG1");
+        checkEntry(dl, "nieduży", "duży", "TAG2");
+        checkEntry(dl, "abcd", "abc", "TAG3");
+    }
+
+    /**
+     * The input word contains the annotation character '+'; the text the tool
+     * prints to <code>System.err</code> must mention the annotation byte.
+     */
+    @Test
+    public void testAnnotationCharacterInBaseOrDerivedWord() throws Exception {
+        File input = newTempFile();
+        File output = newTempFile();
+
+        PrintWriter w = utf8Writer(input);
+        w.println("foo+\tbar-\tTAG1");
+        w.close();
+
+        PrintStream originalErr = System.err;
+        ByteArrayOutputStream captured = new ByteArrayOutputStream();
+
+        try {
+            System.setErr(new PrintStream(captured, true, "UTF-8"));
+            MorphEncodingTool.main(new String[] {
+                "--input", input.getAbsolutePath(),
+                "--output", output.getAbsolutePath(),
+                "-e", "suffix",
+                "--annotation", "+"});
+        } finally {
+            // Flush whatever stream is currently installed before restoring.
+            System.err.flush();
+            System.setErr(originalErr);
+        }
+
+        String errorText = new String(captured.toByteArray(), Charsets.UTF_8);
+        Assertions.assertThat(errorText)
+            .contains("contain the annotation byte");
+    }
+
+    /** Opens a UTF-8 writer on the file; the file stream is registered with the closer. */
+    private PrintWriter utf8Writer(File target) throws IOException {
+        return new PrintWriter(
+            new OutputStreamWriter(
+                closer.register(new FileOutputStream(target)), "UTF-8"));
+    }
+
+    /** Opens a UTF-8 reader on the file; the file stream is registered with the closer. */
+    private BufferedReader utf8Reader(File source) throws IOException {
+        return new BufferedReader(
+            new InputStreamReader(
+                closer.register(new FileInputStream(source.getAbsolutePath())), "UTF-8"));
+    }
+
+    /** Reads one line per expected value and compares them in order. */
+    private static void assertNextLines(BufferedReader reader, String... expected)
+        throws IOException {
+        for (String line : expected) {
+            Assert.assertEquals(line, reader.readLine());
+        }
+    }
+
+    /** Expects exactly one lookup result for the word, with the given stem and tag. */
+    private void checkEntry(DictionaryLookup dl, String word, String base, String tag) {
+        List<WordData> hits = dl.lookup(word);
+        Assertions.assertThat(hits.size()).isEqualTo(1);
+
+        WordData only = hits.get(0);
+        Assertions.assertThat(only.getWord().toString()).isEqualTo(word);
+        Assertions.assertThat(only.getStem().toString()).isEqualTo(base);
+        Assertions.assertThat(only.getTag().toString()).isEqualTo(tag);
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java new file mode 100644 index 0000000..d0379d7 --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java @@ -0,0 +1,106 @@ +package morfologik.tools;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.DictionaryMetadataBuilder;
+import morfologik.stemming.EncoderType;
+
+import org.junit.Test;
+
+import com.carrotsearch.hppc.ByteArrayList;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+import com.google.common.base.Charsets;
+import com.google.common.collect.Lists;
+
+/**
+ * Round-trip tests run once per encoder returned by
+ * {@link SequenceEncoders#forType} for the values of {@link EncoderType}.
+ */
+public class SequenceEncodersRandomizedTest extends RandomizedTest {
+    private final SequenceEncoders.IEncoder coder;
+
+    public SequenceEncodersRandomizedTest(@Name("coder") SequenceEncoders.IEncoder coder) {
+        this.coder = coder;
+    }
+
+    /** Supplies one constructor argument list per {@link EncoderType} value. */
+    @ParametersFactory
+    public static List<Object[]> testFactory() {
+        List<Object[]> parameters = Lists.newArrayList();
+        for (EncoderType type : EncoderType.values()) {
+            parameters.add(new Object[] { SequenceEncoders.forType(type) });
+        }
+        return parameters;
+    }
+
+    /** Round-trips 10000 pairs of random ASCII strings of up to 500 characters. */
+    @Test
+    public void testEncodeSuffixOnRandomSequences() {
+        for (int round = 0; round < 10000; round++) {
+            String source = randomAsciiOfLengthBetween(0, 500);
+            String target = randomAsciiOfLengthBetween(0, 500);
+            assertRoundtripEncode(source, target);
+        }
+    }
+
+    /** Round-trips a fixed list of (source, target) pairs. */
+    @Test
+    public void testEncodeSamples() {
+        String[][] samples = {
+            { "", "" },
+            { "abc", "ab" },
+            { "abc", "abx" },
+            { "ab", "abc" },
+            { "xabc", "abc" },
+            { "axbc", "abc" },
+            { "axybc", "abc" },
+            { "axybc", "abc" },
+            { "azbc", "abcxy" },
+            { "Niemcami", "Niemiec" },
+            { "Niemiec", "Niemcami" },
+        };
+        for (String[] pair : samples) {
+            assertRoundtripEncode(pair[0], pair[1]);
+        }
+    }
+
+    /**
+     * Encodes <code>dstString</code> relative to <code>srcString</code> and checks
+     * that both the coder's own <code>decode</code> and
+     * {@link DictionaryLookup#decodeBaseForm} give the target bytes back. On a
+     * mismatch the sequences are printed before the assertion fails.
+     */
+    private void assertRoundtripEncode(String srcString, String dstString) {
+        ByteArrayList src = ByteArrayList.from(srcString.getBytes(UTF8));
+        ByteArrayList dst = ByteArrayList.from(dstString.getBytes(UTF8));
+        ByteArrayList encoded = ByteArrayList.newInstance();
+        ByteArrayList decoded = ByteArrayList.newInstance();
+
+        coder.encode(src, dst, encoded);
+        coder.decode(src, encoded, decoded);
+
+        boolean coderOk = dst.equals(decoded);
+        if (!coderOk) {
+            System.out.println("src: " + utf8(src));
+            System.out.println("dst: " + utf8(dst));
+            System.out.println("enc: " + utf8(encoded));
+            System.out.println("dec: " + utf8(decoded));
+        }
+
+        assertEquals(dst, decoded);
+
+        // DictionaryLookup.decodeBaseForm decoding testing
+        DictionaryMetadataBuilder builder = new DictionaryMetadataBuilder();
+        builder.encoding(Charsets.UTF_8);
+        builder.encoder(coder.type());
+
+        ByteBuffer bb = DictionaryLookup.decodeBaseForm(
+            ByteBuffer.allocate(0),
+            encoded.toArray(),
+            encoded.size(),
+            ByteBuffer.wrap(src.toArray()), builder.build());
+
+        // Copy the returned buffer's content into a list for comparison.
+        ByteArrayList decoded2 = ByteArrayList.newInstance();
+        bb.flip();
+        while (bb.hasRemaining()) {
+            decoded2.add(bb.get());
+        }
+
+        boolean lookupOk = dst.equals(decoded2);
+        if (!lookupOk) {
+            System.out.println("DictionaryLookup.decodeBaseForm incorrect, coder: " + coder);
+            System.out.println("src : " + utf8(src));
+            System.out.println("dst : " + utf8(dst));
+            System.out.println("enc : " + utf8(encoded));
+            System.out.println("dec : " + utf8(decoded));
+            System.out.println("dec2: " + utf8(decoded2));
+        }
+
+        assertEquals(dst, decoded2);
+    }
+
+    /** Renders the bytes of the list as a UTF-8 string for diagnostics. */
+    private static String utf8(ByteArrayList bytes) {
+        return new String(bytes.toArray(), Charsets.UTF_8);
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java new file mode 100644 index 0000000..3f1625d --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java @@ -0,0 +1,96 @@ +package morfologik.tools;
+
+import static org.junit.Assert.*;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+
+import morfologik.stemming.EncoderType;
+
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Fixed-input tests of the three encoders in {@link SequenceEncoders}, driven
+ * through {@link SequenceAssembler} and compared against literal expected
+ * strings.
+ */
+public class SequenceEncodersStaticTest {
+    private SequenceAssembler suffix = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder());
+    private SequenceAssembler prefix = new SequenceAssembler(new SequenceEncoders.TrimPrefixAndSuffixEncoder());
+    private SequenceAssembler infix = new SequenceAssembler(new SequenceEncoders.TrimInfixAndSuffixEncoder());
+
+    @Test
+    public void testStandardEncode() throws Exception {
+        assertEquals("abc+Ad+tag", encode(suffix, "abc", "abcd", "tag"));
+        assertEquals("abc+Dxyz+tag", encode(suffix, "abc", "xyz", "tag"));
+        assertEquals("abc+Bć+tag", encode(suffix, "abc", "abć", "tag"));
+    }
+
+    /** The separator byte passed to the assembler replaces the default '+'. */
+    @Test
+    public void testSeparatorChange() throws Exception {
+        assertEquals("abc+Ad+tag", encode(suffix, "abc", "abcd", "tag"));
+
+        SequenceAssembler assembler = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder(), (byte) '_');
+        assertEquals("abc_Ad_tag", encode(assembler, "abc", "abcd", "tag"));
+
+        assembler = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder(), (byte) '\t');
+        assertEquals("abc\tAd\ttag", encode(assembler, "abc", "abcd", "tag"));
+    }
+
+    @Test
+    public void testPrefixEncode() throws UnsupportedEncodingException {
+        assertEquals("abc+AAd+tag", encode(prefix, "abc", "abcd", "tag"));
+        assertEquals("abcd+AB+tag", encode(prefix, "abcd", "abc", "tag"));
+        assertEquals("abc+ADxyz+tag", encode(prefix, "abc", "xyz", "tag"));
+        assertEquals("abc+ABć+tag", encode(prefix, "abc", "abć", "tag"));
+        assertEquals("postmodernizm+AAu+xyz", encode(prefix, "postmodernizm", "postmodernizmu", "xyz"));
+        assertEquals("postmodernizmu+AB+xyz", encode(prefix, "postmodernizmu", "postmodernizm", "xyz"));
+        assertEquals("nieduży+DA+adj", encode(prefix, "nieduży", "duży", "adj"));
+        assertEquals("postmodernizm+EA+xyz", encode(prefix, "postmodernizm", "modernizm", "xyz"));
+    }
+
+    @Test
+    public void testInfixEncode() throws UnsupportedEncodingException {
+        assertEquals("ayz+AACbc+tag", encode(infix, "ayz", "abc", "tag"));
+        assertEquals("xyz+AADabc+tag", encode(infix, "xyz", "abc", "tag"));
+
+        assertEquals("abc+AAAd+tag", encode(infix, "abc", "abcd", "tag"));
+        assertEquals("abcd+AAB+tag", encode(infix, "abcd", "abc", "tag"));
+        assertEquals("abc+AADxyz+tag", encode(infix, "abc", "xyz", "tag"));
+        assertEquals("abc+AABć+tag", encode(infix, "abc", "abć", "tag"));
+        assertEquals("postmodernizm+AAAu+xyz", encode(infix, "postmodernizm", "postmodernizmu", "xyz"));
+        assertEquals("postmodernizmu+AAB+xyz", encode(infix, "postmodernizmu", "postmodernizm", "xyz"));
+        assertEquals("nieduży+ADA+adj", encode(infix, "nieduży", "duży", "adj"));
+
+        // real infix cases
+        assertEquals("kcal+ABA+xyz", encode(infix, "kcal", "cal", "xyz"));
+        assertEquals("aillent+BBCr+xyz", encode(infix, "aillent", "aller", "xyz"));
+        assertEquals("laquelle+AGAquel+D f s", encode(infix, "laquelle", "lequel", "D f s"));
+        // NOTE(review): the next assertion appears twice in the original; kept
+        // as is in case the repetition checks reuse of the same assembler.
+        assertEquals("ccal+ABA+test", encode(infix, "ccal", "cal", "test"));
+        assertEquals("ccal+ABA+test", encode(infix, "ccal", "cal", "test"));
+    }
+
+    /** The same word pair encoded by all three encoders; the word contains a non-ASCII character. */
+    @Test
+    public void testUTF8Boundary() throws Exception {
+        assertEquals("passagère+Eer+tag", encode(suffix, "passagère", "passager", "tag"));
+        assertEquals("passagère+GDAr+tag", encode(infix, "passagère", "passager", "tag"));
+        assertEquals("passagère+AEer+tag", encode(prefix, "passagère", "passager", "tag"));
+    }
+
+    /** Every {@link EncoderType} value must map to a non-null encoder. */
+    @Test
+    public void testAllEncodersHaveImplementations() {
+        for (EncoderType t : EncoderType.values()) {
+            // Assert on the encoder itself: a boxed boolean is never null, so
+            // asserting on a comparison result could not fail.
+            assertNotNull("No encoder for type: " + t, SequenceEncoders.forType(t));
+        }
+    }
+
+    /**
+     * Encodes the three UTF-8 strings with the given assembler and returns the
+     * resulting bytes decoded as UTF-8.
+     */
+    private String encode(SequenceAssembler assembler, String wordForm,
+        String wordLemma, String wordTag)
+    {
+        Charset UTF8 = Charsets.UTF_8;
+        return new String(assembler.encode(
+            wordForm.getBytes(UTF8),
+            wordLemma.getBytes(UTF8),
+            wordTag.getBytes(UTF8)), UTF8);
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java b/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java new file mode 100644 index 0000000..573c5da --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java @@ -0,0 +1,37 @@ +package morfologik.tools;
+
+import java.io.*;
+
+import morfologik.fsa.*;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Builds an automaton from a three-word text file with {@link FSABuildTool}
+ * and checks the number of final states reported for it.
+ */
+public class Text2FSA5Test {
+    /**
+     * Writes three words to a temporary file, runs the build tool on it and
+     * expects {@link FSAInfo} to report three final states for the result.
+     */
+    @Test
+    public void testTool() throws Exception {
+        // Create a simple plain text file.
+        File input = File.createTempFile("input", "in");
+        File output = File.createTempFile("output", "fsa");
+        input.deleteOnExit();
+        output.deleteOnExit();
+
+        // Populate the file with data; close the writer even if a write fails.
+        PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(input), "UTF-8"));
+        try {
+            w.println("b");
+            w.println("cab");
+            w.println("ab");
+        } finally {
+            w.close();
+        }
+
+        FSABuildTool.main(new String [] {
+            "--input", input.getAbsolutePath(),
+            "--output", output.getAbsolutePath()
+        });
+
+        // Close the stream explicitly so the file handle is released even
+        // when reading or the assertion fails.
+        FileInputStream fsaStream = new FileInputStream(output);
+        try {
+            FSA5 fsa = FSA.read(fsaStream);
+            Assert.assertEquals(3, new FSAInfo(fsa).finalStatesCount);
+        } finally {
+            fsaStream.close();
+        }
+    }
+}
|