6 files changed, 561 insertions, 0 deletions
diff --git a/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java b/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java
new file mode 100644
index 0000000..4d45f9c
--- /dev/null
+++ b/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java
@@ -0,0 +1,53 @@
+package morfologik.tools;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.nio.charset.Charset;
+
+import org.hamcrest.core.StringContains;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Joiner;
+import com.google.common.io.ByteStreams;
+import com.google.common.io.Files;
+
+public class FSABuildToolTest
+{
+    /* Checks that a UTF-8 BOM at the start of the input is reported on System.err. */
+    @Test
+    public void testStemmingFile() throws Exception {
+        // Create temporary input and output files.
+        File input = File.createTempFile("input", "in");
+        File output = File.createTempFile("output", "fsa.txt");
+        input.deleteOnExit();
+        output.deleteOnExit();
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        // Emit UTF-8 BOM prefixed list of three strings.
+        baos.write(new byte [] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF});
+        baos.write(Joiner.on('\n').join("abc", "def", "xyz").getBytes(Charsets.UTF_8));
+        Files.copy(ByteStreams.newInputStreamSupplier(baos.toByteArray()), input);
+
+        // Reuse the buffer to capture System.err with a fixed charset (UTF-8).
+        baos.reset();
+        PrintStream prev = System.err;
+        PrintStream ps = new PrintStream(baos, true, "UTF-8");
+        System.setErr(ps);
+        try {
+            FSABuildTool.main(new String [] {
+                "--input", input.getAbsolutePath(),
+                "--output", output.getAbsolutePath()
+            });
+        } finally {
+            // Flush before restoring so nothing written by the tool is lost.
+            ps.flush();
+            System.setErr(prev);
+        }
+
+        String logs = new String(baos.toByteArray(), Charsets.UTF_8);
+        Assert.assertThat(logs, StringContains.containsString("UTF-8 BOM"));
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java b/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java
new file mode 100644
index 0000000..8e1d0e9
--- /dev/null
+++ b/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java
@@ -0,0 +1,26 @@
+package morfologik.tools;
+
+import java.util.Map;
+
+import morfologik.tools.Launcher.ToolInfo;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/*
+ * Smoke test: every tool returned by Launcher.initTools() must accept "--help".
+ */
+public class LauncherTest {
+    /* Invokes each tool with "--help"; a throwable fails the test and is kept as the cause. */
+    @Test
+    public void testTools() throws Exception {
+        for (Map.Entry<String, ToolInfo> e : Launcher.initTools().entrySet()) {
+            try {
+                e.getValue().invoke(new String[] {"--help"});
+            } catch (Throwable t) {
+                throw (AssertionError) new AssertionError(
+                        "Unable to launch " + e.getKey() + ": " + t).initCause(t);
+            }
+        }
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java b/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java
new file mode 100644
index 0000000..496880f
--- /dev/null
+++ b/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java
@@ -0,0 +1,243 @@
+package morfologik.tools;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.util.List;
+
+import morfologik.fsa.FSA;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.DictionaryMetadataBuilder;
+import morfologik.stemming.EncoderType;
+import morfologik.stemming.WordData;
+
+import org.fest.assertions.api.Assertions;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.google.common.base.Charsets;
+import com.google.common.io.Closer;
+
+/*
+ * End-to-end tests of the MorphEncodingTool command line entry point.
+ */
+public class MorphEncodingToolTest extends RandomizedTest {
+    /* Tracks the file streams opened by a test so that cleanup() can close them. */
+    private Closer closer = Closer.create();
+
+    @After
+    public void cleanup() throws IOException {
+        closer.close();
+    }
+
+    /*
+     * Encodes the same three entries with the suffix, prefix and infix encoders,
+     * then with infix and a tab separator, checking the output after each run.
+     */
+    @Test
+    public void testTool() throws Exception {
+        final File source = super.newTempFile();
+        final File target = super.newTempFile();
+
+        // Three tab-separated entries; the last one has no trailing newline.
+        PrintWriter out = utf8Writer(source);
+        out.println("passagère\tpassager\ttag");
+        out.println("nieduży\tduży\ttest");
+        out.print("abcd\tabc\txyz");
+        out.close();
+
+        // Suffix encoder.
+        MorphEncodingTool.main(new String[] {
+            "--input", source.getAbsolutePath(),
+            "--output", target.getAbsolutePath(),
+            "--encoder", "suffix"
+        });
+        expectLines(utf8Reader(target),
+            "passagère+Eer+tag",
+            "nieduży+Iduży+test",
+            "abcd+B+xyz");
+
+        // Prefix encoder.
+        MorphEncodingTool.main(new String[] {
+            "--input", source.getAbsolutePath(),
+            "--output", target.getAbsolutePath(),
+            "--encoder", "prefix"
+        });
+        expectLines(utf8Reader(target),
+            "passagère+AEer+tag",
+            "nieduży+DA+test",
+            "abcd+AB+xyz");
+
+        // Infix encoder.
+        MorphEncodingTool.main(new String[] {
+            "--input", source.getAbsolutePath(),
+            "--output", target.getAbsolutePath(),
+            "--encoder", "infix"
+        });
+        expectLines(utf8Reader(target),
+            "passagère+GDAr+tag",
+            "nieduży+ADA+test",
+            "abcd+AAB+xyz");
+
+        // Infix encoder with a tab as the custom annotation separator.
+        MorphEncodingTool.main(new String[] {
+            "--annotation", "\t",
+            "--input", source.getAbsolutePath(),
+            "--output", target.getAbsolutePath(),
+            "--encoder", "infix"
+        });
+        expectLines(utf8Reader(target),
+            "passagère\tGDAr\ttag",
+            "nieduży\tADA\ttest",
+            "abcd\tAAB\txyz");
+    }
+
+    /*
+     * Entries without a tag column are encoded with an empty tag; the blank
+     * input line produces no output line.
+     */
+    @Test
+    public void testStemmingFile() throws Exception {
+        final File source = super.newTempFile();
+        final File target = super.newTempFile();
+
+        PrintWriter out = utf8Writer(source);
+        out.println("passagère\tpassager");
+        out.println("nieduży\tduży");
+        out.println();
+        out.println("abcd\tabc");
+        out.close();
+
+        MorphEncodingTool.main(new String[] {
+            "--input", source.getAbsolutePath(),
+            "--output", target.getAbsolutePath(),
+            "-e", "suffix"
+        });
+
+        BufferedReader encoded = utf8Reader(target);
+        expectLines(encoded,
+            "passagère+Eer+",
+            "nieduży+Iduży+",
+            "abcd+B+");
+        encoded.close();
+    }
+
+    /*
+     * Uses the zero byte as the annotation separator, then builds an automaton
+     * from the encoded output and looks every entry up via DictionaryLookup.
+     */
+    @Test
+    public void testZeroByteSeparator() throws Exception {
+        final File source = newTempFile();
+        final File target = newTempFile();
+
+        PrintWriter out = utf8Writer(source);
+        out.println("passagère\tpassager\tTAG1");
+        out.println("nieduży\tduży\tTAG2");
+        out.println("abcd\tabc\tTAG3");
+        out.close();
+
+        MorphEncodingTool.main(new String[] {
+            "--input", source.getAbsolutePath(),
+            "--output", target.getAbsolutePath(),
+            "-e", "suffix",
+            "--annotation", "\u0000"
+        });
+
+        expectLines(utf8Reader(target),
+            "passagère\u0000Eer\u0000TAG1",
+            "nieduży\u0000Iduży\u0000TAG2",
+            "abcd\u0000B\u0000TAG3");
+
+        final File automaton = newTempFile();
+        FSABuildTool.main(
+            "--input", target.getAbsolutePath(),
+            "--output", automaton.getAbsolutePath());
+
+        FSA fsa = FSA.read(automaton);
+        DictionaryLookup lookup = new DictionaryLookup(
+            new Dictionary(
+                fsa,
+                new DictionaryMetadataBuilder()
+                    .separator((char) 0)
+                    .encoding(Charsets.UTF_8)
+                    .encoder(EncoderType.SUFFIX)
+                    .build()));
+
+        checkEntry(lookup, "passagère", "passager", "TAG1");
+        checkEntry(lookup, "nieduży", "duży", "TAG2");
+        checkEntry(lookup, "abcd", "abc", "TAG3");
+    }
+
+    /*
+     * A word containing the annotation character ('+') must make the tool
+     * report "contain the annotation byte" on System.err.
+     */
+    @Test
+    public void testAnnotationCharacterInBaseOrDerivedWord() throws Exception {
+        final File source = newTempFile();
+        final File target = newTempFile();
+
+        PrintWriter out = utf8Writer(source);
+        out.println("foo+\tbar-\tTAG1");
+        out.close();
+
+        final PrintStream originalErr = System.err;
+        final ByteArrayOutputStream captured = new ByteArrayOutputStream();
+        try {
+            System.setErr(new PrintStream(captured, true, "UTF-8"));
+            MorphEncodingTool.main(new String[] {
+                "--input", source.getAbsolutePath(),
+                "--output", target.getAbsolutePath(),
+                "-e", "suffix",
+                "--annotation", "+"
+            });
+        } finally {
+            System.err.flush();
+            System.setErr(originalErr);
+        }
+
+        String errors = new String(captured.toByteArray(), Charsets.UTF_8);
+        Assertions.assertThat(errors).contains("contain the annotation byte");
+    }
+
+    /* Asserts that looking up the word yields exactly one entry with the given stem and tag. */
+    private void checkEntry(DictionaryLookup dl, String word, String base, String tag) {
+        List<WordData> hits = dl.lookup(word);
+        Assertions.assertThat(hits.size()).isEqualTo(1);
+        WordData hit = hits.get(0);
+        Assertions.assertThat(hit.getWord().toString()).isEqualTo(word);
+        Assertions.assertThat(hit.getStem().toString()).isEqualTo(base);
+        Assertions.assertThat(hit.getTag().toString()).isEqualTo(tag);
+    }
+
+    /* Opens a UTF-8 writer on the file; its stream is registered with the closer. */
+    private PrintWriter utf8Writer(File file) throws IOException {
+        return new PrintWriter(
+            new OutputStreamWriter(closer.register(new FileOutputStream(file)), "UTF-8"));
+    }
+
+    /* Opens a UTF-8 reader on the file; its stream is registered with the closer. */
+    private BufferedReader utf8Reader(File file) throws IOException {
+        return new BufferedReader(
+            new InputStreamReader(
+                closer.register(new FileInputStream(file.getAbsolutePath())), "UTF-8"));
+    }
+
+    /* Reads one line per expected value and asserts that each matches, in order. */
+    private static void expectLines(BufferedReader reader, String... expected) throws IOException {
+        for (String line : expected) {
+            Assert.assertEquals(line, reader.readLine());
+        }
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java
new file mode 100644
index 0000000..d0379d7
--- /dev/null
+++ b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java
@@ -0,0 +1,106 @@
+package morfologik.tools;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.DictionaryMetadataBuilder;
+import morfologik.stemming.EncoderType;
+
+import org.junit.Test;
+
+import com.carrotsearch.hppc.ByteArrayList;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+import com.google.common.base.Charsets;
+import com.google.common.collect.Lists;
+
+public class SequenceEncodersRandomizedTest extends RandomizedTest {
+    private final SequenceEncoders.IEncoder coder;
+
+    public SequenceEncodersRandomizedTest(@Name("coder") SequenceEncoders.IEncoder coder)
+    {
+        this.coder = coder;
+    }
+
+    /* Supplies one constructor argument list per EncoderType value. */
+    @ParametersFactory
+    public static List<Object[]> testFactory() {
+        List<Object[]> parameters = Lists.newArrayList();
+        for (EncoderType type : EncoderType.values()) {
+            parameters.add(new Object [] {SequenceEncoders.forType(type)});
+        }
+        return parameters;
+    }
+
+    /* Round-trips 10000 pairs of strings produced by randomAsciiOfLengthBetween(0, 500). */
+    @Test
+    public void testEncodeSuffixOnRandomSequences() {
+        for (int round = 0; round < 10000; round++) {
+            String source = randomAsciiOfLengthBetween(0, 500);
+            String target = randomAsciiOfLengthBetween(0, 500);
+            assertRoundtripEncode(source, target);
+        }
+    }
+
+    /* Round-trips a fixed list of hand-written (source, target) pairs. */
+    @Test
+    public void testEncodeSamples() {
+        String [][] samples = {
+            {"", ""}, {"abc", "ab"}, {"abc", "abx"}, {"ab", "abc"},
+            {"xabc", "abc"}, {"axbc", "abc"}, {"axybc", "abc"}, {"axybc", "abc"},
+            {"azbc", "abcxy"}, {"Niemcami", "Niemiec"}, {"Niemiec", "Niemcami"}
+        };
+        for (String [] pair : samples) {
+            assertRoundtripEncode(pair[0], pair[1]);
+        }
+    }
+
+    /* Asserts that coder.decode and DictionaryLookup.decodeBaseForm both undo coder.encode. */
+    private void assertRoundtripEncode(String srcString, String dstString)
+    {
+        ByteArrayList src = ByteArrayList.from(srcString.getBytes(UTF8));
+        ByteArrayList dst = ByteArrayList.from(dstString.getBytes(UTF8));
+        ByteArrayList encoded = ByteArrayList.newInstance();
+        ByteArrayList decoded = ByteArrayList.newInstance();
+
+        coder.encode(src, dst, encoded);
+        coder.decode(src, encoded, decoded);
+
+        if (!dst.equals(decoded)) {
+            System.out.println("src: " + utf8(src));
+            System.out.println("dst: " + utf8(dst));
+            System.out.println("enc: " + utf8(encoded));
+            System.out.println("dec: " + utf8(decoded));
+        }
+        assertEquals(dst, decoded);
+
+        // Second decoding path: DictionaryLookup.decodeBaseForm.
+        DictionaryMetadataBuilder metadata = new DictionaryMetadataBuilder();
+        metadata.encoding(Charsets.UTF_8);
+        metadata.encoder(coder.type());
+
+        ByteBuffer viaLookup = DictionaryLookup.decodeBaseForm(
+            ByteBuffer.allocate(0), encoded.toArray(), encoded.size(),
+            ByteBuffer.wrap(src.toArray()), metadata.build());
+        viaLookup.flip();
+        ByteArrayList decoded2 = ByteArrayList.newInstance();
+        while (viaLookup.hasRemaining()) decoded2.add(viaLookup.get());
+
+        if (!dst.equals(decoded2)) {
+            System.out.println("DictionaryLookup.decodeBaseForm incorrect, coder: " + coder);
+            System.out.println("src : " + utf8(src));
+            System.out.println("dst : " + utf8(dst));
+            System.out.println("enc : " + utf8(encoded));
+            System.out.println("dec : " + utf8(decoded));
+            System.out.println("dec2: " + utf8(decoded2));
+        }
+        assertEquals(dst, decoded2);
+    }
+
+    /* Decodes the list's bytes as UTF-8 text for the diagnostic output above. */
+    private static String utf8(ByteArrayList bytes) {
+        return new String(bytes.toArray(), Charsets.UTF_8);
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java
new file mode 100644
index 0000000..3f1625d
--- /dev/null
+++ b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java
@@ -0,0 +1,96 @@
+package morfologik.tools;
+
+import static org.junit.Assert.*;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+
+import morfologik.stemming.EncoderType;
+
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+
+/*
+ * Checks the exact encoded output of SequenceAssembler for each trimming encoder.
+ */
+public class SequenceEncodersStaticTest {
+    private SequenceAssembler suffix = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder());
+    private SequenceAssembler prefix = new SequenceAssembler(new SequenceEncoders.TrimPrefixAndSuffixEncoder());
+    private SequenceAssembler infix = new SequenceAssembler(new SequenceEncoders.TrimInfixAndSuffixEncoder());
+
+    @Test
+    public void testStandardEncode() throws Exception {
+        assertEquals("abc+Ad+tag", encode(suffix, "abc", "abcd", "tag"));
+        assertEquals("abc+Dxyz+tag", encode(suffix, "abc", "xyz", "tag"));
+        assertEquals("abc+Bć+tag", encode(suffix, "abc", "abć", "tag"));
+    }
+
+    @Test
+    public void testSeparatorChange() throws Exception {
+        assertEquals("abc+Ad+tag", encode(suffix, "abc", "abcd", "tag"));
+
+        SequenceAssembler assembler = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder(), (byte) '_');
+        assertEquals("abc_Ad_tag", encode(assembler, "abc", "abcd", "tag"));
+
+        assembler = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder(), (byte) '\t');
+        assertEquals("abc\tAd\ttag", encode(assembler, "abc", "abcd", "tag"));
+    }
+
+    @Test
+    public void testPrefixEncode() throws UnsupportedEncodingException {
+        assertEquals("abc+AAd+tag", encode(prefix, "abc", "abcd", "tag"));
+        assertEquals("abcd+AB+tag", encode(prefix, "abcd", "abc", "tag"));
+        assertEquals("abc+ADxyz+tag", encode(prefix, "abc", "xyz", "tag"));
+        assertEquals("abc+ABć+tag", encode(prefix, "abc", "abć", "tag"));
+        assertEquals("postmodernizm+AAu+xyz", encode(prefix, "postmodernizm", "postmodernizmu", "xyz"));
+        assertEquals("postmodernizmu+AB+xyz", encode(prefix, "postmodernizmu", "postmodernizm", "xyz"));
+        assertEquals("nieduży+DA+adj", encode(prefix, "nieduży", "duży", "adj"));
+        assertEquals("postmodernizm+EA+xyz", encode(prefix, "postmodernizm", "modernizm", "xyz"));
+    }
+
+    @Test
+    public void testInfixEncode() throws UnsupportedEncodingException {
+        assertEquals("ayz+AACbc+tag", encode(infix, "ayz", "abc", "tag"));
+        assertEquals("xyz+AADabc+tag", encode(infix, "xyz", "abc", "tag"));
+
+        assertEquals("abc+AAAd+tag", encode(infix, "abc", "abcd", "tag"));
+        assertEquals("abcd+AAB+tag", encode(infix, "abcd", "abc", "tag"));
+        assertEquals("abc+AADxyz+tag", encode(infix, "abc", "xyz", "tag"));
+        assertEquals("abc+AABć+tag", encode(infix, "abc", "abć", "tag"));
+        assertEquals("postmodernizm+AAAu+xyz", encode(infix, "postmodernizm", "postmodernizmu", "xyz"));
+        assertEquals("postmodernizmu+AAB+xyz", encode(infix, "postmodernizmu", "postmodernizm", "xyz"));
+        assertEquals("nieduży+ADA+adj", encode(infix, "nieduży", "duży", "adj"));
+
+        // real infix cases
+        assertEquals("kcal+ABA+xyz", encode(infix, "kcal", "cal", "xyz"));
+        assertEquals("aillent+BBCr+xyz", encode(infix, "aillent", "aller", "xyz"));
+        assertEquals("laquelle+AGAquel+D f s", encode(infix, "laquelle", "lequel", "D f s"));
+        assertEquals("ccal+ABA+test", encode(infix, "ccal", "cal", "test"));
+    }
+
+    @Test
+    public void testUTF8Boundary() throws Exception {
+        assertEquals("passagère+Eer+tag", encode(suffix, "passagère", "passager", "tag"));
+        assertEquals("passagère+GDAr+tag", encode(infix, "passagère", "passager", "tag"));
+        assertEquals("passagère+AEer+tag", encode(prefix, "passagère", "passager", "tag"));
+    }
+
+    /* Asserts on the encoder itself; asserting on a boxed boolean could never fail. */
+    @Test
+    public void testAllEncodersHaveImplementations() {
+        for (EncoderType t : EncoderType.values()) {
+            assertNotNull("No encoder for type: " + t, SequenceEncoders.forType(t));
+        }
+    }
+
+    /* Encodes the UTF-8 bytes of form, lemma and tag and returns the result decoded as UTF-8. */
+    private String encode(SequenceAssembler assembler, String wordForm,
+        String wordLemma, String wordTag) {
+        Charset UTF8 = Charsets.UTF_8;
+        return new String(assembler.encode(
+            wordForm.getBytes(UTF8),
+            wordLemma.getBytes(UTF8),
+            wordTag.getBytes(UTF8)), UTF8);
+    }
+}
diff --git a/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java b/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java
new file mode 100644
index 0000000..573c5da
--- /dev/null
+++ b/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java
@@ -0,0 +1,37 @@
+package morfologik.tools;
+
+import java.io.*;
+
+import morfologik.fsa.*;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/* Builds an automaton from a three-word text file with FSABuildTool and checks its final state count. */
+public class Text2FSA5Test {
+    @Test
+    public void testTool() throws Exception {
+        File input = File.createTempFile("input", "in");
+        File output = File.createTempFile("output", "fsa");
+        input.deleteOnExit();
+        output.deleteOnExit();
+
+        PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(input), "UTF-8"));
+        w.println("b");
+        w.println("cab");
+        w.println("ab");
+        w.close();
+
+        FSABuildTool.main(new String [] {
+            "--input", input.getAbsolutePath(), "--output", output.getAbsolutePath() });
+
+        // Close the stream after reading so the file handle is not leaked.
+        FileInputStream in = new FileInputStream(output);
+        try {
+            FSA5 fsa = FSA.read(in);
+            Assert.assertEquals(3, new FSAInfo(fsa).finalStatesCount);
+        } finally {
+            in.close();
+        }
+    }
+}