summary refs log tree commit diff
path: root/morfologik-speller/src/test/java
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-speller/src/test/java')
-rw-r--r-- morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java 21
-rw-r--r-- morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java 272
2 files changed, 293 insertions, 0 deletions
diff --git a/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java b/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java
new file mode 100644
index 0000000..38aa76d
--- /dev/null
+++ b/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java
@@ -0,0 +1,21 @@
+package morfologik.speller;
+
+import static org.junit.Assert.*;
+
+import morfologik.speller.HMatrix;
+
+import org.junit.Test;
+
+public class HMatrixTest { // Smoke test: construct HMatrix instances and read one cell.
+
+ private static final int MAX_WORD_LENGTH = 120; // second constructor argument used for every matrix below
+
+ @Test
+ public void stressTestInit() {
+ for (int i = 0; i < 10; i++) { // first constructor argument 0..9: check that we don't get beyond array limits etc.
+ HMatrix H = new HMatrix(i, MAX_WORD_LENGTH);
+ assertEquals(0, H.get(1, 1)); // a freshly constructed matrix is expected to hold 0 at (1, 1)
+ }
+ }
+
+}
diff --git a/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java b/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java
new file mode 100644
index 0000000..48ed2c1
--- /dev/null
+++ b/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java
@@ -0,0 +1,272 @@
+package morfologik.speller;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+
+import org.fest.assertions.api.Assertions;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class SpellerTest {
+ private static Dictionary dictionary;
+
+ @BeforeClass // runs once, before any test method of this class
+ public static void setup() throws Exception {
+ final URL url = SpellerTest.class.getResource("slownik.dict"); // resource is looked up relative to this class
+ dictionary = Dictionary.read(url); // shared by every test that builds a Speller from 'dictionary'
+ }
+
+ /*
+ @Test
+ public void testAbka() throws Exception {
+ final Speller spell = new Speller(dictionary, 2);
+ System.out.println("Replacements:");
+ for (String s : spell.findReplacements("abka")) {
+ System.out.println(s);
+ }
+ }
+ */
+
+ @Test // replaceRunOnWords: a single word yields nothing, two glued dictionary words yield the split form
+ public void testRunonWords() throws IOException {
+ final Speller spell = new Speller(dictionary);
+ Assertions.assertThat(spell.replaceRunOnWords("abaka")).isEmpty();
+ Assertions.assertThat(spell.replaceRunOnWords("abakaabace")).contains("abaka abace");
+
+ // Test on a morphological dictionary - should work as well
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ Assertions.assertThat(spell1.replaceRunOnWords("Rzekunia")).isEmpty();
+ Assertions.assertThat(spell1.replaceRunOnWords("RzekuniaRzeczypospolitej")).contains("Rzekunia Rzeczypospolitej");
+ Assertions.assertThat(spell1.replaceRunOnWords("RzekuniaRze")).isEmpty(); //Rze is not found but is a prefix
+ }
+
+ @Test // isInDictionary lookups against test-infix.dict and test-utf-spell.dict
+ public void testIsInDictionary() throws IOException {
+ // Test on a morphological dictionary, including separators
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ assertTrue(spell1.isInDictionary("Rzekunia"));
+ assertTrue(!spell1.isInDictionary("Rzekunia+")); // word followed by the separator must not match
+ assertTrue(!spell1.isInDictionary("Rzekunia+aaa")); // nor word + separator + extra text
+ // test UTF-8 dictionary
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(spell.isInDictionary("jaźń"));
+ assertTrue(spell.isInDictionary("zażółć"));
+ assertTrue(spell.isInDictionary("żółwiową"));
+ assertTrue(spell.isInDictionary("ćwikła"));
+ assertTrue(spell.isInDictionary("Żebrowski"));
+ assertTrue(spell.isInDictionary("Święto"));
+ assertTrue(spell.isInDictionary("Świerczewski"));
+ assertTrue(spell.isInDictionary("abc"));
+ }
+
+ @Test // findReplacements on the shared dictionary and on test-infix.dict, including degenerate inputs
+ public void testFindReplacements() throws IOException {
+ final Speller spell = new Speller(dictionary, 1);
+ assertTrue(spell.findReplacements("abka").contains("abak"));
+ //check if we get only dictionary words...
+ List<String> reps = spell.findReplacements("bak");
+ for (final String word: reps) {
+ assertTrue(spell.isInDictionary(word));
+ }
+ assertTrue(spell.findReplacements("abka~~").isEmpty()); // 2 characters more -> edit distance too large
+ assertTrue(!spell.findReplacements("Rezkunia").contains("Rzekunia")); // not suggested from the shared dictionary
+
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ assertTrue(spell1.findReplacements("Rezkunia").contains("Rzekunia")); // but suggested from test-infix.dict
+ //diacritics
+ assertTrue(spell1.findReplacements("Rzękunia").contains("Rzekunia"));
+ //we should get no candidates for correct words
+ assertTrue(spell1.isInDictionary("Rzekunia"));
+ assertTrue(spell1.findReplacements("Rzekunia").isEmpty());
+ //and none for things that are too different from the dictionary
+ assertTrue(spell1.findReplacements("Strefakibica").isEmpty());
+ //nothing for the empty string
+ assertTrue(spell1.findReplacements("").isEmpty());
+ //nothing for weird characters
+ assertTrue(spell1.findReplacements("\u0000").isEmpty());
+ //nothing for other characters
+ assertTrue(spell1.findReplacements("«…»").isEmpty());
+ //nothing for separator
+ assertTrue(spell1.findReplacements("+").isEmpty());
+
+ }
+
+ @Test // isInDictionary on test_freq_iso.dict, a dictionary that (per the test name) is not UTF-8
+ public void testFrequencyNonUTFDictionary() throws IOException {
+ final URL url1 = getClass().getResource("test_freq_iso.dict");
+ final Speller spell = new Speller(Dictionary.read(url1));
+ assertTrue(spell.isInDictionary("a"));
+ assertTrue(!spell.isInDictionary("aõh")); //presumably not encodable in the dictionary's own (non-UTF-8) encoding - TODO confirm
+ }
+
+ @Test // findReplacements on test-utf-spell.dict: diacritics, equivalent characters and string replacements
+ public void testFindReplacementsInUTF() throws IOException {
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(spell.findReplacements("gęslą").contains("gęślą"));
+ assertTrue(spell.findReplacements("ćwikla").contains("ćwikła"));
+ assertTrue(spell.findReplacements("Swierczewski").contains("Świerczewski"));
+ assertTrue(spell.findReplacements("zółwiową").contains("żółwiową"));
+ assertTrue(spell.findReplacements("Żebrowsk").contains("Żebrowski"));
+ assertTrue(spell.findReplacements("święto").contains("Święto"));
+ //note: no diacritics here, but we still get matches!
+ assertTrue(spell.findReplacements("gesla").contains("gęślą"));
+ assertTrue(spell.findReplacements("swieto").contains("Święto"));
+ assertTrue(spell.findReplacements("zolwiowa").contains("żółwiową"));
+ //using equivalent characters 'x' = 'ź'
+ assertTrue(spell.findReplacements("jexn").contains("jaźń"));
+ // 'u' = 'ó', so the edit distance is still small...
+ assertTrue(spell.findReplacements("zażulv").contains("zażółć"));
+ // 'rz' = 'ż', so the edit distance is still small, but with string replacements...
+ assertTrue(spell.findReplacements("zarzulv").contains("zażółć"));
+ assertTrue(spell.findReplacements("Rzebrowski").contains("Żebrowski"));
+ assertTrue(spell.findReplacements("rzółw").contains("żółw"));
+ assertTrue(spell.findReplacements("Świento").contains("Święto"));
+ // avoid mixed-case words as suggestions when using replacements ('rz' = 'ż'); assertEquals reports the actual first suggestion on failure
+ assertEquals("zażółć", spell.findReplacements("zArzółć").get(0));
+ }
+
+ @Test // findReplacements on dict-with-freq.dict: suggestions are dictionary words, in the expected order
+ public void testFindReplacementsUsingFrequency() throws IOException {
+ final URL url = getClass().getResource("dict-with-freq.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+
+ //check if we get only dictionary words...
+ List<String> reps = spell.findReplacements("jist");
+ for (final String word: reps) {
+ assertTrue(spell.isInDictionary(word));
+ }
+ // get replacements ordered by frequency (assertEquals reports the actual suggestion on failure)
+ assertEquals("just", reps.get(0));
+ assertEquals("list", reps.get(1));
+ assertEquals("fist", reps.get(2));
+ assertEquals("mist", reps.get(3));
+ assertEquals("jest", reps.get(4));
+ assertEquals("dist", reps.get(5));
+ assertEquals("gist", reps.get(6));
+ }
+
+ @Test // isMisspelled on three spellers: test-utf-spell.dict, the shared dictionary, and test-infix.dict
+ public void testIsMisspelled() throws IOException {
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(!spell.isMisspelled("Paragraf22")); //we ignore numbers
+ assertTrue(!spell.isMisspelled("!")); //we ignore punctuation marks
+ assertTrue(spell.isMisspelled("dziekie")); //test whether we find the error
+ assertTrue(!spell.isMisspelled("SłowozGarbem")); //we ignore camel-case words
+ assertTrue(!spell.isMisspelled("Ćwikła")); //and lowercase letters
+ assertTrue(!spell.isMisspelled("TOJESTTEST")); //and uppercase letters
+ final Speller oldStyleSpell = new Speller(dictionary, 1);
+ assertTrue(oldStyleSpell.isMisspelled("Paragraf22")); // we do not ignore numbers
+ assertTrue(oldStyleSpell.isMisspelled("!")); //we do not ignore punctuation marks
+ // assertTrue(oldStyleSpell.isMisspelled("SłowozGarbem")); //we ignore camel-case words
+ assertTrue(oldStyleSpell.isMisspelled("Abaka")); //and lowercase letters
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ assertTrue(!spell1.isMisspelled("Rzekunia"));
+ assertTrue(spell1.isAllUppercase("RZEKUNIA"));
+ assertTrue(spell1.isMisspelled("RZEKUNIAA")); // finds a typo here
+ assertTrue(!spell1.isMisspelled("RZEKUNIA")); // but not here
+ }
+
+ @Test // isCamelCase: of the inputs below only "CamelCase" is expected to qualify
+ public void testCamelCase() {
+ final Speller spell = new Speller(dictionary, 1);
+ assertTrue(spell.isCamelCase("CamelCase"));
+ assertTrue(!spell.isCamelCase("Camel")); // plain capitalized word
+ assertTrue(!spell.isCamelCase("CAMEL")); // all uppercase
+ assertTrue(!spell.isCamelCase("camel")); // all lowercase
+ assertTrue(!spell.isCamelCase("cAmel"));
+ assertTrue(!spell.isCamelCase("CAmel"));
+ assertTrue(!spell.isCamelCase("")); // empty input
+ assertTrue(!spell.isCamelCase(null)); // null input must be accepted and reported as not camel case
+ }
+
+ @Test // isNotCapitalizedWord: of the inputs below only "Camel" is expected to count as capitalized
+ public void testCapitalizedWord() {
+ final Speller spell = new Speller(dictionary, 1);
+ assertTrue(spell.isNotCapitalizedWord("CamelCase"));
+ assertTrue(!spell.isNotCapitalizedWord("Camel")); // the single capitalized word in this list
+ assertTrue(spell.isNotCapitalizedWord("CAMEL"));
+ assertTrue(spell.isNotCapitalizedWord("camel"));
+ assertTrue(spell.isNotCapitalizedWord("cAmel"));
+ assertTrue(spell.isNotCapitalizedWord("CAmel"));
+ assertTrue(spell.isNotCapitalizedWord("")); // empty input
+ }
+
+ @Test // getAllReplacements(word, 0, 0) on test-utf-spell.dict is expected to return just the input word
+ public void testGetAllReplacements() throws IOException {
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(spell.isMisspelled("rzarzerzarzu")); // precondition: the word itself is not in the dictionary
+ assertEquals("[rzarzerzarzu]", // compared via the array's string form
+ Arrays.toString(spell.getAllReplacements("rzarzerzarzu", 0, 0).toArray()));
+ }
+
+ @Test // edit distances computed through getEditDistance(); assertEquals reports the actual distance on failure
+ public void testEditDistanceCalculation() throws IOException {
+ final Speller spell = new Speller(dictionary, 5);
+ //test examples from Oflazer's paper
+ assertEquals(1, getEditDistance(spell, "recoginze", "recognize"));
+ assertEquals(3, getEditDistance(spell, "sailn", "failing"));
+ assertEquals(1, getEditDistance(spell, "abc", "abcd"));
+ assertEquals(2, getEditDistance(spell, "abc", "abcde"));
+ //test words from fsa_spell output
+ assertEquals(1, getEditDistance(spell, "abka", "abaka"));
+ assertEquals(2, getEditDistance(spell, "abka", "abakan"));
+ assertEquals(2, getEditDistance(spell, "abka", "abaką"));
+ assertEquals(2, getEditDistance(spell, "abka", "abaki"));
+ }
+
+ @Test // cut-off edit distances computed through getCutOffDistance(); assertEquals reports the actual value on failure
+ public void testCutOffEditDistance() throws IOException {
+ final Speller spell2 = new Speller(dictionary, 2); //note: threshold = 2
+ //test cut edit distance - reprter / repo from Oflazer
+ assertEquals(1, getCutOffDistance(spell2, "repo", "reprter"));
+ assertEquals(0, getCutOffDistance(spell2, "reporter", "reporter"));
+ }
+
+ private int getCutOffDistance(final Speller spell, final String word, final String candidate) {
+ // Test helper: smallest Speller.cuted() value over the positions by which the candidate exceeds the word; assuming there is no pair-replacement
+ spell.setWordAndCandidate(word, candidate);
+ final int [] ced = new int[spell.getCandLen() - spell.getWordLen()]; // one slot per extra candidate character; a shorter candidate would throw NegativeArraySizeException here
+ for (int i = 0; i < spell.getCandLen() - spell.getWordLen(); i++) {
+
+ ced[i] = spell.cuted(spell.getWordLen() + i, spell.getWordLen() + i, spell.getWordLen() + i); // same value passed for all three arguments
+ }
+ Arrays.sort(ced); // ascending, so the minimum ends up at index 0
+ //and the min value...
+ if (ced.length > 0) {
+ return ced[0];
+ }
+ return 0; // candidate and word have equal length: nothing was evaluated
+ }
+
+ private int getEditDistance(final Speller spell, final String word, final String candidate) {
+ // Test helper: edit distance between word and candidate as reported by Speller.ed(); assuming there is no pair-replacement
+ spell.setWordAndCandidate(word, candidate);
+ final int maxDistance = spell.getEffectiveED(); // threshold used for both checks in the loop
+ final int candidateLen = spell.getCandLen();
+ final int wordLen = spell.getWordLen();
+ int ed = 0; // stays 0 if no candidate position passes both checks
+ for (int i = 0; i < candidateLen; i++) {
+ if (spell.cuted(i, i, i) <= maxDistance) { // skip positions whose cut-off distance already exceeds the threshold
+ if (Math.abs(wordLen - 1 - i) <= maxDistance) { // and positions too far from the last index of the word
+ ed = spell.ed(wordLen - 1, i, wordLen - 1, i); // overwritten each time: the last qualifying position wins
+ }
+ }
+ }
+ return ed;
+ }
+} \ No newline at end of file