path: root/morfologik-speller/src
author     Andrej Shadura <andrewsh@debian.org>  2018-12-26 20:22:36 +0100
committer  Andrej Shadura <andrewsh@debian.org>  2018-12-26 20:22:36 +0100
commit     991420c6b1ebc1f041818ffeaee65e5dcaa1581d (patch)
tree       f8537e1dc33fcdbaaebe0801a7598c2d7f5baea2 /morfologik-speller/src
parent     0abd94bb0cc3d7cc07da074acccffcfbc27b0352 (diff)
Import Upstream version 1.9.0+dfsg
Diffstat (limited to 'morfologik-speller/src')
-rw-r--r--  morfologik-speller/src/main/java/morfologik/speller/HMatrix.java                 100
-rw-r--r--  morfologik-speller/src/main/java/morfologik/speller/Speller.java                 920
-rw-r--r--  morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java              21
-rw-r--r--  morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java             272
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict     bin 0 -> 162 bytes
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info      15
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt       21
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/slownik.dict            bin 0 -> 130 bytes
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/slownik.info             14
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict         bin 0 -> 1859 bytes
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/test-infix.info          10
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict     bin 0 -> 168 bytes
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info      15
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict      bin 0 -> 129 bytes
-rw-r--r--  morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info       16
15 files changed, 1404 insertions, 0 deletions
diff --git a/morfologik-speller/src/main/java/morfologik/speller/HMatrix.java b/morfologik-speller/src/main/java/morfologik/speller/HMatrix.java
new file mode 100644
index 0000000..848f24e
--- /dev/null
+++ b/morfologik-speller/src/main/java/morfologik/speller/HMatrix.java
@@ -0,0 +1,100 @@
+package morfologik.speller;
+
+/**
+ * Keeps track of already computed values of edit distance.<br/>
+ * Remarks: To save space, the matrix is kept in a vector.
+ */
+public class HMatrix {
+ private int[] p; /* the vector */
+ private int rowLength; /* row length of matrix */
+ int columnHeight; /* column height of matrix */
+ int editDistance; /* edit distance */
+
+ /**
+ * Allocates memory and initializes matrix (constructor).
+ *
+ * @param distance (int) max edit distance allowed for
+ * candidates;
+ * @param maxLength (int) max length of words.
+ *
+ * Remarks: See Oflazer. To save space, the matrix is
+ * stored as a vector. To save time, additional rows and
+ * columns are added. They are initialized to their distance in
+ * the matrix, so that no bound checking is necessary during
+ * access.
+ */
+ public HMatrix(final int distance, final int maxLength) {
+ rowLength = maxLength + 2;
+ columnHeight = 2 * distance + 3;
+ editDistance = distance;
+ final int size = rowLength * columnHeight;
+ p = new int[size];
+ // Initialize edges of the diagonal band to distance + 1 (i.e.
+ // distance too big)
+ for (int i = 0; i < rowLength - distance - 1; i++) {
+ p[i] = distance + 1; // H(distance + j, j) = distance + 1
+ p[size - i - 1] = distance + 1; // H(i, distance + i) = distance
+ // + 1
+ }
+ // Initialize items H(i,j) with at least one index equal to zero to
+ // |i - j|
+ for (int j = 0; j < 2 * distance + 1; j++) {
+ p[j * rowLength] = distance + 1 - j; // H(i=0..distance+1,0)=i
+ // FIXME: for distance == 2 we exceed the array size here.
+ // there's a bug in spell.cc, Jan Daciuk has been notified about it.
+ p[Math.min(p.length - 1, (j + distance + 1) * rowLength + j)] = j; // H(0,j=0..distance+1)=j
+ }
+ }
+
+ /**
+ * Provide an item of hMatrix indexed by indices.
+ *
+ * @param i
+ * - (int) row number;
+ * @param j
+ * - (int) column number.
+ * @return Item H[i][j] <br/>
+ * Remarks: H matrix is really simulated. What is needed is only
+ * 2 * edit_distance + 1 wide band around the diagonal. In fact
+ * this diagonal has been pushed up to the upper border of the
+ * matrix.
+ *
+ * The matrix in the vector looks like this:
+ *
+ * <pre>
+ * +---------------------+
+ * 0 |#####################| j=i-e-1
+ * 1 | | j=i-e
+ * : :
+ * e+1 | | j=i-1
+ * +---------------------+
+ * e+2 | | j=i
+ * +---------------------+
+ * e+3 | | j=i+1
+ * : :
+ * 2e+2| | j=i+e
+ * 2e+3|#####################| j=i+e+1
+ * +---------------------+
+ * </pre>
+ */
+ public int get(final int i, final int j) {
+ return p[(j - i + editDistance + 1) * rowLength + j];
+ }
+
+ /**
+ * Set an item in hMatrix.
+ *
+ * @param i
+ * - (int) row number;
+ * @param j
+ * - (int) column number;
+ * @param val
+ * - (int) value to put there.
+ *
+ * No checking for i & j is done. They must be correct.
+ */
+ public void set(final int i, final int j, final int val) {
+ p[(j - i + editDistance + 1) * rowLength + j] = val;
+ }
+
+}
\ No newline at end of file
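For orientation, a minimal usage sketch of the band storage above (illustrative code, not part of the patch; HMatrixDemo is a made-up class, and the update step only mirrors the substitution case of Speller.ed() further down):

    import morfologik.speller.HMatrix;

    public class HMatrixDemo {
        public static void main(String[] args) {
            // A band of width 2*distance+1 around the diagonal, stored in a flat vector.
            HMatrix h = new HMatrix(1, 120);

            // Cells inside the band that were not pre-initialized start at 0
            // (this is what HMatrixTest.stressTestInit checks).
            System.out.println(h.get(1, 1));   // 0

            // The same recurrence Speller.ed() uses for a substitution step:
            int a = h.get(1, 1);               // replacement
            int b = h.get(2, 1);               // deletion
            int c = h.get(1, 2);               // insertion
            h.set(2, 2, 1 + Math.min(a, Math.min(b, c)));
            System.out.println(h.get(2, 2));   // 1
        }
    }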
diff --git a/morfologik-speller/src/main/java/morfologik/speller/Speller.java b/morfologik-speller/src/main/java/morfologik/speller/Speller.java
new file mode 100644
index 0000000..f4ee083
--- /dev/null
+++ b/morfologik-speller/src/main/java/morfologik/speller/Speller.java
@@ -0,0 +1,920 @@
+package morfologik.speller;
+
+import static morfologik.fsa.MatchResult.EXACT_MATCH;
+import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+import java.util.*;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSAFinalStatesIterator;
+import morfologik.fsa.FSATraversal;
+import morfologik.fsa.MatchResult;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryMetadata;
+import morfologik.util.BufferUtils;
+
+/**
+ * Finds spelling suggestions. Implements
+ * <a href="http://acl.ldc.upenn.edu/J/J96/J96-1003.pdf">K. Oflazer's algorithm</a>.
+ * See Jan Daciuk's <code>s_fsa</code> package.
+ */
+public class Speller {
+
+ /**
+ * Maximum length of the word to be checked.
+ */
+ public static final int MAX_WORD_LENGTH = 120;
+ static final int FREQ_RANGES = 'Z' - 'A' + 1;
+ static final int FIRST_RANGE_CODE = 'A'; // less frequent words
+
+ //FIXME: this is an upper limit for replacement searches, we need
+ //proper tree traversal instead of generation of all possible candidates
+ static final int UPPER_SEARCH_LIMIT = 15;
+ private static final int MIN_WORD_LENGTH = 4;
+ private static final int MAX_RECURSION_LEVEL = 6;
+
+ private final int editDistance;
+ private int effectEditDistance; // effective edit distance
+
+ private final HMatrix hMatrix;
+
+ private char[] candidate; /* current replacement */
+ private int candLen;
+ private int wordLen; /* length of word being processed */
+ private char[] wordProcessed; /* word being processed */
+
+ private Map<Character, List<char[]>> replacementsAnyToOne = new HashMap<Character, List<char[]>>();
+ private Map<String, List<char[]>> replacementsAnyToTwo = new HashMap<String, List<char[]>>();
+ private Map<String, List<String>> replacementsTheRest = new HashMap<String, List<String>>();
+
+ /**
+ * List of candidate strings, including some additional data such as
+ * edit distance from the original word.
+ */
+ private final List<CandidateData> candidates = new ArrayList<CandidateData>();
+
+ private boolean containsSeparators = true;
+
+ /**
+ * Internal reusable buffer for encoding words into byte arrays using
+ * {@link #encoder}.
+ */
+ private ByteBuffer byteBuffer = ByteBuffer.allocate(MAX_WORD_LENGTH);
+
+ /**
+ * Internal reusable buffer for encoding words into byte arrays using
+ * {@link #encoder}.
+ */
+ private CharBuffer charBuffer = CharBuffer.allocate(MAX_WORD_LENGTH);
+
+ /**
+ * Reusable match result.
+ */
+ private final MatchResult matchResult = new MatchResult();
+
+ /**
+ * Features of the compiled dictionary.
+ *
+ * @see DictionaryMetadata
+ */
+ private final DictionaryMetadata dictionaryMetadata;
+
+ /**
+ * Charset encoder for the FSA.
+ */
+ private final CharsetEncoder encoder;
+
+ /**
+ * Charset decoder for the FSA.
+ */
+ private final CharsetDecoder decoder;
+
+ /** An FSA used for lookups. */
+ private final FSATraversal matcher;
+
+ /** FSA's root node. */
+ private final int rootNode;
+
+ /**
+ * The FSA we are using.
+ */
+ private final FSA fsa;
+
+ /** An iterator for walking along the final states of {@link #fsa}. */
+ private final FSAFinalStatesIterator finalStatesIterator;
+
+ public Speller(final Dictionary dictionary) {
+ this(dictionary, 1);
+ }
+
+ public Speller(final Dictionary dictionary, final int editDistance) {
+ this.editDistance = editDistance;
+ hMatrix = new HMatrix(editDistance, MAX_WORD_LENGTH);
+
+ this.dictionaryMetadata = dictionary.metadata;
+ this.rootNode = dictionary.fsa.getRootNode();
+ this.fsa = dictionary.fsa;
+ this.matcher = new FSATraversal(fsa);
+ this.finalStatesIterator = new FSAFinalStatesIterator(fsa, rootNode);
+
+ if (rootNode == 0) {
+ throw new IllegalArgumentException(
+ "Dictionary must have at least the root node.");
+ }
+
+ if (dictionaryMetadata == null) {
+ throw new IllegalArgumentException(
+ "Dictionary metadata must not be null.");
+ }
+
+ encoder = dictionaryMetadata.getEncoder();
+ decoder = dictionaryMetadata.getDecoder();
+
+ // Multibyte separator will result in an exception here.
+ dictionaryMetadata.getSeparatorAsChar();
+
+ this.createReplacementsMaps();
+ }
+
+ private void createReplacementsMaps() {
+ for (Map.Entry<String, List<String>> entry : dictionaryMetadata
+ .getReplacementPairs().entrySet()) {
+ for (String s : entry.getValue()) {
+ // replacements any to one
+ // the new key is the target of the replacement pair
+ if (s.length() == 1) {
+ if (!replacementsAnyToOne.containsKey(s.charAt(0))) {
+ List<char[]> charList = new ArrayList<char[]>();
+ charList.add(entry.getKey().toCharArray());
+ replacementsAnyToOne.put(s.charAt(0), charList);
+ } else {
+ replacementsAnyToOne.get(s.charAt(0)).add(
+ entry.getKey().toCharArray());
+ }
+ }
+ // replacements any to two
+ // the new key is the target of the replacement pair
+ else if (s.length() == 2) {
+ if (!replacementsAnyToTwo.containsKey(s)) {
+ List<char[]> charList = new ArrayList<char[]>();
+ charList.add(entry.getKey().toCharArray());
+ replacementsAnyToTwo.put(s, charList);
+ } else {
+ replacementsAnyToTwo.get(s).add(entry.getKey().toCharArray());
+ }
+ } else {
+ if (!replacementsTheRest.containsKey(entry.getKey())) {
+ List<String> charList = new ArrayList<String>();
+ charList.add(s);
+ replacementsTheRest.put(entry.getKey(), charList);
+ } else {
+ replacementsTheRest.get(entry.getKey()).add(s);
+ }
+ }
+ }
+ }
+ }
+
+
+ /**
+ * Encode a character sequence into a byte buffer, optionally expanding
+ * buffer.
+ */
+ private ByteBuffer charsToBytes(final CharBuffer chars, ByteBuffer bytes) {
+ bytes.clear();
+ final int maxCapacity = (int) (chars.remaining() * encoder.maxBytesPerChar());
+ if (bytes.capacity() <= maxCapacity) {
+ bytes = ByteBuffer.allocate(maxCapacity);
+ }
+ chars.mark();
+ encoder.reset();
+ if (encoder.encode(chars, bytes, true).isError()) {
+ // in the case of encoding errors, clear the buffer
+ bytes.clear();
+ }
+ bytes.flip();
+ chars.reset();
+ return bytes;
+ }
+
+ private ByteBuffer charSequenceToBytes(final CharSequence word) {
+ // Encode word characters into bytes in the same encoding as the FSA's.
+ charBuffer.clear();
+ charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
+ for (int i = 0; i < word.length(); i++) {
+ final char chr = word.charAt(i);
+ charBuffer.put(chr);
+ }
+ charBuffer.flip();
+ byteBuffer = charsToBytes(charBuffer, byteBuffer);
+ return byteBuffer;
+ }
+
+ /**
+ * Checks whether the word is misspelled, by performing a series of checks according to
+ * properties of the dictionary.
+ *
+ * If the flag <code>fsa.dict.speller.ignore-punctuation</code> is set, then all non-alphabetic
+ * characters are considered to be correctly spelled.
+ *
+ * If the flag <code>fsa.dict.speller.ignore-numbers</code> is set, then all words containing decimal
+ * digits are considered to be correctly spelled.
+ *
+ * If the flag <code>fsa.dict.speller.ignore-camel-case</code> is set, then all CamelCase words are
+ * considered to be correctly spelled.
+ *
+ * If the flag <code>fsa.dict.speller.ignore-all-uppercase</code> is set, then all alphabetic words composed
+ * of only uppercase characters are considered to be correctly spelled.
+ *
+ * Otherwise, the word is checked in the dictionary. If the test fails, and the dictionary does not
+ * perform any case conversions (as set by <code>fsa.dict.speller.convert-case</code> flag), then the method
+ * returns false. In case of case conversions, it is checked whether a non-mixed case word is found in its
+ * lowercase version in the dictionary, and for all-uppercase words, whether the word is found in the dictionary
+ * with the initial uppercase letter.
+ *
+ * @param word - the word to be checked
+ * @return true if the word is misspelled
+ **/
+ public boolean isMisspelled(final String word) {
+ // dictionaries usually do not contain punctuation
+ String wordToCheck = word;
+ if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
+ wordToCheck = Dictionary.convertText(word,
+ dictionaryMetadata.getInputConversionPairs()).toString();
+ }
+ boolean isAlphabetic = wordToCheck.length() != 1 || isAlphabetic(wordToCheck.charAt(0));
+ return wordToCheck.length() > 0
+ && (!dictionaryMetadata.isIgnoringPunctuation() || isAlphabetic)
+ && (!dictionaryMetadata.isIgnoringNumbers() || containsNoDigit(wordToCheck))
+ && !(dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(wordToCheck))
+ && !(dictionaryMetadata.isIgnoringAllUppercase() && isAlphabetic && isAllUppercase(wordToCheck))
+ && !isInDictionary(wordToCheck)
+ && (!dictionaryMetadata.isConvertingCase() ||
+ !(!isMixedCase(wordToCheck) &&
+ (isInDictionary(wordToCheck.toLowerCase(dictionaryMetadata.getLocale()))
+ || isAllUppercase(wordToCheck) && isInDictionary(initialUppercase(wordToCheck)))));
+ }
+
+ private CharSequence initialUppercase(final String wordToCheck) {
+ return wordToCheck.substring(0, 1) +
+ wordToCheck.substring(1).
+ toLowerCase(dictionaryMetadata.getLocale());
+ }
+
+ /**
+ * Test whether the word is found in the dictionary.
+ * @param word the word to be tested
+ * @return True if it is found.
+ */
+ public boolean isInDictionary(final CharSequence word) {
+ byteBuffer = charSequenceToBytes(word);
+
+ // Try to find a partial match in the dictionary.
+ final MatchResult match = matcher.match(matchResult,
+ byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);
+
+ if (match.kind == EXACT_MATCH) {
+ containsSeparators = false;
+ return true;
+ }
+
+ return containsSeparators
+ && match.kind == SEQUENCE_IS_A_PREFIX
+ && byteBuffer.remaining() > 0
+ && fsa.getArc(match.node, dictionaryMetadata.getSeparator()) != 0;
+ }
+
+ /**
+ * Get the frequency value for a word form.
+ * It is taken from the first entry with this word form.
+ * @param word the word to be tested
+ * @return frequency value in range: 0..FREQ_RANGES-1 (0: less frequent).
+ */
+
+ public int getFrequency(final CharSequence word) {
+ if (!dictionaryMetadata.isFrequencyIncluded()) {
+ return 0;
+ }
+ final byte separator = dictionaryMetadata.getSeparator();
+ byteBuffer = charSequenceToBytes(word);
+ final MatchResult match = matcher.match(matchResult, byteBuffer.array(), 0,
+ byteBuffer.remaining(), rootNode);
+ if (match.kind == SEQUENCE_IS_A_PREFIX) {
+ final int arc = fsa.getArc(match.node, separator);
+ if (arc != 0 && !fsa.isArcFinal(arc)) {
+ finalStatesIterator.restartFrom(fsa.getEndNode(arc));
+ if (finalStatesIterator.hasNext()) {
+ final ByteBuffer bb = finalStatesIterator.next();
+ final byte[] ba = bb.array();
+ final int bbSize = bb.remaining();
+ //the last byte contains the frequency after a separator
+ return ba[bbSize - 1] - FIRST_RANGE_CODE;
+ }
+ }
+ }
+ return 0;
+ }
+
+ /**
+ * Propose suggestions for misspelled run-on words. This algorithm is inspired by
+ * spell.cc in s_fsa package by Jan Daciuk.
+ *
+ * @param original The original misspelled word.
+ * @return The list of suggested pairs, as space-concatenated strings.
+ */
+ public List<String> replaceRunOnWords(final String original) {
+ final List<String> candidates = new ArrayList<String>();
+ if (!isInDictionary(Dictionary.convertText(original,
+ dictionaryMetadata.getInputConversionPairs()).toString())
+ && dictionaryMetadata.isSupportingRunOnWords()) {
+ for (int i = 2; i < original.length(); i++) {
+ // chop from left to right
+ final CharSequence firstCh = original.subSequence(0, i);
+ if (isInDictionary(firstCh) &&
+ isInDictionary(original.subSequence(i, original.length()))) {
+ if (!dictionaryMetadata.getOutputConversionPairs().isEmpty()) {
+ candidates.add(firstCh + " " + original.subSequence(i, original.length()));
+ } else {
+ candidates.add(
+ Dictionary.convertText(firstCh + " " + original.subSequence(i, original.length()),
+ dictionaryMetadata.getOutputConversionPairs()).toString()
+ );
+ }
+ }
+ }
+ }
+ return candidates;
+ }
+
+ /**
+ * Find suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa
+ * package, spell.cc for further explanation.
+ *
+ * @param w
+ * The original misspelled word.
+ * @return A list of suggested replacements.
+ * @throws CharacterCodingException
+ */
+ public List<String> findReplacements(final String w)
+ throws CharacterCodingException {
+ String word = w;
+ if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
+ word = Dictionary.convertText(w,
+ dictionaryMetadata.getInputConversionPairs()).toString();
+ }
+ candidates.clear();
+ if (word.length() > 0 && word.length() < MAX_WORD_LENGTH && !isInDictionary(word)) {
+ List<String> wordsToCheck = new ArrayList<String>();
+ if (replacementsTheRest != null
+ && word.length() > MIN_WORD_LENGTH) {
+ for (final String wordChecked : getAllReplacements(word, 0, 0)) {
+ boolean found = false;
+ if (isInDictionary(wordChecked)) {
+ candidates.add(new CandidateData(wordChecked, 0));
+ found = true;
+ } else if (dictionaryMetadata.isConvertingCase()) {
+ String lowerWord = wordChecked.toLowerCase(dictionaryMetadata.getLocale());
+ String upperWord = wordChecked.toUpperCase(dictionaryMetadata.getLocale());
+ if (isInDictionary(lowerWord)) {
+ //add the word as it is in the dictionary, not mixed-case versions of it
+ candidates.add(new CandidateData(lowerWord, 0));
+ found = true;
+ }
+ if (isInDictionary(upperWord)) {
+ candidates.add(new CandidateData(upperWord, 0));
+ found = true;
+ }
+ if (lowerWord.length() > 1) {
+ String firstupperWord = Character.toUpperCase(lowerWord.charAt(0))
+ + lowerWord.substring(1);
+ if (isInDictionary(firstupperWord)) {
+ candidates.add(new CandidateData(firstupperWord, 0));
+ found = true;
+ }
+ }
+ }
+ if (!found) {
+ wordsToCheck.add(wordChecked);
+ }
+ }
+ } else {
+ wordsToCheck.add(word);
+ }
+
+ //If at least one candidate was found with the replacement pairs (which are usual errors),
+ //probably there is no need for more candidates
+ if (candidates.isEmpty()) {
+ int i = 1;
+ for (final String wordChecked : wordsToCheck) {
+ i++;
+ if (i > UPPER_SEARCH_LIMIT) { // for performance reasons, do not search too deeply
+ break;
+ }
+ wordProcessed = wordChecked.toCharArray();
+ wordLen = wordProcessed.length;
+ if (wordLen < MIN_WORD_LENGTH && i > 2) { // three-letter replacements make little sense anyway
+ break;
+ }
+ candidate = new char[MAX_WORD_LENGTH];
+ candLen = candidate.length;
+ effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance;
+ charBuffer = BufferUtils.ensureCapacity(charBuffer, MAX_WORD_LENGTH);
+ byteBuffer = BufferUtils.ensureCapacity(byteBuffer, MAX_WORD_LENGTH);
+ charBuffer.clear();
+ byteBuffer.clear();
+ final byte[] prevBytes = new byte[0];
+ findRepl(0, fsa.getRootNode(), prevBytes, 0, 0);
+ }
+ }
+ }
+
+ Collections.sort(candidates);
+
+ // Use a linked set to avoid duplicates and preserve the ordering of candidates.
+ final Set<String> candStringSet = new LinkedHashSet<String>();
+ for (final CandidateData cd : candidates) {
+ candStringSet.add(Dictionary.convertText(cd.getWord(),
+ dictionaryMetadata.getOutputConversionPairs()).toString());
+ }
+ final List<String> candStringList = new ArrayList<String>(candStringSet.size());
+ candStringList.addAll(candStringSet);
+ return candStringList;
+ }
+
+ private void findRepl(final int depth, final int node, final byte[] prevBytes,
+ final int wordIndex, final int candIndex) {
+ // char separatorChar = dictionaryMetadata.getSeparatorAsChar();
+ int dist = 0;
+ for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+ byteBuffer = BufferUtils.ensureCapacity(byteBuffer, prevBytes.length + 1);
+ byteBuffer.clear();
+ byteBuffer.put(prevBytes);
+ byteBuffer.put(fsa.getArcLabel(arc));
+ final int bufPos = byteBuffer.position();
+ byteBuffer.flip();
+ decoder.reset();
+ final CoderResult c = decoder.decode(byteBuffer, charBuffer, true);
+ if (c.isMalformed()) { // assume that only valid
+ // encodings are there
+ final byte[] prev = new byte[bufPos];
+ byteBuffer.position(0);
+ byteBuffer.get(prev);
+ if (!fsa.isArcTerminal(arc)) {
+ findRepl(depth, fsa.getEndNode(arc), prev, wordIndex, candIndex); // note: depth is not incremented
+ }
+ byteBuffer.clear();
+ } else if (!c.isError()) { // unmappable characters are silently discarded
+ charBuffer.flip();
+ candidate[candIndex] = charBuffer.get();
+ charBuffer.clear();
+ byteBuffer.clear();
+
+ int lengthReplacement;
+ // replacement "any to two"
+ if ((lengthReplacement = matchAnyToTwo(wordIndex, candIndex)) > 0) {
+ if (isEndOfCandidate(arc, wordIndex)) { //the replacement takes place at the end of the candidate
+ if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)) > 0) { // there is an extra letter in the word after the replacement
+ dist++;
+ }
+ addCandidate(candIndex, dist);
+
+ }
+ if (isArcNotTerminal(arc, candIndex)) {
+ int x = hMatrix.get(depth, depth);
+ hMatrix.set(depth, depth, hMatrix.get(depth - 1, depth - 1));
+ findRepl(Math.max(0, depth), fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement - 1, candIndex + 1);
+ hMatrix.set(depth, depth, x);
+
+ }
+ }
+ //replacement "any to one"
+ if ((lengthReplacement = matchAnyToOne(wordIndex, candIndex)) > 0) {
+ if (isEndOfCandidate(arc, wordIndex)) { //the replacement takes place at the end of the candidate
+ if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)) > 0) { // there is an extra letter in the word after the replacement
+ dist++;
+ }
+ addCandidate(candIndex, dist);
+ }
+ if (isArcNotTerminal(arc,candIndex)) {
+ findRepl(depth, fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement, candIndex + 1);
+ }
+ }
+ //general
+ if (cuted(depth, wordIndex, candIndex) <= effectEditDistance) {
+ if ((isEndOfCandidate(arc, wordIndex))
+ && (dist = ed(wordLen - 1 - (wordIndex - depth), depth, wordLen - 1, candIndex))
+ <= effectEditDistance) {
+ addCandidate(candIndex, dist);
+ }
+ if (isArcNotTerminal(arc,candIndex)) {
+ findRepl(depth + 1, fsa.getEndNode(arc), new byte[0], wordIndex + 1, candIndex + 1);
+ }
+ }
+
+ }
+ }
+ }
+
+ private boolean isArcNotTerminal(final int arc, final int candIndex) {
+ return !fsa.isArcTerminal(arc)
+ && !(containsSeparators && candidate[candIndex] == dictionaryMetadata.getSeparatorAsChar());
+ }
+
+ private boolean isEndOfCandidate(final int arc, final int wordIndex) {
+ return (fsa.isArcFinal(arc) || isBeforeSeparator(arc))
+ //candidate has proper length
+ && (Math.abs(wordLen - 1 - (wordIndex)) <= effectEditDistance);
+ }
+
+ private boolean isBeforeSeparator(final int arc) {
+ if (containsSeparators) {
+ final int arc1 = fsa.getArc(fsa.getEndNode(arc), dictionaryMetadata.getSeparator());
+ return arc1 != 0 && !fsa.isArcTerminal(arc1);
+ }
+ return false;
+ }
+
+ private void addCandidate(final int depth, final int dist) {
+ candidates.add(new CandidateData(String.valueOf(candidate, 0, depth + 1), dist));
+ }
+
+ /**
+ * Calculates edit distance.
+ *
+ * @param i length of the first word (here: misspelled) - 1;
+ * @param j length of the second word (here: candidate) - 1;
+ * @param wordIndex index of the character of the misspelled word being compared;
+ * @param candIndex index of the character of the candidate being compared.
+ * @return Edit distance between the two words. Remarks: See Oflazer.
+ */
+ public int ed(final int i, final int j,
+ final int wordIndex, final int candIndex) {
+ int result;
+ int a, b, c;
+
+ if (areEqual(wordProcessed[wordIndex], candidate[candIndex])) {
+ // last characters are the same
+ result = hMatrix.get(i, j);
+ } else if (wordIndex > 0 && candIndex > 0 && wordProcessed[wordIndex] == candidate[candIndex - 1]
+ && wordProcessed[wordIndex - 1] == candidate[candIndex]) {
+ // last two characters are transposed
+ a = hMatrix.get(i - 1, j - 1); // transposition, e.g. ababab, ababba
+ b = hMatrix.get(i + 1, j); // deletion, e.g. abab, aba
+ c = hMatrix.get(i, j + 1); // insertion e.g. aba, abab
+ result = 1 + min(a, b, c);
+ } else {
+ // otherwise
+ a = hMatrix.get(i, j); // replacement, e.g. ababa, ababb
+ b = hMatrix.get(i + 1, j); // deletion, e.g. ab, a
+ c = hMatrix.get(i, j + 1); // insertion e.g. a, ab
+ result = 1 + min(a, b, c);
+ }
+
+ hMatrix.set(i + 1, j + 1, result);
+ return result;
+ }
+
+ // by Jaume Ortola
+ private boolean areEqual(final char x, final char y) {
+ if (x == y) {
+ return true;
+ }
+ if (dictionaryMetadata.getEquivalentChars() != null &&
+ dictionaryMetadata.getEquivalentChars().containsKey(x)
+ && dictionaryMetadata.getEquivalentChars().get(x).contains(y)) {
+ return true;
+ }
+ if (dictionaryMetadata.isIgnoringDiacritics()) {
+ String xn = Normalizer.normalize(Character.toString(x), Form.NFD);
+ String yn = Normalizer.normalize(Character.toString(y), Form.NFD);
+ if (xn.charAt(0) == yn.charAt(0)) { // avoid case conversion, if possible
+ return true;
+ }
+ if (dictionaryMetadata.isConvertingCase()) {
+ //again case conversion only when needed -- we
+ // do not need String.lowercase because we only check
+ // single characters, so a cheaper method is enough
+ if (Character.isLetter(xn.charAt(0))){
+ boolean testNeeded = Character.isLowerCase(xn.charAt(0))
+ != Character.isLowerCase(yn.charAt(0));
+ if (testNeeded) {
+ return Character.toLowerCase(xn.charAt(0)) ==
+ Character.toLowerCase(yn.charAt(0));
+ }
+ }
+ }
+ return xn.charAt(0) == yn.charAt(0);
+ }
+ return false;
+ }
+
+ /**
+ * Calculates cut-off edit distance.
+ *
+ * @param depth current length of candidates;
+ * @param wordIndex current index in the misspelled word;
+ * @param candIndex current index in the candidate.
+ * @return Cut-off edit distance. Remarks: See Oflazer.
+ */
+
+ public int cuted(final int depth, final int wordIndex, final int candIndex) {
+ final int l = Math.max(0, depth - effectEditDistance); // min chars from word to consider - 1
+ final int u = Math.min(wordLen - 1 - (wordIndex - depth), depth + effectEditDistance); // max chars from word to
+ // consider - 1
+ int minEd = effectEditDistance + 1; // what is to be computed
+ int wi = wordIndex + l - depth;
+ int d;
+
+ for (int i = l; i <= u; i++, wi++) {
+ if ((d = ed(i, depth, wi, candIndex)) < minEd) {
+ minEd = d;
+ }
+ }
+ return minEd;
+ }
+
+ // Match the last letter of the candidate against one or more letters of the word.
+ private int matchAnyToOne(final int wordIndex, final int candIndex) {
+ if (replacementsAnyToOne.containsKey(candidate[candIndex])) {
+ for (final char[] rep : replacementsAnyToOne.get(candidate[candIndex])) {
+ int i = 0;
+ while (i < rep.length && (wordIndex + i) < wordLen
+ && rep[i] == wordProcessed[wordIndex + i]) {
+ i++;
+ }
+ if (i==rep.length) {
+ return i;
+ }
+ }
+ }
+ return 0;
+ }
+
+ private int matchAnyToTwo(final int wordIndex, final int candIndex) {
+ if (candIndex > 0 && candIndex < candidate.length
+ && wordIndex > 0) {
+ char[] twoChar = {candidate[candIndex - 1],candidate[candIndex]};
+ String sTwoChar= new String(twoChar);
+ if (replacementsAnyToTwo.containsKey(sTwoChar)) {
+ for (final char[] rep : replacementsAnyToTwo.get(sTwoChar)) {
+ if (rep.length == 2 && wordIndex < wordLen
+ && candidate[candIndex - 1] == wordProcessed[wordIndex - 1]
+ && candidate[candIndex] == wordProcessed[wordIndex]) {
+ return 0; //unnecessary replacements
+ }
+ int i = 0;
+ while (i < rep.length && (wordIndex - 1 + i) < wordLen
+ && rep[i] == wordProcessed[wordIndex - 1 + i] ) {
+ i++;
+ }
+ if (i==rep.length) {
+ return i;
+ }
+ }
+ }
+ }
+ return 0;
+ }
+
+
+ private static int min(final int a, final int b, final int c) {
+ return Math.min(a, Math.min(b, c));
+ }
+
+ /**
+ * Copy-paste of Character.isAlphabetic() (needed as we require only 1.6)
+ *
+ * @param codePoint The input character.
+ * @return True if the character is a Unicode alphabetic character.
+ */
+ static boolean isAlphabetic(final int codePoint) {
+ return ((1 << Character.UPPERCASE_LETTER
+ | 1 << Character.LOWERCASE_LETTER
+ | 1 << Character.TITLECASE_LETTER
+ | 1 << Character.MODIFIER_LETTER
+ | 1 << Character.OTHER_LETTER
+ | 1 << Character.LETTER_NUMBER) >> Character.getType(codePoint) & 1) != 0;
+ }
+
+ /**
+ * Checks whether a string contains no digits. Used for ignoring words with
+ * numbers.
+ * @param s Word to be checked.
+ * @return True if there are no digits inside the word.
+ */
+ static boolean containsNoDigit(final String s) {
+ for (int k = 0; k < s.length(); k++) {
+ if (Character.isDigit(s.charAt(k))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns true if <code>str</code> is made up of all-uppercase characters
+ * (ignoring characters for which no upper-/lowercase distinction exists).
+ */
+ boolean isAllUppercase(final String str) {
+ for(int i = 0; i < str.length(); i++) {
+ char c = str.charAt(i);
+ if(Character.isLetter(c) && Character.isLowerCase(c)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns true if <code>str</code> is made up of all-lowercase characters
+ * (ignoring characters for which no upper-/lowercase distinction exists).
+ */
+ boolean isNotAllLowercase(final String str) {
+ for(int i = 0; i < str.length(); i++) {
+ char c = str.charAt(i);
+ if(Character.isLetter(c) && !Character.isLowerCase(c)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * @param str input string
+ * @return true unless the word is non-empty, starts with an uppercase letter,
+ * and has all remaining letters lowercase (i.e. unless it is a capitalized word).
+ */
+ boolean isNotCapitalizedWord(final String str) {
+ if (isNotEmpty(str) && Character.isUpperCase(str.charAt(0))) {
+ for (int i = 1; i < str.length(); i++) {
+ char c = str.charAt(i);
+ if (Character.isLetter(c) && !Character.isLowerCase(c)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * Helper method to replace calls to "".equals().
+ *
+ * @param str
+ * String to check
+ * @return true if the string is neither null nor empty
+ */
+ static boolean isNotEmpty(final String str) {
+ return str != null && str.length() != 0;
+ }
+
+ /**
+ * @param str input string
+ * @return Returns true if str is MixedCase.
+ */
+ boolean isMixedCase(final String str) {
+ return !isAllUppercase(str)
+ && isNotCapitalizedWord(str)
+ && isNotAllLowercase(str);
+ }
+
+ /**
+ * @return Returns true if str is CamelCase.
+ */
+ public boolean isCamelCase(final String str) {
+ return isNotEmpty(str)
+ && !isAllUppercase(str)
+ && isNotCapitalizedWord(str)
+ && Character.isUpperCase(str.charAt(0))
+ && (!(str.length() > 1) || Character.isLowerCase(str.charAt(1)))
+ && isNotAllLowercase(str);
+ }
+
+
+ /**
+ * Used to determine whether the dictionary supports case conversions.
+ * @return boolean value that answers this question in a deep and meaningful way.
+ *
+ * @since 1.9
+ *
+ */
+ public boolean convertsCase() {
+ return dictionaryMetadata.isConvertingCase();
+ }
+
+ /**
+ * @param str The string to find the replacements for.
+ * @param fromIndex The index from which replacements are found.
+ * @param level The recursion level. The search stops if level is > MAX_RECURSION_LEVEL.
+ * @return A list of all possible replacements of the given string {@code str}
+ */
+ public List<String> getAllReplacements(final String str, final int fromIndex, final int level) {
+ List<String> replaced = new ArrayList<String>();
+ if (level > MAX_RECURSION_LEVEL) { // Stop searching at some point
+ replaced.add(str);
+ return replaced;
+ }
+ StringBuilder sb = new StringBuilder();
+ sb.append(str);
+ int index = MAX_WORD_LENGTH;
+ String key = "";
+ int keyLength = 0;
+ boolean found = false;
+ // find first possible replacement after fromIndex position
+ for (final String auxKey : replacementsTheRest.keySet()) {
+ int auxIndex = sb.indexOf(auxKey, fromIndex);
+ if (auxIndex > -1 && auxIndex < index &&
+ !(auxKey.length() < keyLength)) { //select the longest possible key
+ index = auxIndex;
+ key = auxKey;
+ keyLength = auxKey.length();
+ }
+ }
+ if (index < MAX_WORD_LENGTH) {
+ for (final String rep : replacementsTheRest.get(key)) {
+ // start a branch without replacement (only once per key)
+ if (!found) {
+ replaced.addAll(getAllReplacements(str, index + key.length(),
+ level + 1));
+ found = true;
+ }
+ // avoid unnecessary replacements (ex. don't replace L by L·L when L·L already present)
+ int ind = sb.indexOf(rep, fromIndex - rep.length() + 1);
+ if (rep.length() > key.length() && ind > -1
+ && (ind == index || ind == index - rep.length() + 1)) {
+ continue;
+ }
+ // start a branch with replacement
+ sb.replace(index, index + key.length(), rep);
+ replaced.addAll(getAllReplacements(sb.toString(), index + rep.length(),
+ level + 1));
+ sb.setLength(0);
+ sb.append(str);
+ }
+ }
+ if (!found) {
+ replaced.add(sb.toString());
+ }
+ return replaced;
+ }
+
+
+ /**
+ * Sets up the word and candidate. Used only to test the edit distance in
+ * JUnit tests.
+ *
+ * @param word the first word
+ * @param candidate the second word used for edit distance calculation
+ */
+ void setWordAndCandidate(final String word, final String candidate) {
+ wordProcessed = word.toCharArray();
+ wordLen = wordProcessed.length;
+ this.candidate = candidate.toCharArray();
+ candLen = this.candidate.length;
+ effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance;
+ }
+
+ public final int getWordLen() {
+ return wordLen;
+ }
+
+ public final int getCandLen() {
+ return candLen;
+ }
+
+ public final int getEffectiveED() {
+ return effectEditDistance;
+ }
+
+ /**
+ * Used to sort candidates according to edit distance and, when the dictionary
+ * includes frequency data, according to their frequency.
+ *
+ */
+ private class CandidateData implements Comparable<CandidateData> {
+ private final String word;
+ private final int distance;
+
+ CandidateData(final String word, final int distance) {
+ this.word = word;
+ this.distance = distance * FREQ_RANGES + FREQ_RANGES - getFrequency(word) - 1;
+ }
+
+ final String getWord() {
+ return word;
+ }
+
+ final int getDistance() {
+ return distance;
+ }
+
+ @Override
+ public int compareTo(final CandidateData cd) {
+ // Assume no overflow.
+ return cd.getDistance() > this.distance ? -1 :
+ cd.getDistance() == this.distance ? 0 : 1;
+ }
+ }
+}
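For orientation, a minimal sketch of how the public API of Speller above can be driven (illustrative only; SpellerDemo and the classpath resource path are assumptions, the dictionary and words come from the tests below):

    import java.net.URL;
    import java.util.List;

    import morfologik.speller.Speller;
    import morfologik.stemming.Dictionary;

    public class SpellerDemo {
        public static void main(String[] args) throws Exception {
            // Assumed classpath location; any *.dict with matching *.info metadata works.
            URL url = SpellerDemo.class.getResource("/morfologik/speller/slownik.dict");
            Dictionary dictionary = Dictionary.read(url);

            // Edit distance 1 is the single-argument default; larger values are more permissive.
            Speller speller = new Speller(dictionary, 1);

            if (speller.isMisspelled("abka")) {
                // Candidates are ordered by edit distance and, when the dictionary
                // carries frequency data, by frequency within the same distance.
                List<String> suggestions = speller.findReplacements("abka");
                System.out.println(suggestions);
            }

            // For dictionaries whose metadata declares run-on word support,
            // a run-on word can be split into two dictionary words.
            System.out.println(speller.replaceRunOnWords("abakaabace"));
        }
    }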
diff --git a/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java b/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java
new file mode 100644
index 0000000..38aa76d
--- /dev/null
+++ b/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java
@@ -0,0 +1,21 @@
+package morfologik.speller;
+
+import static org.junit.Assert.*;
+
+import morfologik.speller.HMatrix;
+
+import org.junit.Test;
+
+public class HMatrixTest {
+
+ private static final int MAX_WORD_LENGTH = 120;
+
+ @Test
+ public void stressTestInit() {
+ for (int i = 0; i < 10; i++) { // test if we don't get beyond array limits etc.
+ HMatrix H = new HMatrix(i, MAX_WORD_LENGTH);
+ assertEquals(0, H.get(1, 1));
+ }
+ }
+
+}
diff --git a/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java b/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java
new file mode 100644
index 0000000..48ed2c1
--- /dev/null
+++ b/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java
@@ -0,0 +1,272 @@
+package morfologik.speller;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+
+import org.fest.assertions.api.Assertions;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class SpellerTest {
+ private static Dictionary dictionary;
+
+ @BeforeClass
+ public static void setup() throws Exception {
+ final URL url = SpellerTest.class.getResource("slownik.dict");
+ dictionary = Dictionary.read(url);
+ }
+
+ /*
+ @Test
+ public void testAbka() throws Exception {
+ final Speller spell = new Speller(dictionary, 2);
+ System.out.println("Replacements:");
+ for (String s : spell.findReplacements("abka")) {
+ System.out.println(s);
+ }
+ }
+ */
+
+ @Test
+ public void testRunonWords() throws IOException {
+ final Speller spell = new Speller(dictionary);
+ Assertions.assertThat(spell.replaceRunOnWords("abaka")).isEmpty();
+ Assertions.assertThat(spell.replaceRunOnWords("abakaabace")).contains("abaka abace");
+
+ // Test on a morphological dictionary - should work as well
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ assertTrue(spell1.replaceRunOnWords("Rzekunia").isEmpty());
+ assertTrue(spell1.replaceRunOnWords("RzekuniaRzeczypospolitej").contains("Rzekunia Rzeczypospolitej"));
+ assertTrue(spell1.replaceRunOnWords("RzekuniaRze").isEmpty()); //Rze is not found but is a prefix
+ }
+
+ @Test
+ public void testIsInDictionary() throws IOException {
+ // Test on a morphological dictionary, including separators
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ assertTrue(spell1.isInDictionary("Rzekunia"));
+ assertTrue(!spell1.isInDictionary("Rzekunia+"));
+ assertTrue(!spell1.isInDictionary("Rzekunia+aaa"));
+ // test UTF-8 dictionary
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(spell.isInDictionary("jaźń"));
+ assertTrue(spell.isInDictionary("zażółć"));
+ assertTrue(spell.isInDictionary("żółwiową"));
+ assertTrue(spell.isInDictionary("ćwikła"));
+ assertTrue(spell.isInDictionary("Żebrowski"));
+ assertTrue(spell.isInDictionary("Święto"));
+ assertTrue(spell.isInDictionary("Świerczewski"));
+ assertTrue(spell.isInDictionary("abc"));
+ }
+
+ @Test
+ public void testFindReplacements() throws IOException {
+ final Speller spell = new Speller(dictionary, 1);
+ assertTrue(spell.findReplacements("abka").contains("abak"));
+ //check if we get only dictionary words...
+ List<String> reps = spell.findReplacements("bak");
+ for (final String word: reps) {
+ assertTrue(spell.isInDictionary(word));
+ }
+ assertTrue(spell.findReplacements("abka~~").isEmpty()); // 2 characters more -> edit distance too large
+ assertTrue(!spell.findReplacements("Rezkunia").contains("Rzekunia"));
+
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ assertTrue(spell1.findReplacements("Rezkunia").contains("Rzekunia"));
+ //diacritics
+ assertTrue(spell1.findReplacements("Rzękunia").contains("Rzekunia"));
+ //we should get no candidates for correct words
+ assertTrue(spell1.isInDictionary("Rzekunia"));
+ assertTrue(spell1.findReplacements("Rzekunia").isEmpty());
+ //and no for things that are too different from the dictionary
+ assertTrue(spell1.findReplacements("Strefakibica").isEmpty());
+ //nothing for nothing
+ assertTrue(spell1.findReplacements("").isEmpty());
+ //nothing for weird characters
+ assertTrue(spell1.findReplacements("\u0000").isEmpty());
+ //nothing for other characters
+ assertTrue(spell1.findReplacements("«…»").isEmpty());
+ //nothing for separator
+ assertTrue(spell1.findReplacements("+").isEmpty());
+
+ }
+
+ @Test
+ public void testFrequencyNonUTFDictionary() throws IOException {
+ final URL url1 = getClass().getResource("test_freq_iso.dict");
+ final Speller spell = new Speller(Dictionary.read(url1));
+ assertTrue(spell.isInDictionary("a"));
+ assertTrue(!spell.isInDictionary("aõh")); //non-encodable in UTF-8
+ }
+
+ @Test
+ public void testFindReplacementsInUTF() throws IOException {
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(spell.findReplacements("gęslą").contains("gęślą"));
+ assertTrue(spell.findReplacements("ćwikla").contains("ćwikła"));
+ assertTrue(spell.findReplacements("Swierczewski").contains("Świerczewski"));
+ assertTrue(spell.findReplacements("zółwiową").contains("żółwiową"));
+ assertTrue(spell.findReplacements("Żebrowsk").contains("Żebrowski"));
+ assertTrue(spell.findReplacements("święto").contains("Święto"));
+ //note: no diacritics here, but we still get matches!
+ assertTrue(spell.findReplacements("gesla").contains("gęślą"));
+ assertTrue(spell.findReplacements("swieto").contains("Święto"));
+ assertTrue(spell.findReplacements("zolwiowa").contains("żółwiową"));
+ //using equivalent characters 'x' = 'ź'
+ assertTrue(spell.findReplacements("jexn").contains("jaźń"));
+ // 'u' = 'ó', so the edit distance is still small...
+ assertTrue(spell.findReplacements("zażulv").contains("zażółć"));
+ // 'rz' = 'ż', so the edit distance is still small, but with string replacements...
+ assertTrue(spell.findReplacements("zarzulv").contains("zażółć"));
+ assertTrue(spell.findReplacements("Rzebrowski").contains("Żebrowski"));
+ assertTrue(spell.findReplacements("rzółw").contains("żółw"));
+ assertTrue(spell.findReplacements("Świento").contains("Święto"));
+ // avoid mixed-case words as suggestions when using replacements ('rz' = 'ż')
+ assertTrue(spell.findReplacements("zArzółć").get(0).equals("zażółć"));
+ }
+
+ @Test
+ public void testFindReplacementsUsingFrequency() throws IOException {
+ final URL url = getClass().getResource("dict-with-freq.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+
+ //check if we get only dictionary words...
+ List<String> reps = spell.findReplacements("jist");
+ for (final String word: reps) {
+ assertTrue(spell.isInDictionary(word));
+ }
+ // get replacements ordered by frequency
+ assertTrue(reps.get(0).equals("just"));
+ assertTrue(reps.get(1).equals("list"));
+ assertTrue(reps.get(2).equals("fist"));
+ assertTrue(reps.get(3).equals("mist"));
+ assertTrue(reps.get(4).equals("jest"));
+ assertTrue(reps.get(5).equals("dist"));
+ assertTrue(reps.get(6).equals("gist"));
+ }
+
+ @Test
+ public void testIsMisspelled() throws IOException {
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(!spell.isMisspelled("Paragraf22")); //ignorujemy liczby
+ assertTrue(!spell.isMisspelled("!")); //ignorujemy znaki przestankowe
+ assertTrue(spell.isMisspelled("dziekie")); //test, czy znajdujemy błąd
+ assertTrue(!spell.isMisspelled("SłowozGarbem")); //ignorujemy słowa w stylu wielbłąda
+ assertTrue(!spell.isMisspelled("Ćwikła")); //i małe litery
+ assertTrue(!spell.isMisspelled("TOJESTTEST")); //i wielkie litery
+ final Speller oldStyleSpell = new Speller(dictionary, 1);
+ assertTrue(oldStyleSpell.isMisspelled("Paragraf22")); // nie ignorujemy liczby
+ assertTrue(oldStyleSpell.isMisspelled("!")); //nie ignorujemy znaków przestankowych
+ // assertTrue(oldStyleSpell.isMisspelled("SłowozGarbem")); //ignorujemy słowa w stylu wielbłąda
+ assertTrue(oldStyleSpell.isMisspelled("Abaka")); //i małe litery
+ final URL url1 = getClass().getResource("test-infix.dict");
+ final Speller spell1 = new Speller(Dictionary.read(url1));
+ assertTrue(!spell1.isMisspelled("Rzekunia"));
+ assertTrue(spell1.isAllUppercase("RZEKUNIA"));
+ assertTrue(spell1.isMisspelled("RZEKUNIAA")); // finds a typo here
+ assertTrue(!spell1.isMisspelled("RZEKUNIA")); // but not here
+ }
+
+ @Test
+ public void testCamelCase() {
+ final Speller spell = new Speller(dictionary, 1);
+ assertTrue(spell.isCamelCase("CamelCase"));
+ assertTrue(!spell.isCamelCase("Camel"));
+ assertTrue(!spell.isCamelCase("CAMEL"));
+ assertTrue(!spell.isCamelCase("camel"));
+ assertTrue(!spell.isCamelCase("cAmel"));
+ assertTrue(!spell.isCamelCase("CAmel"));
+ assertTrue(!spell.isCamelCase(""));
+ assertTrue(!spell.isCamelCase(null));
+ }
+
+ @Test
+ public void testCapitalizedWord() {
+ final Speller spell = new Speller(dictionary, 1);
+ assertTrue(spell.isNotCapitalizedWord("CamelCase"));
+ assertTrue(!spell.isNotCapitalizedWord("Camel"));
+ assertTrue(spell.isNotCapitalizedWord("CAMEL"));
+ assertTrue(spell.isNotCapitalizedWord("camel"));
+ assertTrue(spell.isNotCapitalizedWord("cAmel"));
+ assertTrue(spell.isNotCapitalizedWord("CAmel"));
+ assertTrue(spell.isNotCapitalizedWord(""));
+ }
+
+ @Test
+ public void testGetAllReplacements() throws IOException {
+ final URL url = getClass().getResource("test-utf-spell.dict");
+ final Speller spell = new Speller(Dictionary.read(url));
+ assertTrue(spell.isMisspelled("rzarzerzarzu"));
+ assertEquals("[rzarzerzarzu]",
+ Arrays.toString(spell.getAllReplacements("rzarzerzarzu", 0, 0).toArray()));
+ }
+
+ @Test
+ public void testEditDistanceCalculation() throws IOException {
+ final Speller spell = new Speller(dictionary, 5);
+ //test examples from Oflazer's paper
+ assertTrue(getEditDistance(spell, "recoginze", "recognize") == 1);
+ assertTrue(getEditDistance(spell, "sailn", "failing") == 3);
+ assertTrue(getEditDistance(spell, "abc", "abcd") == 1);
+ assertTrue(getEditDistance(spell, "abc", "abcde") == 2);
+ //test words from fsa_spell output
+ assertTrue(getEditDistance(spell, "abka", "abaka") == 1);
+ assertTrue(getEditDistance(spell, "abka", "abakan") == 2);
+ assertTrue(getEditDistance(spell, "abka", "abaką") == 2);
+ assertTrue(getEditDistance(spell, "abka", "abaki") == 2);
+ }
+
+ @Test
+ public void testCutOffEditDistance() throws IOException {
+ final Speller spell2 = new Speller(dictionary, 2); //note: threshold = 2
+ //test cut edit distance - reprter / repo from Oflazer
+ assertTrue(getCutOffDistance(spell2, "repo", "reprter") == 1);
+ assertTrue(getCutOffDistance(spell2, "reporter", "reporter") == 0);
+ }
+
+ private int getCutOffDistance(final Speller spell, final String word, final String candidate) {
+ // assuming there is no pair-replacement
+ spell.setWordAndCandidate(word, candidate);
+ final int [] ced = new int[spell.getCandLen() - spell.getWordLen()];
+ for (int i = 0; i < spell.getCandLen() - spell.getWordLen(); i++) {
+
+ ced[i] = spell.cuted(spell.getWordLen() + i, spell.getWordLen() + i, spell.getWordLen() + i);
+ }
+ Arrays.sort(ced);
+ //and the min value...
+ if (ced.length > 0) {
+ return ced[0];
+ }
+ return 0;
+ }
+
+ private int getEditDistance(final Speller spell, final String word, final String candidate) {
+ // assuming there is no pair-replacement
+ spell.setWordAndCandidate(word, candidate);
+ final int maxDistance = spell.getEffectiveED();
+ final int candidateLen = spell.getCandLen();
+ final int wordLen = spell.getWordLen();
+ int ed = 0;
+ for (int i = 0; i < candidateLen; i++) {
+ if (spell.cuted(i, i, i) <= maxDistance) {
+ if (Math.abs(wordLen - 1 - i) <= maxDistance) {
+ ed = spell.ed(wordLen - 1, i, wordLen - 1, i);
+ }
+ }
+ }
+ return ed;
+ }
+}
\ No newline at end of file
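The expected values in testEditDistanceCalculation can be cross-checked against a plain optimal-string-alignment (Damerau-Levenshtein) distance; this sketch is an independent reference, not the banded computation Speller.ed() performs:

    public class OsaDistance {
        // Optimal string alignment: substitutions, insertions, deletions
        // and adjacent transpositions all cost 1.
        static int distance(String a, String b) {
            int[][] d = new int[a.length() + 1][b.length() + 1];
            for (int i = 0; i <= a.length(); i++) d[i][0] = i;
            for (int j = 0; j <= b.length(); j++) d[0][j] = j;
            for (int i = 1; i <= a.length(); i++) {
                for (int j = 1; j <= b.length(); j++) {
                    int cost = a.charAt(i - 1) == b.charAt(j - 1) ? 0 : 1;
                    d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1),
                                       d[i - 1][j - 1] + cost);
                    if (i > 1 && j > 1 && a.charAt(i - 1) == b.charAt(j - 2)
                            && a.charAt(i - 2) == b.charAt(j - 1)) {
                        d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + 1); // transposition
                    }
                }
            }
            return d[a.length()][b.length()];
        }

        public static void main(String[] args) {
            System.out.println(distance("recoginze", "recognize")); // 1
            System.out.println(distance("sailn", "failing"));       // 3
            System.out.println(distance("abka", "abaka"));          // 1
        }
    }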
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict
new file mode 100644
index 0000000..609a267
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict
Binary files differ
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info
new file mode 100644
index 0000000..1203602
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info
@@ -0,0 +1,15 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
+
+fsa.dict.frequency-included=true
+
+fsa.dict.speller.locale=en_US
+fsa.dict.speller.ignore-diacritics=true
+#fsa.dict.speller.replacement-pairs=ninties 1990s, teached taught, rised rose, a ei, ei a, a ey, ey a, ai ie, ie ai, are air, are ear, are eir, air are, air ere, ere air, ere ear, ere eir, ear are, ear air, ear ere, eir are, eir ere, ch te, te ch, ch ti, ti ch, ch tu, tu ch, ch s, s ch, ch k, k ch, f ph, ph f, gh f, f gh, i igh, igh i, i uy, uy i, i ee, ee i, j di, di j, j gg, gg j, j ge, ge j, s ti, ti s, s ci, ci s, k cc, cc k, k qu, qu k, kw qu, o eau, eau o, o ew, ew o, oo ew, ew oo, ew ui, ui ew, oo ui, ui oo, ew u, u ew, oo u, u oo, u oe, oe u, u ieu, ieu u, ue ew, ew ue, uff ough, oo ieu, ieu oo, ier ear, ear ier, ear air, air ear, w qu, qu w, z ss, ss z, shun tion, shun sion, shun cion
\ No newline at end of file
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt
new file mode 100644
index 0000000..a12876c
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt
@@ -0,0 +1,21 @@
+ageist+C
+deist+G
+didst+A
+digest+J
+direst+E
+dist+G
+divest+I
+fist+J
+gist+G
+grist+I
+heist+I
+hist+A
+jest+H
+jilt+D
+joist+F
+just+P
+licit+F
+list+O
+mist+J
+weest+A
+wist+C
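A minimal sketch of what the frequency codes above mean to the speller (illustrative; FrequencyDemo and the resource path are assumptions, while the arithmetic follows getFrequency(), which subtracts FIRST_RANGE_CODE = 'A' from the byte after the separator):

    import java.net.URL;

    import morfologik.speller.Speller;
    import morfologik.stemming.Dictionary;

    public class FrequencyDemo {
        public static void main(String[] args) throws Exception {
            // Assumed classpath location, matching the test resources above.
            URL url = FrequencyDemo.class.getResource("/morfologik/speller/dict-with-freq.dict");
            Speller speller = new Speller(Dictionary.read(url));

            // The letter after the separator is the frequency range:
            // 'A' = 0 (least frequent) .. 'Z' = 25.
            System.out.println(speller.getFrequency("just")); // "just+P" -> 'P' - 'A' = 15
            System.out.println(speller.getFrequency("dist")); // "dist+G" -> 'G' - 'A' = 6
        }
    }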
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/slownik.dict b/morfologik-speller/src/test/resources/morfologik/speller/slownik.dict
new file mode 100644
index 0000000..b650702
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/slownik.dict
Binary files differ
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/slownik.info b/morfologik-speller/src/test/resources/morfologik/speller/slownik.info
new file mode 100644
index 0000000..25aef99
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/slownik.info
@@ -0,0 +1,14 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=Cp1250
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
+
+fsa.dict.speller.ignore-diacritics=false
+fsa.dict.speller.ignore-numbers=false
+fsa.dict.speller.convert-case=false
+fsa.dict.speller.ignore-punctuation=false
\ No newline at end of file
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict
new file mode 100644
index 0000000..cc91f70
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict
Binary files differ
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-infix.info b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.info
new file mode 100644
index 0000000..9ba1066
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.info
@@ -0,0 +1,10 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-prefixes=true
+fsa.dict.uses-infixes=true
+fsa.dict.speller.ignore-all-uppercase=false
\ No newline at end of file
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict
new file mode 100644
index 0000000..63ff635
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict
Binary files differ
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info
new file mode 100644
index 0000000..b13d12f
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info
@@ -0,0 +1,15 @@
+#
+# Dictionary properties.
+# UTF-8 encoding or native2ascii has to be used for non-ASCII data.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=utf-8
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
+
+fsa.dict.speller.locale=pl_PL
+fsa.dict.speller.ignore-diacritics=true
+fsa.dict.speller.equivalent-chars=x ź, l ł, u ó, ó u
+fsa.dict.speller.replacement-pairs=rz ż, ż rz, ch h, h ch, ę en, en ę
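A minimal sketch of how the equivalent-chars and replacement-pairs entries above influence suggestions (illustrative; the class name and resource path are assumptions, and the expected suggestions are the ones asserted in SpellerTest):

    import java.net.URL;

    import morfologik.speller.Speller;
    import morfologik.stemming.Dictionary;

    public class ReplacementPairsDemo {
        public static void main(String[] args) throws Exception {
            // Assumed classpath location, matching the test resources above.
            URL url = ReplacementPairsDemo.class.getResource("/morfologik/speller/test-utf-spell.dict");
            Speller speller = new Speller(Dictionary.read(url));

            // equivalent-chars: 'x' is treated as equal to 'ź', so "jexn" stays close to "jaźń".
            System.out.println(speller.findReplacements("jexn"));  // contains "jaźń"

            // replacement-pairs: "rz" may be rewritten to "ż" before the edit-distance search.
            System.out.println(speller.findReplacements("rzółw")); // contains "żółw"
        }
    }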
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict
new file mode 100644
index 0000000..69c5a99
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict
Binary files differ
diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info
new file mode 100644
index 0000000..353deac
--- /dev/null
+++ b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info
@@ -0,0 +1,16 @@
+#
+# Dictionary properties.
+#
+
+fsa.dict.separator=+
+fsa.dict.encoding=iso-8859-2
+
+fsa.dict.uses-prefixes=false
+fsa.dict.uses-infixes=false
+
+fsa.dict.frequency-included=true
+
+fsa.dict.speller.locale=pl_PL
+fsa.dict.speller.ignore-diacritics=true
+fsa.dict.speller.equivalent-chars=x ź, l ł, u ó, ó u
+fsa.dict.speller.replacement-pairs=ź zi, ł eu, ć ci, ć dż, ć dź, ć dz, c dz, ch h, ci ć, cz czy, dź ć, dź dzi, dż ć, dz ć, dzi dź, edzil ędził, ę em, ę en, ei eja, eja ei, em ę, en ę, eu ł, h ch, he chę, śi ś, ii ija, ija ii, iosc ość, ise się, loz łos, ni ń, ńi ń, ń ni, ą oł, oł ą, oi oja, oja oi, ą om, om ą, ą on, on ą, ru kró, ż rz, rz ż, rz sz, scia ścią, ś si, si ś, sić ść, s sną, sz ż, sz rz, tro rot, u y, wu wy, yi yja, yja yi, zal rzał, zekac rzekać, zi ź, zl azł, z żn, z rz, chłopcowi chłopcu, bratowi bratu, aleji alei, lubieć lubić, nei nie, źmie zmie, piatek piątek, pokuj pokój, poszłem poszedłem, prosze proszę, rząda żąda, sa są, sei się, standart standard, trzcionk czcionk, szłem szedłem, pry przy
\ No newline at end of file