summaryrefslogtreecommitdiff
path: root/morfologik-stemming/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-stemming/src/main')
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java111
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java233
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java333
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java143
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java403
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java298
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java139
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java11
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java20
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/WordData.java254
10 files changed, 1945 insertions, 0 deletions
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java b/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java
new file mode 100644
index 0000000..4c3f877
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java
@@ -0,0 +1,111 @@
+package morfologik.stemming;
+
+import java.util.*;
+
+/**
+ * A view over a range of an array.
+ */
+@SuppressWarnings("serial")
+final class ArrayViewList<E> extends AbstractList<E>
+ implements RandomAccess, java.io.Serializable
+{
+ /** Backing array. */
+ private E[] a;
+ private int start;
+ private int length;
+
+ /*
+ *
+ */
+ ArrayViewList(E[] array, int start, int length) {
+ if (array == null)
+ throw new IllegalArgumentException();
+ wrap(array, start, length);
+ }
+
+ /*
+ *
+ */
+ public int size() {
+ return length;
+ }
+
+ /*
+ *
+ */
+ public E get(int index) {
+ return a[start + index];
+ }
+
+ /*
+ *
+ */
+ public E set(int index, E element) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public void add(int index, E element) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public E remove(int index) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public boolean addAll(int index, Collection<? extends E> c) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public int indexOf(Object o) {
+ if (o == null) {
+ for (int i = start; i < start + length; i++)
+ if (a[i] == null)
+ return i - start;
+ } else {
+ for (int i = start; i < start + length; i++)
+ if (o.equals(a[i]))
+ return i - start;
+ }
+ return -1;
+ }
+
+ public ListIterator<E> listIterator() {
+ return listIterator(0);
+ }
+
+ /*
+ *
+ */
+ public ListIterator<E> listIterator(final int index) {
+ return Arrays.asList(a).subList(start, start + length).listIterator(
+ index);
+ }
+
+ /*
+ *
+ */
+ public boolean contains(Object o) {
+ return indexOf(o) != -1;
+ }
+
+ /*
+ *
+ */
+ void wrap(E[] array, int start, int length) {
+ this.a = array;
+ this.start = start;
+ this.length = length;
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java b/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java
new file mode 100644
index 0000000..d72c85c
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java
@@ -0,0 +1,233 @@
+package morfologik.stemming;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+import java.util.WeakHashMap;
+
+import morfologik.fsa.FSA;
+import morfologik.util.FileUtils;
+import morfologik.util.ResourceUtils;
+
+/**
+ * A dictionary combines {@link FSA} automaton and metadata describing the
+ * internals of dictionary entries' coding ({@link DictionaryMetadata}).
+ *
+ * <p>
+ * A dictionary consists of two files:
+ * <ul>
+ * <li>an actual compressed FSA file,
+ * <li>a metadata file, describing the dictionary.
+ * </ul>
+ * Use static methods in this class to read dictionaries and their metadata.
+ */
+public final class Dictionary {
+ /**
+ * Expected metadata file extension.
+ */
+ public final static String METADATA_FILE_EXTENSION = "info";
+
+ /**
+ * {@link FSA} automaton with the compiled dictionary data.
+ */
+ public final FSA fsa;
+
+ /**
+ * Metadata associated with the dictionary.
+ */
+ public final DictionaryMetadata metadata;
+
+ /**
+ * Default loaded dictionaries.
+ */
+ public static final WeakHashMap<String, Dictionary> defaultDictionaries = new WeakHashMap<String, Dictionary>();
+
+ /**
+ * It is strongly recommended to use static methods in this class for
+ * reading dictionaries.
+ *
+ * @param fsa
+ * An instantiated {@link FSA} instance.
+ *
+ * @param metadata
+ * A map of attributes describing the compression format and
+ * other settings not contained in the FSA automaton. For an
+ * explanation of available attributes and their possible values,
+ * see {@link DictionaryMetadata}.
+ */
+ public Dictionary(FSA fsa, DictionaryMetadata metadata) {
+ this.fsa = fsa;
+ this.metadata = metadata;
+ }
+
+ /**
+ * Attempts to load a dictionary using the path to the FSA file and the
+ * expected metadata extension.
+ */
+ public static Dictionary read(File fsaFile) throws IOException {
+ final File featuresFile = new File(fsaFile.getParent(),
+ getExpectedFeaturesName(fsaFile.getName()));
+
+ FileUtils.assertExists(featuresFile, true, false);
+
+ return readAndClose(
+ new FileInputStream(fsaFile),
+ new FileInputStream(featuresFile));
+ }
+
+ /**
+ * <p>
+ * Attempts to load a dictionary using the URL to the FSA file and the
+ * expected metadata extension.
+ *
+ * <p>
+ * This method can be used to load resource-based dictionaries, but be aware
+ * of JAR resource-locking issues that arise from resource URLs.
+ */
+ public static Dictionary read(URL fsaURL) throws IOException {
+ final String fsa = fsaURL.toExternalForm();
+ final String features = getExpectedFeaturesName(fsa);
+
+ return readAndClose(
+ ResourceUtils.openInputStream(fsa),
+ ResourceUtils.openInputStream(features));
+ }
+
+ /**
+ * Attempts to load a dictionary from opened streams of FSA dictionary data
+ * and associated metadata.
+ */
+ public static Dictionary readAndClose(InputStream fsaData, InputStream featuresData)
+ throws IOException
+ {
+ try {
+ Map<DictionaryAttribute, String> map = new HashMap<DictionaryAttribute, String>();
+ final Properties properties = new Properties();
+ properties.load(new InputStreamReader(featuresData, "UTF-8"));
+
+ // Handle back-compatibility for encoder specification.
+ if (!properties.containsKey(DictionaryAttribute.ENCODER.propertyName)) {
+ boolean usesSuffixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-suffixes", "true"));
+ boolean usesPrefixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-prefixes", "false"));
+ boolean usesInfixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-infixes", "false"));
+
+ if (usesInfixes) {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.INFIX.name());
+ } else if (usesPrefixes) {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.PREFIX.name());
+ } else if (usesSuffixes) {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.SUFFIX.name());
+ } else {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.NONE.name());
+ }
+
+ properties.remove("fsa.dict.uses-suffixes");
+ properties.remove("fsa.dict.uses-prefixes");
+ properties.remove("fsa.dict.uses-infixes");
+ }
+
+ for (Enumeration<?> e = properties.propertyNames(); e.hasMoreElements();) {
+ String key = (String) e.nextElement();
+ map.put(DictionaryAttribute.fromPropertyName(key), properties.getProperty(key));
+ }
+ final DictionaryMetadata features = new DictionaryMetadata(map);
+ final FSA fsa = FSA.read(fsaData);
+
+ return new Dictionary(fsa, features);
+ } finally {
+ FileUtils.close(fsaData, featuresData);
+ }
+ }
+
+ /**
+ * Returns the expected name of the metadata file, based on the name of the
+ * FSA dictionary file. The expected name is resolved by truncating any
+ * suffix of <code>name</code> and appending
+ * {@link #METADATA_FILE_EXTENSION}.
+ */
+ public static String getExpectedFeaturesName(String name) {
+ final int dotIndex = name.lastIndexOf('.');
+ final String featuresName;
+ if (dotIndex >= 0) {
+ featuresName = name.substring(0, dotIndex) + "."
+ + METADATA_FILE_EXTENSION;
+ } else {
+ featuresName = name + "." + METADATA_FILE_EXTENSION;
+ }
+
+ return featuresName;
+ }
+
+ /**
+ * Return a built-in dictionary for a given ISO language code. Dictionaries
+ * are cached internally for potential reuse.
+ *
+ * @throws RuntimeException
+ * Throws a {@link RuntimeException} if the dictionary is not
+ * bundled with the library.
+ */
+ public static Dictionary getForLanguage(String languageCode) {
+ if (languageCode == null || "".equals(languageCode)) {
+ throw new IllegalArgumentException(
+ "Language code must not be empty.");
+ }
+
+ synchronized (defaultDictionaries) {
+ Dictionary dict = defaultDictionaries.get(languageCode);
+ if (dict != null)
+ return dict;
+
+ try {
+ final String dictPath = "morfologik/dictionaries/" + languageCode + ".dict";
+ final String metaPath = Dictionary
+ .getExpectedFeaturesName(dictPath);
+
+ dict = Dictionary.readAndClose(
+ ResourceUtils.openInputStream(dictPath),
+ ResourceUtils.openInputStream(metaPath));
+
+ defaultDictionaries.put(languageCode, dict);
+ return dict;
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Default dictionary resource for language '"
+ + languageCode + "not found.", e);
+ }
+ }
+ }
+
+ /**
+ * Converts the words on input or output according to conversion tables.
+ *
+ * Useful if the input words need to be normalized (i.e., ligatures,
+ * apostrophes and such).
+ *
+ * @param str - input character sequence to be converted
+ * @param conversionMap - conversion map used to convert the string (a map
+ * from String to String)
+ * @return a converted string.
+ *
+ * @since 1.9.0
+ *
+ */
+ public static CharSequence convertText(final CharSequence str, final Map<String, String> conversionMap) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(str);
+ for (final String auxKey : conversionMap.keySet()) {
+ int index = sb.indexOf(auxKey);
+ while (index != -1) {
+ sb.replace(index, index + auxKey.length(), conversionMap.get(auxKey));
+ index = sb.indexOf(auxKey);
+ }
+ }
+ return sb.toString();
+ }
+
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java
new file mode 100644
index 0000000..1249800
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java
@@ -0,0 +1,333 @@
+package morfologik.stemming;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Attributes applying to {@link Dictionary} and {@link DictionaryMetadata}.
+ */
+public enum DictionaryAttribute {
+ /**
+ * Logical fields separator inside the FSA.
+ */
+ SEPARATOR("fsa.dict.separator") {
+ @Override
+ public Character fromString(String separator) {
+ if (separator == null || separator.length() != 1) {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " must be a single character.");
+ }
+
+ char charValue = separator.charAt(0);
+ if (Character.isHighSurrogate(charValue) ||
+ Character.isLowSurrogate(charValue)) {
+ throw new IllegalArgumentException(
+ "Field separator character cannot be part of a surrogate pair: " + separator);
+ }
+
+ return charValue;
+ }
+ },
+
+ /**
+ * Character to byte encoding used for strings inside the FSA.
+ */
+ ENCODING("fsa.dict.encoding") {
+ @Override
+ public Charset fromString(String charsetName) {
+ return Charset.forName(charsetName);
+ }
+ },
+
+ /**
+ * If the FSA dictionary includes frequency data.
+ */
+ FREQUENCY_INCLUDED("fsa.dict.frequency-included") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore words containing digits
+ */
+ IGNORE_NUMBERS("fsa.dict.speller.ignore-numbers") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore punctuation.
+ */
+ IGNORE_PUNCTUATION("fsa.dict.speller.ignore-punctuation") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore CamelCase words.
+ */
+ IGNORE_CAMEL_CASE("fsa.dict.speller.ignore-camel-case") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore ALL UPPERCASE words.
+ */
+ IGNORE_ALL_UPPERCASE("fsa.dict.speller.ignore-all-uppercase") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore diacritics, so that
+ * 'a' would be treated as equivalent to 'ą'.
+ */
+ IGNORE_DIACRITICS("fsa.dict.speller.ignore-diacritics") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * if the spelling dictionary is supposed to treat upper and lower case
+ * as equivalent.
+ */
+ CONVERT_CASE("fsa.dict.speller.convert-case") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to split runOnWords.
+ */
+ RUN_ON_WORDS("fsa.dict.speller.runon-words") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /** Locale associated with the dictionary. */
+ LOCALE("fsa.dict.speller.locale") {
+ @Override
+ public Locale fromString(String value) {
+ return new Locale(value);
+ }
+ },
+
+ /** Locale associated with the dictionary. */
+ ENCODER("fsa.dict.encoder") {
+ @Override
+ public EncoderType fromString(String value) {
+ return EncoderType.valueOf(value.toUpperCase(Locale.ROOT));
+ }
+ },
+
+ /**
+ * Input conversion pairs to replace non-standard characters before search in a speller dictionary.
+ * For example, common ligatures can be replaced here.
+ */
+ INPUT_CONVERSION("fsa.dict.input-conversion") {
+ @Override
+ public Map<String, String> fromString(String value) throws IllegalArgumentException {
+ Map<String, String> conversionPairs = new HashMap<String, String>();
+ final String[] replacements = value.split(",\\s*");
+ for (final String stringPair : replacements) {
+ final String[] twoStrings = stringPair.trim().split(" ");
+ if (twoStrings.length == 2) {
+ if (!conversionPairs.containsKey(twoStrings[0])) {
+ conversionPairs.put(twoStrings[0], twoStrings[1]);
+ } else {
+ throw new IllegalArgumentException(
+ "Input conversion cannot specify different values for the same input string: " + twoStrings[0]);
+ }
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return conversionPairs;
+ }
+ },
+
+ /**
+ * Output conversion pairs to replace non-standard characters before search in a speller dictionary.
+ * For example, standard characters can be replaced here into ligatures.
+ *
+ * Useful for dictionaries that do have certain standards imposed.
+ *
+ */
+ OUTPUT_CONVERSION ("fsa.dict.output-conversion") {
+ @Override
+ public Map<String, String> fromString(String value) throws IllegalArgumentException {
+ Map<String, String> conversionPairs = new HashMap<String, String>();
+ final String[] replacements = value.split(",\\s*");
+ for (final String stringPair : replacements) {
+ final String[] twoStrings = stringPair.trim().split(" ");
+ if (twoStrings.length == 2) {
+ if (!conversionPairs.containsKey(twoStrings[0])) {
+ conversionPairs.put(twoStrings[0], twoStrings[1]);
+ } else {
+ throw new IllegalArgumentException(
+ "Input conversion cannot specify different values for the same input string: " + twoStrings[0]);
+ }
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return conversionPairs;
+ }
+ },
+
+ /**
+ * Replacement pairs for non-obvious candidate search in a speller dictionary.
+ * For example, Polish <tt>rz</tt> is phonetically equivalent to <tt>ż</tt>,
+ * and this may be specified here to allow looking for replacements of <tt>rz</tt> with <tt>ż</tt>
+ * and vice versa.
+ */
+ REPLACEMENT_PAIRS("fsa.dict.speller.replacement-pairs") {
+ @Override
+ public Map<String, List<String>> fromString(String value) throws IllegalArgumentException {
+ Map<String, List<String>> replacementPairs = new HashMap<String, List<String>>();
+ final String[] replacements = value.split(",\\s*");
+ for (final String stringPair : replacements) {
+ final String[] twoStrings = stringPair.trim().split(" ");
+ if (twoStrings.length == 2) {
+ if (!replacementPairs.containsKey(twoStrings[0])) {
+ List<String> strList = new ArrayList<String>();
+ strList.add(twoStrings[1]);
+ replacementPairs.put(twoStrings[0], strList);
+ } else {
+ replacementPairs.get(twoStrings[0]).add(twoStrings[1]);
+ }
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return replacementPairs;
+ }
+ },
+
+ /**
+ * Equivalent characters (treated similarly as equivalent chars with and without
+ * diacritics). For example, Polish <tt>ł</tt> can be specified as equivalent to <tt>l</tt>.
+ *
+ * <p>This implements a feature similar to hunspell MAP in the affix file.
+ */
+ EQUIVALENT_CHARS("fsa.dict.speller.equivalent-chars") {
+ @Override
+ public Map<Character, List<Character>> fromString(String value) throws IllegalArgumentException {
+ Map<Character, List<Character>> equivalentCharacters =
+ new HashMap<Character, List<Character>>();
+ final String[] eqChars = value.split(",\\s*");
+ for (final String characterPair : eqChars) {
+ final String[] twoChars = characterPair.trim().split(" ");
+ if (twoChars.length == 2
+ && twoChars[0].length() == 1
+ && twoChars[1].length() == 1) {
+ char fromChar = twoChars[0].charAt(0);
+ char toChar = twoChars[1].charAt(0);
+ if (!equivalentCharacters.containsKey(fromChar)) {
+ List<Character> chList = new ArrayList<Character>();
+ equivalentCharacters.put(fromChar, chList);
+ }
+ equivalentCharacters.get(fromChar).add(toChar);
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return equivalentCharacters;
+ }
+ },
+
+ /**
+ * Dictionary license attribute.
+ */
+ LICENSE("fsa.dict.license"),
+
+ /**
+ * Dictionary author.
+ */
+ AUTHOR("fsa.dict.author"),
+
+ /**
+ * Dictionary creation date.
+ */
+ CREATION_DATE("fsa.dict.created");
+
+ /**
+ * Property name for this attribute.
+ */
+ public final String propertyName;
+
+ /**
+ * Converts a string to the given attribute's value (covariants used).
+ *
+ * @throws IllegalArgumentException
+ * If the input string cannot be converted to the attribute's
+ * value.
+ */
+ public Object fromString(String value) throws IllegalArgumentException {
+ return value;
+ }
+
+ /**
+ * Return an {@link DictionaryAttribute} by its {@link #propertyName}.
+ */
+ public static DictionaryAttribute fromPropertyName(String propertyName) {
+ DictionaryAttribute value = attrsByPropertyName.get(propertyName);
+ if (value == null) {
+ throw new IllegalArgumentException("No attribute for property: " + propertyName);
+ }
+ return value;
+ }
+
+ private static final Map<String,DictionaryAttribute> attrsByPropertyName;
+ static {
+ attrsByPropertyName = new HashMap<String,DictionaryAttribute>();
+ for (DictionaryAttribute attr : DictionaryAttribute.values()) {
+ if (attrsByPropertyName.put(attr.propertyName, attr) != null) {
+ throw new RuntimeException("Duplicate property key for: " + attr);
+ }
+ }
+ }
+
+ /**
+ * Private enum instance constructor.
+ */
+ private DictionaryAttribute(String propertyName) {
+ this.propertyName = propertyName;
+ }
+
+ private static Boolean booleanValue(String value) {
+ value = value.toLowerCase();
+ if ("true".equals(value) || "yes".equals(value) || "on".equals(value)) {
+ return Boolean.TRUE;
+ }
+ if ("false".equals(value) || "no".equals(value) || "off".equals(value)) {
+ return Boolean.FALSE;
+ }
+ throw new IllegalArgumentException("Not a boolean value: " + value);
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java
new file mode 100644
index 0000000..104ff58
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java
@@ -0,0 +1,143 @@
+package morfologik.stemming;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.util.Iterator;
+
+import morfologik.util.BufferUtils;
+
+/**
+ * An iterator over {@link WordData} entries of a {@link Dictionary}. The stems
+ * can be decoded from compressed format or the compressed form can be
+ * preserved.
+ */
+public final class DictionaryIterator implements Iterator<WordData> {
+ private final CharsetDecoder decoder;
+ private final Iterator<ByteBuffer> entriesIter;
+ private final WordData entry;
+ private final byte separator;
+ private final DictionaryMetadata dictionaryMetadata;
+ private final boolean decodeStems;
+
+ private ByteBuffer inflectedBuffer = ByteBuffer.allocate(0);
+ private CharBuffer inflectedCharBuffer = CharBuffer.allocate(0);
+ private ByteBuffer temp = ByteBuffer.allocate(0);
+
+ public DictionaryIterator(Dictionary dictionary, CharsetDecoder decoder,
+ boolean decodeStems) {
+ this.entriesIter = dictionary.fsa.iterator();
+ this.separator = dictionary.metadata.getSeparator();
+ this.dictionaryMetadata = dictionary.metadata;
+ this.decoder = decoder;
+ this.entry = new WordData(decoder);
+ this.decodeStems = decodeStems;
+ }
+
+ public boolean hasNext() {
+ return entriesIter.hasNext();
+ }
+
+ public WordData next() {
+ final ByteBuffer entryBuffer = entriesIter.next();
+ entry.reset();
+
+ /*
+ * Entries are typically: inflected<SEP>codedBase<SEP>tag so try to find
+ * this split.
+ */
+ byte[] ba = entryBuffer.array();
+ int bbSize = entryBuffer.remaining();
+
+ int sepPos;
+ for (sepPos = 0; sepPos < bbSize; sepPos++) {
+ if (ba[sepPos] == separator)
+ break;
+ }
+
+ if (sepPos == bbSize) {
+ throw new RuntimeException("Invalid dictionary "
+ + "entry format (missing separator).");
+ }
+
+ inflectedBuffer.clear();
+ inflectedBuffer = BufferUtils.ensureCapacity(inflectedBuffer, sepPos);
+ inflectedBuffer.put(ba, 0, sepPos);
+ inflectedBuffer.flip();
+
+ inflectedCharBuffer = bytesToChars(inflectedBuffer, inflectedCharBuffer);
+ entry.wordBuffer = inflectedBuffer;
+ entry.wordCharSequence = inflectedCharBuffer;
+
+ temp.clear();
+ temp = BufferUtils.ensureCapacity(temp, bbSize - sepPos);
+ sepPos++;
+ temp.put(ba, sepPos, bbSize - sepPos);
+ temp.flip();
+
+ ba = temp.array();
+ bbSize = temp.remaining();
+
+ /*
+ * Find the next separator byte's position splitting word form and tag.
+ */
+ sepPos = 0;
+ for (; sepPos < bbSize; sepPos++) {
+ if (ba[sepPos] == separator)
+ break;
+ }
+
+ /*
+ * Decode the stem into stem buffer.
+ */
+ entry.stemBuffer.clear();
+ if (decodeStems) {
+ entry.stemBuffer = DictionaryLookup.decodeBaseForm(entry.stemBuffer,
+ ba, sepPos, inflectedBuffer, dictionaryMetadata);
+ } else {
+ entry.stemBuffer = BufferUtils.ensureCapacity(entry.stemBuffer, sepPos);
+ entry.stemBuffer.put(ba, 0, sepPos);
+ }
+ entry.stemBuffer.flip();
+
+ // Skip separator character, if present.
+ if (sepPos + 1 <= bbSize) {
+ sepPos++;
+ }
+
+ /*
+ * Decode the tag data.
+ */
+ entry.tagBuffer = BufferUtils.ensureCapacity(entry.tagBuffer, bbSize
+ - sepPos);
+ entry.tagBuffer.clear();
+ entry.tagBuffer.put(ba, sepPos, bbSize - sepPos);
+ entry.tagBuffer.flip();
+
+ return entry;
+ }
+
+ /**
+ * Decode the byte buffer, optionally expanding the char buffer.
+ */
+ private CharBuffer bytesToChars(ByteBuffer bytes, CharBuffer chars) {
+ chars.clear();
+ final int maxCapacity = (int) (bytes.remaining() * decoder
+ .maxCharsPerByte());
+ if (chars.capacity() <= maxCapacity) {
+ chars = CharBuffer.allocate(maxCapacity);
+ }
+
+ bytes.mark();
+ decoder.reset();
+ decoder.decode(bytes, chars, true);
+ chars.flip();
+ bytes.reset();
+
+ return chars;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java
new file mode 100644
index 0000000..5bb0716
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java
@@ -0,0 +1,403 @@
+package morfologik.stemming;
+
+import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSAFinalStatesIterator;
+import morfologik.fsa.FSATraversal;
+import morfologik.fsa.MatchResult;
+import morfologik.util.BufferUtils;
+
+/**
+ * This class implements a dictionary lookup over an FSA dictionary. The
+ * dictionary for this class should be prepared from a text file using Jan
+ * Daciuk's FSA package (see link below).
+ *
+ * <p>
+ * <b>Important:</b> finite state automatons in Jan Daciuk's implementation use
+ * <em>bytes</em> not unicode characters. Therefore objects of this class always
+ * have to be constructed with an encoding used to convert Java strings to byte
+ * arrays and the other way around. You <b>can</b> use UTF-8 encoding, as it
+ * should not conflict with any control sequences and separator characters.
+ *
+ * @see <a href="http://www.eti.pg.gda.pl/~jandac/fsa.html">FSA package Web
+ * site</a>
+ */
+public final class DictionaryLookup implements IStemmer, Iterable<WordData> {
+ private static final int REMOVE_EVERYTHING = 255;
+
+ /** An FSA used for lookups. */
+ private final FSATraversal matcher;
+
+ /** An iterator for walking along the final states of {@link #fsa}. */
+ private final FSAFinalStatesIterator finalStatesIterator;
+
+ /** FSA's root node. */
+ private final int rootNode;
+
+ /** Expand buffers and arrays by this constant. */
+ private final static int EXPAND_SIZE = 10;
+
+ /** Private internal array of reusable word data objects. */
+ private WordData[] forms = new WordData[0];
+
+	/** A "view" over a prefix of {@link #forms}, exposed to callers as a {@link java.util.List}. */
+ private final ArrayViewList<WordData> formsList = new ArrayViewList<WordData>(
+ forms, 0, forms.length);
+
+ /**
+ * Features of the compiled dictionary.
+ *
+ * @see DictionaryMetadata
+ */
+ private final DictionaryMetadata dictionaryMetadata;
+
+ /**
+ * Charset encoder for the FSA.
+ */
+ private final CharsetEncoder encoder;
+
+ /**
+ * Charset decoder for the FSA.
+ */
+ private final CharsetDecoder decoder;
+
+ /**
+ * The FSA we are using.
+ */
+ private final FSA fsa;
+
+ /**
+ * @see #getSeparatorChar()
+ */
+ private final char separatorChar;
+
+ /**
+ * Internal reusable buffer for encoding words into byte arrays using
+ * {@link #encoder}.
+ */
+ private ByteBuffer byteBuffer = ByteBuffer.allocate(0);
+
+ /**
+ * Internal reusable buffer for encoding words into byte arrays using
+ * {@link #encoder}.
+ */
+ private CharBuffer charBuffer = CharBuffer.allocate(0);
+
+ /**
+ * Reusable match result.
+ */
+ private final MatchResult matchResult = new MatchResult();
+
+ /**
+ * The {@link Dictionary} this lookup is using.
+ */
+ private final Dictionary dictionary;
+
+ /**
+ * <p>
+ * Creates a new object of this class using the given FSA for word lookups
+ * and encoding for converting characters to bytes.
+ *
+ * @throws IllegalArgumentException
+ * if FSA's root node cannot be acquired (dictionary is empty).
+ */
+ public DictionaryLookup(Dictionary dictionary)
+ throws IllegalArgumentException {
+ this.dictionary = dictionary;
+ this.dictionaryMetadata = dictionary.metadata;
+ this.rootNode = dictionary.fsa.getRootNode();
+ this.fsa = dictionary.fsa;
+ this.matcher = new FSATraversal(fsa);
+ this.finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode());
+
+ if (rootNode == 0) {
+ throw new IllegalArgumentException(
+ "Dictionary must have at least the root node.");
+ }
+
+ if (dictionaryMetadata == null) {
+ throw new IllegalArgumentException(
+ "Dictionary metadata must not be null.");
+ }
+
+ decoder = dictionary.metadata.getDecoder();
+ encoder = dictionary.metadata.getEncoder();
+ separatorChar = dictionary.metadata.getSeparatorAsChar();
+ }
+
+ /**
+ * Searches the automaton for a symbol sequence equal to <code>word</code>,
+ * followed by a separator. The result is a stem (decompressed accordingly
+ * to the dictionary's specification) and an optional tag data.
+ */
+ @Override
+ public List<WordData> lookup(CharSequence word) {
+ final byte separator = dictionaryMetadata.getSeparator();
+
+ if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
+ word = Dictionary.convertText(word, dictionaryMetadata.getInputConversionPairs());
+ }
+
+ // Reset the output list to zero length.
+ formsList.wrap(forms, 0, 0);
+
+ // Encode word characters into bytes in the same encoding as the FSA's.
+ charBuffer.clear();
+ charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
+ for (int i = 0; i < word.length(); i++) {
+ char chr = word.charAt(i);
+ if (chr == separatorChar)
+ return formsList;
+ charBuffer.put(chr);
+ }
+ charBuffer.flip();
+ byteBuffer = charsToBytes(charBuffer, byteBuffer);
+
+ // Try to find a partial match in the dictionary.
+ final MatchResult match = matcher.match(matchResult, byteBuffer
+ .array(), 0, byteBuffer.remaining(), rootNode);
+
+ if (match.kind == SEQUENCE_IS_A_PREFIX) {
+ /*
+ * The entire sequence exists in the dictionary. A separator should
+ * be the next symbol.
+ */
+ final int arc = fsa.getArc(match.node, separator);
+
+ /*
+ * The situation when the arc points to a final node should NEVER
+ * happen. After all, we want the word to have SOME base form.
+ */
+ if (arc != 0 && !fsa.isArcFinal(arc)) {
+ // There is such a word in the dictionary. Return its base forms.
+ int formsCount = 0;
+
+ finalStatesIterator.restartFrom(fsa.getEndNode(arc));
+ while (finalStatesIterator.hasNext()) {
+ final ByteBuffer bb = finalStatesIterator.next();
+ final byte[] ba = bb.array();
+ final int bbSize = bb.remaining();
+
+ if (formsCount >= forms.length) {
+ forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE);
+ for (int k = 0; k < forms.length; k++) {
+ if (forms[k] == null)
+ forms[k] = new WordData(decoder);
+ }
+ }
+
+ /*
+ * Now, expand the prefix/ suffix 'compression' and store
+ * the base form.
+ */
+ final WordData wordData = forms[formsCount++];
+ wordData.reset();
+
+ wordData.wordBuffer = byteBuffer;
+ if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) {
+ wordData.wordCharSequence = word;
+ } else {
+ wordData.wordCharSequence = Dictionary.convertText(word,
+ dictionaryMetadata.getOutputConversionPairs());
+ }
+
+ /*
+ * Find the separator byte's position splitting the inflection instructions
+ * from the tag.
+ */
+ int sepPos;
+ for (sepPos = 0; sepPos < bbSize; sepPos++) {
+ if (ba[sepPos] == separator)
+ break;
+ }
+
+ /*
+ * Decode the stem into stem buffer.
+ */
+ wordData.stemBuffer.clear();
+ wordData.stemBuffer = decodeBaseForm(wordData.stemBuffer, ba,
+ sepPos, byteBuffer, dictionaryMetadata);
+ wordData.stemBuffer.flip();
+
+ // Skip separator character.
+ sepPos++;
+
+ /*
+ * Decode the tag data.
+ */
+ final int tagSize = bbSize - sepPos;
+ if (tagSize > 0) {
+ wordData.tagBuffer = BufferUtils.ensureCapacity(
+ wordData.tagBuffer, tagSize);
+ wordData.tagBuffer.clear();
+ wordData.tagBuffer.put(ba, sepPos, tagSize);
+ wordData.tagBuffer.flip();
+ }
+ }
+
+ formsList.wrap(forms, 0, formsCount);
+ }
+ } else {
+ /*
+ * this case is somewhat confusing: we should have hit the separator
+ * first... I don't really know how to deal with it at the time
+ * being.
+ */
+ }
+ return formsList;
+ }
+
+ /**
+ * Decode the base form of an inflected word and save its decoded form into
+ * a byte buffer.
+ *
+ * @param output
+ * The byte buffer to save the result to. A new buffer may be
+ * allocated if the capacity of <code>bb</code> is not large
+ * enough to store the result. The buffer is not flipped upon
+ * return.
+ *
+ * @param inflectedForm
+ * Inflected form's bytes (decoded properly).
+ *
+ * @param encoded
+ * Bytes of the encoded base form, starting at 0 index.
+ *
+ * @param encodedLen
+ * Length of the encode base form.
+ *
+ * @return Returns either <code>bb</code> or a new buffer whose capacity is
+ * large enough to store the output of the decoded data.
+ */
+ public static ByteBuffer decodeBaseForm(
+ ByteBuffer output,
+ byte[] encoded,
+ int encodedLen,
+ ByteBuffer inflectedForm,
+ DictionaryMetadata metadata) {
+
+ // FIXME: We should eventually get rid of this method and use
+ // each encoder's #decode method. The problem is that we'd have to include
+ // HPPC or roundtrip via HPPC to a ByteBuffer, which would slow things down.
+ // Since this is performance-crucial routine, I leave it for now.
+
+ // Prepare the buffer.
+ output.clear();
+
+ assert inflectedForm.position() == 0;
+
+ // Increase buffer size (overallocating), if needed.
+ final byte[] src = inflectedForm.array();
+ final int srcLen = inflectedForm.remaining();
+ if (output.capacity() < srcLen + encodedLen) {
+ output = ByteBuffer.allocate(srcLen + encodedLen);
+ }
+
+ switch (metadata.getEncoderType()) {
+ case SUFFIX:
+ int suffixTrimCode = encoded[0];
+ int truncateBytes = suffixTrimCode - 'A' & 0xFF;
+ if (truncateBytes == REMOVE_EVERYTHING) {
+ truncateBytes = srcLen;
+ }
+ output.put(src, 0, srcLen - truncateBytes);
+ output.put(encoded, 1, encodedLen - 1);
+ break;
+
+ case PREFIX:
+ int truncatePrefixBytes = encoded[0] - 'A' & 0xFF;
+ int truncateSuffixBytes = encoded[1] - 'A' & 0xFF;
+ if (truncatePrefixBytes == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) {
+ truncatePrefixBytes = srcLen;
+ truncateSuffixBytes = 0;
+ }
+ output.put(src, truncatePrefixBytes, srcLen - (truncateSuffixBytes + truncatePrefixBytes));
+ output.put(encoded, 2, encodedLen - 2);
+ break;
+
+ case INFIX:
+ int infixIndex = encoded[0] - 'A' & 0xFF;
+ int infixLength = encoded[1] - 'A' & 0xFF;
+ truncateSuffixBytes = encoded[2] - 'A' & 0xFF;
+ if (infixLength == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) {
+ infixIndex = 0;
+ infixLength = srcLen;
+ truncateSuffixBytes = 0;
+ }
+ output.put(src, 0, infixIndex);
+ output.put(src, infixIndex + infixLength, srcLen - (infixIndex + infixLength + truncateSuffixBytes));
+ output.put(encoded, 3, encodedLen - 3);
+ break;
+
+ case NONE:
+ output.put(encoded, 0, encodedLen);
+ break;
+
+ default:
+ throw new RuntimeException("Unhandled switch/case: " + metadata.getEncoderType());
+ }
+
+ return output;
+ }
+
+ /**
+ * Encode a character sequence into a byte buffer, optionally expanding
+ * buffer.
+ */
+ private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) {
+ bytes.clear();
+ final int maxCapacity = (int) (chars.remaining() * encoder
+ .maxBytesPerChar());
+ if (bytes.capacity() <= maxCapacity) {
+ bytes = ByteBuffer.allocate(maxCapacity);
+ }
+
+ chars.mark();
+ encoder.reset();
+ if (encoder.encode(chars, bytes, true).isError()) {
+ // remove everything, we don't want to accept malformed input
+ bytes.clear();
+ }
+ bytes.flip();
+ chars.reset();
+
+ return bytes;
+ }
+
+ /**
+ * Return an iterator over all {@link WordData} entries available in the
+ * embedded {@link Dictionary}.
+ */
+ @Override
+ public Iterator<WordData> iterator() {
+ return new DictionaryIterator(dictionary, decoder, true);
+ }
+
+ /**
+ * @return Return the {@link Dictionary} used by this object.
+ */
+ public Dictionary getDictionary() {
+ return dictionary;
+ }
+
+ /**
+ * @return Returns the logical separator character splitting inflected form,
+ * lemma correction token and a tag. Note that this character is a best-effort
+ * conversion from a byte in {@link DictionaryMetadata#separator} and
+ * may not be valid in the target encoding (although this is highly unlikely).
+ */
+ public char getSeparatorChar() {
+ return separatorChar;
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java
new file mode 100644
index 0000000..1475de6
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java
@@ -0,0 +1,298 @@
+package morfologik.stemming;
+
+import static morfologik.stemming.DictionaryAttribute.CONVERT_CASE;
+import static morfologik.stemming.DictionaryAttribute.ENCODING;
+import static morfologik.stemming.DictionaryAttribute.FREQUENCY_INCLUDED;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_ALL_UPPERCASE;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_CAMEL_CASE;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_DIACRITICS;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_NUMBERS;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_PUNCTUATION;
+import static morfologik.stemming.DictionaryAttribute.RUN_ON_WORDS;
+import static morfologik.stemming.DictionaryAttribute.SEPARATOR;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Description of attributes, their types and default values.
+ *
+ * @see Dictionary
+ */
+public final class DictionaryMetadata {
+ /**
+ * Default attribute values.
+ */
+ private static Map<DictionaryAttribute, String> DEFAULT_ATTRIBUTES = new DictionaryMetadataBuilder()
+ .separator('+')
+ .encoder(EncoderType.SUFFIX)
+ .frequencyIncluded()
+ .ignorePunctuation()
+ .ignoreNumbers()
+ .ignoreCamelCase()
+ .ignoreAllUppercase()
+ .ignoreDiacritics()
+ .convertCase()
+ .supportRunOnWords()
+ .toMap();
+
+ /**
+ * Required attributes.
+ */
+ private static EnumSet<DictionaryAttribute> REQUIRED_ATTRIBUTES = EnumSet.of(
+ SEPARATOR,
+ ENCODING);
+
+ /**
+ * A separator character between fields (stem, lemma, form). The character
+ * must be within byte range (FSA uses bytes internally).
+ */
+ private byte separator;
+ private char separatorChar;
+
+ /**
+ * Encoding used for converting bytes to characters and vice versa.
+ */
+ private String encoding;
+
+ private Charset charset;
+ private Locale locale = Locale.getDefault();
+
+ /**
+ * Replacement pairs for non-obvious candidate search in a speller dictionary.
+ */
+ private Map<String, List<String>> replacementPairs = Collections.emptyMap();
+
+ /**
+ * Conversion pairs for input conversion, for example to replace ligatures.
+ */
+ private Map<String, String> inputConversion = Collections.emptyMap();
+
+ /**
+ * Conversion pairs for output conversion, for example to replace ligatures.
+ */
+ private Map<String, String> outputConversion = Collections.emptyMap();
+
+ /**
+ * Equivalent characters (treated similarly as equivalent chars with and without
+ * diacritics). For example, Polish <tt>ł</tt> can be specified as equivalent to <tt>l</tt>.
+ *
+ * This implements a feature similar to hunspell MAP in the affix file.
+ */
+ private Map<Character, List<Character>> equivalentChars = Collections.emptyMap();
+
+ /**
+ * All attributes.
+ */
+ private final EnumMap<DictionaryAttribute, String> attributes;
+
+ /**
+ * All "enabled" boolean attributes.
+ */
+ private final EnumMap<DictionaryAttribute,Boolean> boolAttributes;
+
+ /**
+ * Sequence encoder.
+ */
+ private EncoderType encoderType;
+
+ /**
+ * Return all attributes.
+ */
+ public Map<DictionaryAttribute, String> getAttributes() {
+ return Collections.unmodifiableMap(attributes);
+ }
+
+ // Cached attrs.
+ public String getEncoding() { return encoding; }
+ public byte getSeparator() { return separator; }
+ public Locale getLocale() { return locale; }
+
+ public Map<String, String> getInputConversionPairs() { return inputConversion; }
+ public Map<String, String> getOutputConversionPairs() { return outputConversion; }
+
+ public Map<String, List<String>> getReplacementPairs() { return replacementPairs; }
+ public Map<Character, List<Character>> getEquivalentChars() { return equivalentChars; }
+
+ // Dynamically fetched.
+ public boolean isFrequencyIncluded() { return boolAttributes.get(FREQUENCY_INCLUDED); }
+ public boolean isIgnoringPunctuation() { return boolAttributes.get(IGNORE_PUNCTUATION); }
+ public boolean isIgnoringNumbers() { return boolAttributes.get(IGNORE_NUMBERS); }
+ public boolean isIgnoringCamelCase() { return boolAttributes.get(IGNORE_CAMEL_CASE); }
+ public boolean isIgnoringAllUppercase() { return boolAttributes.get(IGNORE_ALL_UPPERCASE); }
+ public boolean isIgnoringDiacritics() { return boolAttributes.get(IGNORE_DIACRITICS); }
+ public boolean isConvertingCase() { return boolAttributes.get(CONVERT_CASE); }
+ public boolean isSupportingRunOnWords() { return boolAttributes.get(RUN_ON_WORDS); }
+
+ /**
+ * Create an instance from an attribute map.
+ *
+ * @see DictionaryMetadataBuilder
+ */
+ public DictionaryMetadata(Map<DictionaryAttribute, String> userAttrs) {
+ this.boolAttributes = new EnumMap<DictionaryAttribute,Boolean>(DictionaryAttribute.class);
+ this.attributes = new EnumMap<DictionaryAttribute, String>(DictionaryAttribute.class);
+ this.attributes.putAll(userAttrs);
+
+ EnumMap<DictionaryAttribute, String> attrs = new EnumMap<DictionaryAttribute, String>(DEFAULT_ATTRIBUTES);
+ attrs.putAll(userAttrs);
+
+ // Convert some attrs from the map to local fields for performance reasons.
+ EnumSet<DictionaryAttribute> requiredAttributes = EnumSet.copyOf(REQUIRED_ATTRIBUTES);
+
+ for (Map.Entry<DictionaryAttribute,String> e : attrs.entrySet()) {
+ requiredAttributes.remove(e.getKey());
+
+ // Run validation and conversion on all of them.
+ Object value = e.getKey().fromString(e.getValue());
+ switch (e.getKey()) {
+ case ENCODING:
+ this.encoding = e.getValue();
+ if (!Charset.isSupported(encoding)) {
+ throw new IllegalArgumentException("Encoding not supported on this JVM: "
+ + encoding);
+ }
+ this.charset = (Charset) value;
+ break;
+
+ case SEPARATOR:
+ this.separatorChar = (Character) value;
+ break;
+
+ case LOCALE:
+ this.locale = (Locale) value;
+ break;
+
+ case ENCODER:
+ this.encoderType = (EncoderType) value;
+ break;
+
+ case INPUT_CONVERSION:
+ {
+ @SuppressWarnings("unchecked")
+ Map<String, String> gvalue = (Map<String, String>) value;
+ this.inputConversion = gvalue;
+ }
+ break;
+
+ case OUTPUT_CONVERSION:
+ {
+ @SuppressWarnings("unchecked")
+ Map<String, String> gvalue = (Map<String, String>) value;
+ this.outputConversion = gvalue;
+ }
+ break;
+
+ case REPLACEMENT_PAIRS:
+ {
+ @SuppressWarnings("unchecked")
+ Map<String, List<String>> gvalue = (Map<String, List<String>>) value;
+ this.replacementPairs = gvalue;
+ }
+ break;
+
+ case EQUIVALENT_CHARS:
+ {
+ @SuppressWarnings("unchecked")
+ Map<Character, List<Character>> gvalue = (Map<Character, List<Character>>) value;
+ this.equivalentChars = gvalue;
+ }
+ break;
+
+ case IGNORE_PUNCTUATION:
+ case IGNORE_NUMBERS:
+ case IGNORE_CAMEL_CASE:
+ case IGNORE_ALL_UPPERCASE:
+ case IGNORE_DIACRITICS:
+ case CONVERT_CASE:
+ case RUN_ON_WORDS:
+ case FREQUENCY_INCLUDED:
+ this.boolAttributes.put(e.getKey(), (Boolean) value);
+ break;
+
+ case AUTHOR:
+ case LICENSE:
+ case CREATION_DATE:
+ // Just run validation.
+ e.getKey().fromString(e.getValue());
+ break;
+
+ default:
+ throw new RuntimeException("Unexpected code path (attribute should be handled but is not): " + e.getKey());
+ }
+ }
+
+ if (!requiredAttributes.isEmpty()) {
+ throw new IllegalArgumentException("At least one the required attributes was not provided: "
+ + requiredAttributes.toString());
+ }
+
+ // Sanity check.
+ CharsetEncoder encoder = getEncoder();
+ try {
+ ByteBuffer encoded = encoder.encode(CharBuffer.wrap(new char [] { separatorChar }));
+ if (encoded.remaining() > 1) {
+ throw new IllegalArgumentException("Separator character is not a single byte in encoding "
+ + encoding + ": " + separatorChar);
+ }
+ this.separator = encoded.get();
+ } catch (CharacterCodingException e) {
+ throw new IllegalArgumentException("Separator character cannot be converted to a byte in "
+ + encoding + ": " + separatorChar, e);
+ }
+ }
+
+ /**
+ * Returns a new {@link CharsetDecoder} for the {@link #encoding}.
+ */
+ public CharsetDecoder getDecoder() {
+ try {
+ return charset.newDecoder().onMalformedInput(
+ CodingErrorAction.REPORT).onUnmappableCharacter(
+ CodingErrorAction.REPORT);
+ } catch (UnsupportedCharsetException e) {
+ throw new RuntimeException(
+ "FSA's encoding charset is not supported: " + encoding);
+ }
+ }
+
+ /**
+ * Returns a new {@link CharsetEncoder} for the {@link #encoding}.
+ */
+ public CharsetEncoder getEncoder() {
+ try {
+ return charset.newEncoder();
+ } catch (UnsupportedCharsetException e) {
+ throw new RuntimeException(
+ "FSA's encoding charset is not supported: " + encoding);
+ }
+ }
+
+ /**
+ * Return sequence encoder type.
+ */
+ public EncoderType getEncoderType() {
+ return encoderType;
+ }
+
+ /**
+ * Returns the {@link #separator} byte converted to a single <code>char</code>. Throws
+ * a {@link RuntimeException} if this conversion is for some reason impossible
+ * (the byte is a surrogate pair, FSA's {@link #encoding} is not available).
+ */
+ public char getSeparatorAsChar() {
+ return separatorChar;
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java
new file mode 100644
index 0000000..7e85ecb
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java
@@ -0,0 +1,139 @@
+package morfologik.stemming;
+
+import java.nio.charset.Charset;
+import java.util.EnumMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Helper class to build {@link DictionaryMetadata} instances.
+ */
+public final class DictionaryMetadataBuilder {
+ private final EnumMap<DictionaryAttribute, String> attrs
+ = new EnumMap<DictionaryAttribute, String>(DictionaryAttribute.class);
+
+ public DictionaryMetadataBuilder separator(char c) {
+ this.attrs.put(DictionaryAttribute.SEPARATOR, Character.toString(c));
+ return this;
+ }
+
+ public DictionaryMetadataBuilder encoding(Charset charset) {
+ return encoding(charset.name());
+ }
+
+ public DictionaryMetadataBuilder encoding(String charsetName) {
+ this.attrs.put(DictionaryAttribute.ENCODING, charsetName);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder frequencyIncluded() { return frequencyIncluded(false); }
+ public DictionaryMetadataBuilder frequencyIncluded(boolean v) { this.attrs.put(DictionaryAttribute.FREQUENCY_INCLUDED, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignorePunctuation() { return ignorePunctuation(true); }
+ public DictionaryMetadataBuilder ignorePunctuation(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_PUNCTUATION, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreNumbers() { return ignoreNumbers(true); }
+ public DictionaryMetadataBuilder ignoreNumbers(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_NUMBERS, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreCamelCase() { return ignoreCamelCase(true); }
+ public DictionaryMetadataBuilder ignoreCamelCase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_CAMEL_CASE, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreAllUppercase() { return ignoreAllUppercase(true); }
+ public DictionaryMetadataBuilder ignoreAllUppercase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_ALL_UPPERCASE, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreDiacritics() { return ignoreDiacritics(true); }
+ public DictionaryMetadataBuilder ignoreDiacritics(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_DIACRITICS, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder convertCase() { return convertCase(true); }
+ public DictionaryMetadataBuilder convertCase(boolean v) { this.attrs.put(DictionaryAttribute.CONVERT_CASE, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder supportRunOnWords() { return supportRunOnWords(true); }
+ public DictionaryMetadataBuilder supportRunOnWords(boolean v) { this.attrs.put(DictionaryAttribute.RUN_ON_WORDS, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder encoder(EncoderType type) {
+ this.attrs.put(DictionaryAttribute.ENCODER, type.name());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder locale(Locale locale) {
+ return locale(locale.toString());
+ }
+
+ public DictionaryMetadataBuilder locale(String localeName) {
+ this.attrs.put(DictionaryAttribute.LOCALE, localeName);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withReplacementPairs(Map<String, List<String>> replacementPairs) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<String,List<String>> e : replacementPairs.entrySet()) {
+ String k = e.getKey();
+ for (String v : e.getValue()) {
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(v);
+ }
+ }
+ this.attrs.put(DictionaryAttribute.REPLACEMENT_PAIRS, builder.toString());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withEquivalentChars(Map<Character, List<Character>> equivalentChars) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<Character,List<Character>> e : equivalentChars.entrySet()) {
+ Character k = e.getKey();
+ for (Character v : e.getValue()) {
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(v);
+ }
+ }
+ this.attrs.put(DictionaryAttribute.EQUIVALENT_CHARS, builder.toString());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withInputConversionPairs(Map<String, String> conversionPairs) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<String,String> e : conversionPairs.entrySet()) {
+ String k = e.getKey();
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(conversionPairs.get(k));
+ }
+ this.attrs.put(DictionaryAttribute.INPUT_CONVERSION, builder.toString());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withOutputConversionPairs(Map<String, String> conversionPairs) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<String,String> e : conversionPairs.entrySet()) {
+ String k = e.getKey();
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(conversionPairs.get(k));
+ }
+ this.attrs.put(DictionaryAttribute.OUTPUT_CONVERSION, builder.toString());
+ return this;
+ }
+
+
+ public DictionaryMetadataBuilder author(String author) {
+ this.attrs.put(DictionaryAttribute.AUTHOR, author);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder creationDate(String creationDate) {
+ this.attrs.put(DictionaryAttribute.CREATION_DATE, creationDate);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder license(String license) {
+ this.attrs.put(DictionaryAttribute.LICENSE, license);
+ return this;
+ }
+
+ public DictionaryMetadata build() {
+ return new DictionaryMetadata(attrs);
+ }
+
+ public EnumMap<DictionaryAttribute, String> toMap() {
+ return new EnumMap<DictionaryAttribute, String>(attrs);
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java b/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java
new file mode 100644
index 0000000..093cfbb
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java
@@ -0,0 +1,11 @@
+package morfologik.stemming;
+
/**
 * Sequence encoder type: the scheme used to store an entry's base form
 * relative to its inflected form (see the decoding counterpart in
 * {@code DictionaryLookup#decodeBaseForm}).
 */
public enum EncoderType {
    /** A trailing suffix of the inflected form is replaced by stored bytes. */
    SUFFIX,
    /** Both a leading prefix and a trailing suffix are replaced. */
    PREFIX,
    /** An internal infix (plus an optional trailing suffix) is replaced. */
    INFIX,
    /** The base form is stored verbatim, with no relative encoding. */
    NONE;
}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java b/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java
new file mode 100644
index 0000000..6e59526
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java
@@ -0,0 +1,20 @@
+package morfologik.stemming;
+
+import java.util.List;
+
/**
 * A generic &quot;stemmer&quot; interface in Morfologik.
 */
public interface IStemmer {
    /**
     * Returns a list of {@link WordData} entries for a given word. The returned
     * list is never <code>null</code>. Depending on the stemmer's
     * implementation the {@link WordData} may carry the stem and additional
     * information (tag) or just the stem.
     * <p>
     * The returned list and any object it contains are not usable after a
     * subsequent call to this method. Any data that should be stored in between
     * must be copied by the caller.
     *
     * @param word The inflected (surface) form to look up.
     * @return A never-<code>null</code>, possibly empty list of entries whose
     *         contents are only valid until the next call to this method.
     */
    public List<WordData> lookup(CharSequence word);
}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java b/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java
new file mode 100644
index 0000000..a1bdaa0
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java
@@ -0,0 +1,254 @@
+package morfologik.stemming;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.*;
+
+import morfologik.util.BufferUtils;
+
+/**
+ * Stem and tag data associated with a given word.
+ *
+ * <p>
+ * <b>Important notes:</b>
+ * <ul>
+ * <li>Objects of this class are <i>volatile</i> (their content changes on
+ * subsequent calls to {@link DictionaryLookup} class. If you need a copy of the
+ * stem or tag data for a given word, you have to create a custom buffer
+ * yourself and copy the associated data, perform {@link #clone()} or create
+ * strings (they are immutable) using {@link #getStem()} and then
+ * {@link CharSequence#toString()}.</li>
+ * <li>Objects of this class must not be used in any Java collections. In fact
+ * both equals and hashCode methods are overridden and throw exceptions to
+ * prevent accidental damage.</li>
+ * </ul>
+ */
+public final class WordData implements Cloneable {
+ /**
+ * Error information if somebody puts us in a Java collection.
+ */
+ private static final String COLLECTIONS_ERROR_MESSAGE = "Not suitable for use"
+ + " in Java collections framework (volatile content). Refer to documentation.";
+
+ /** Character encoding in internal buffers. */
+ private final CharsetDecoder decoder;
+
+ /**
+ * Inflected word form data.
+ */
+ CharSequence wordCharSequence;
+
+ /**
+ * Character sequence after converting {@link #stemBuffer} using
+ * {@link #decoder}.
+ */
+ private CharBuffer stemCharSequence;
+
+ /**
+ * Character sequence after converting {@link #tagBuffer} using
+ * {@link #decoder}.
+ */
+ private CharBuffer tagCharSequence;
+
+ /** Byte buffer holding the inflected word form data. */
+ ByteBuffer wordBuffer;
+
+ /** Byte buffer holding stem data. */
+ ByteBuffer stemBuffer;
+
+ /** Byte buffer holding tag data. */
+ ByteBuffer tagBuffer;
+
+ /**
+ * Package scope constructor.
+ */
+ WordData(CharsetDecoder decoder) {
+ this.decoder = decoder;
+
+ stemBuffer = ByteBuffer.allocate(0);
+ tagBuffer = ByteBuffer.allocate(0);
+ stemCharSequence = CharBuffer.allocate(0);
+ tagCharSequence = CharBuffer.allocate(0);
+ }
+
+ /**
+ * A constructor for tests only.
+ */
+ WordData(String stem, String tag, String encoding) {
+ this(Charset.forName(encoding).newDecoder());
+
+ try {
+ if (stem != null)
+ stemBuffer.put(stem.getBytes(encoding));
+ if (tag != null)
+ tagBuffer.put(tag.getBytes(encoding));
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Copy the stem's binary data (no charset decoding) to a custom byte
+ * buffer. If the buffer is null or not large enough to hold the result, a
+ * new buffer is allocated.
+ *
+ * @param target
+ * Target byte buffer to copy the stem buffer to or
+ * <code>null</code> if a new buffer should be allocated.
+ *
+ * @return Returns <code>target</code> or the new reallocated buffer.
+ */
+ public ByteBuffer getStemBytes(ByteBuffer target) {
+ target = BufferUtils.ensureCapacity(target, stemBuffer.remaining());
+ stemBuffer.mark();
+ target.put(stemBuffer);
+ stemBuffer.reset();
+ target.flip();
+ return target;
+ }
+
+ /**
+ * Copy the tag's binary data (no charset decoding) to a custom byte buffer.
+ * If the buffer is null or not large enough to hold the result, a new
+ * buffer is allocated.
+ *
+ * @param target
+ * Target byte buffer to copy the tag buffer to or
+ * <code>null</code> if a new buffer should be allocated.
+ *
+ * @return Returns <code>target</code> or the new reallocated buffer.
+ */
+ public ByteBuffer getTagBytes(ByteBuffer target) {
+ target = BufferUtils.ensureCapacity(target, tagBuffer.remaining());
+ tagBuffer.mark();
+ target.put(tagBuffer);
+ tagBuffer.reset();
+ target.flip();
+ return target;
+ }
+
+ /**
+ * Copy the inflected word's binary data (no charset decoding) to a custom
+ * byte buffer. If the buffer is null or not large enough to hold the
+ * result, a new buffer is allocated.
+ *
+ * @param target
+ * Target byte buffer to copy the word buffer to or
+ * <code>null</code> if a new buffer should be allocated.
+ *
+ * @return Returns <code>target</code> or the new reallocated buffer.
+ */
+ public ByteBuffer getWordBytes(ByteBuffer target) {
+ target = BufferUtils.ensureCapacity(target, wordBuffer.remaining());
+ wordBuffer.mark();
+ target.put(wordBuffer);
+ wordBuffer.reset();
+ target.flip();
+ return target;
+ }
+
+ /**
+ * @return Return tag data decoded to a character sequence or
+ * <code>null</code> if no associated tag data exists.
+ */
+ public CharSequence getTag() {
+ tagCharSequence = decode(tagBuffer, tagCharSequence);
+ return tagCharSequence.remaining() == 0 ? null : tagCharSequence;
+ }
+
+ /**
+ * @return Return stem data decoded to a character sequence or
+ * <code>null</code> if no associated stem data exists.
+ */
+ public CharSequence getStem() {
+ stemCharSequence = decode(stemBuffer, stemCharSequence);
+ return stemCharSequence.remaining() == 0 ? null : stemCharSequence;
+ }
+
+ /**
+ * @return Return inflected word form data. Usually the parameter passed to
+ * {@link DictionaryLookup#lookup(CharSequence)}.
+ */
+ public CharSequence getWord() {
+ return wordCharSequence;
+ }
+
+ /*
+ *
+ */
+ @Override
+ public boolean equals(Object obj) {
+ throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE);
+ }
+
+ /*
+ *
+ */
+ @Override
+ public int hashCode() {
+ throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE);
+ }
+
+ @Override
+ public String toString() {
+ return "WordData["
+ + this.getWord() + ","
+ + this.getStem() + ","
+ + this.getTag() + "]";
+ }
+
+ /**
+ * Declare a covariant of {@link Object#clone()} that returns a deep copy of
+ * this object. The content of all internal buffers is copied.
+ */
+ @Override
+ protected WordData clone() {
+ final WordData clone = new WordData(this.decoder);
+ clone.wordCharSequence = cloneCharSequence(wordCharSequence);
+ clone.wordBuffer = getWordBytes(null);
+ clone.stemBuffer = getStemBytes(null);
+ clone.tagBuffer = getTagBytes(null);
+ return clone;
+ }
+
+ /**
+ * Clone char sequences only if not immutable.
+ */
+ private CharSequence cloneCharSequence(CharSequence chs) {
+ if (chs instanceof String)
+ return chs;
+ return chs.toString();
+ }
+
+ /**
+ * Reset internal structures for storing another word's data.
+ */
+ void reset() {
+ this.wordCharSequence = null;
+ this.wordBuffer = null;
+ this.stemCharSequence.clear();
+ this.tagCharSequence.clear();
+ this.stemBuffer.clear();
+ this.tagBuffer.clear();
+ }
+
+ /**
+ * Decode byte buffer, optionally expanding the char buffer to.
+ */
+ private CharBuffer decode(ByteBuffer bytes, CharBuffer chars) {
+ chars.clear();
+ final int maxCapacity = (int) (bytes.remaining() * decoder.maxCharsPerByte());
+ if (chars.capacity() <= maxCapacity) {
+ chars = CharBuffer.allocate(maxCapacity);
+ }
+
+ bytes.mark();
+ decoder.reset();
+ decoder.decode(bytes, chars, true);
+ chars.flip();
+ bytes.reset();
+
+ return chars;
+ }
+}