summaryrefslogtreecommitdiff
path: root/morfologik-stemming/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-stemming/src/main')
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java111
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java233
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java333
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java143
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java403
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java298
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java139
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java11
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java20
-rw-r--r--morfologik-stemming/src/main/java/morfologik/stemming/WordData.java254
10 files changed, 1945 insertions, 0 deletions
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java b/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java
new file mode 100644
index 0000000..4c3f877
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java
@@ -0,0 +1,111 @@
+package morfologik.stemming;
+
+import java.util.*;
+
+/**
+ * A view over a range of an array.
+ */
+@SuppressWarnings("serial")
+final class ArrayViewList<E> extends AbstractList<E>
+ implements RandomAccess, java.io.Serializable
+{
+ /** Backing array. */
+ private E[] a;
+ private int start;
+ private int length;
+
+ /*
+ *
+ */
+ ArrayViewList(E[] array, int start, int length) {
+ if (array == null)
+ throw new IllegalArgumentException();
+ wrap(array, start, length);
+ }
+
+ /*
+ *
+ */
+ public int size() {
+ return length;
+ }
+
+ /*
+ *
+ */
+ public E get(int index) {
+ return a[start + index];
+ }
+
+ /*
+ *
+ */
+ public E set(int index, E element) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public void add(int index, E element) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public E remove(int index) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public boolean addAll(int index, Collection<? extends E> c) {
+ throw new UnsupportedOperationException();
+ }
+
+ /*
+ *
+ */
+ public int indexOf(Object o) {
+ if (o == null) {
+ for (int i = start; i < start + length; i++)
+ if (a[i] == null)
+ return i - start;
+ } else {
+ for (int i = start; i < start + length; i++)
+ if (o.equals(a[i]))
+ return i - start;
+ }
+ return -1;
+ }
+
+ public ListIterator<E> listIterator() {
+ return listIterator(0);
+ }
+
+ /*
+ *
+ */
+ public ListIterator<E> listIterator(final int index) {
+ return Arrays.asList(a).subList(start, start + length).listIterator(
+ index);
+ }
+
+ /*
+ *
+ */
+ public boolean contains(Object o) {
+ return indexOf(o) != -1;
+ }
+
+ /*
+ *
+ */
+ void wrap(E[] array, int start, int length) {
+ this.a = array;
+ this.start = start;
+ this.length = length;
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java b/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java
new file mode 100644
index 0000000..d72c85c
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java
@@ -0,0 +1,233 @@
+package morfologik.stemming;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+import java.util.WeakHashMap;
+
+import morfologik.fsa.FSA;
+import morfologik.util.FileUtils;
+import morfologik.util.ResourceUtils;
+
+/**
+ * A dictionary combines {@link FSA} automaton and metadata describing the
+ * internals of dictionary entries' coding ({@link DictionaryMetadata}).
+ *
+ * <p>
+ * A dictionary consists of two files:
+ * <ul>
+ * <li>an actual compressed FSA file,
+ * <li>a metadata file, describing the dictionary.
+ * </ul>
+ * Use static methods in this class to read dictionaries and their metadata.
+ */
+public final class Dictionary {
+ /**
+ * Expected metadata file extension.
+ */
+ public final static String METADATA_FILE_EXTENSION = "info";
+
+ /**
+ * {@link FSA} automaton with the compiled dictionary data.
+ */
+ public final FSA fsa;
+
+ /**
+ * Metadata associated with the dictionary.
+ */
+ public final DictionaryMetadata metadata;
+
+ /**
+ * Default loaded dictionaries.
+ */
+ public static final WeakHashMap<String, Dictionary> defaultDictionaries = new WeakHashMap<String, Dictionary>();
+
+ /**
+ * It is strongly recommended to use static methods in this class for
+ * reading dictionaries.
+ *
+ * @param fsa
+ * An instantiated {@link FSA} instance.
+ *
+ * @param metadata
+ * A map of attributes describing the compression format and
+ * other settings not contained in the FSA automaton. For an
+ * explanation of available attributes and their possible values,
+ * see {@link DictionaryMetadata}.
+ */
+ public Dictionary(FSA fsa, DictionaryMetadata metadata) {
+ this.fsa = fsa;
+ this.metadata = metadata;
+ }
+
+ /**
+ * Attempts to load a dictionary using the path to the FSA file and the
+ * expected metadata extension.
+ */
+ public static Dictionary read(File fsaFile) throws IOException {
+ final File featuresFile = new File(fsaFile.getParent(),
+ getExpectedFeaturesName(fsaFile.getName()));
+
+ FileUtils.assertExists(featuresFile, true, false);
+
+ return readAndClose(
+ new FileInputStream(fsaFile),
+ new FileInputStream(featuresFile));
+ }
+
+ /**
+ * <p>
+ * Attempts to load a dictionary using the URL to the FSA file and the
+ * expected metadata extension.
+ *
+ * <p>
+ * This method can be used to load resource-based dictionaries, but be aware
+ * of JAR resource-locking issues that arise from resource URLs.
+ */
+ public static Dictionary read(URL fsaURL) throws IOException {
+ final String fsa = fsaURL.toExternalForm();
+ final String features = getExpectedFeaturesName(fsa);
+
+ return readAndClose(
+ ResourceUtils.openInputStream(fsa),
+ ResourceUtils.openInputStream(features));
+ }
+
+ /**
+ * Attempts to load a dictionary from opened streams of FSA dictionary data
+ * and associated metadata.
+ */
+ public static Dictionary readAndClose(InputStream fsaData, InputStream featuresData)
+ throws IOException
+ {
+ try {
+ Map<DictionaryAttribute, String> map = new HashMap<DictionaryAttribute, String>();
+ final Properties properties = new Properties();
+ properties.load(new InputStreamReader(featuresData, "UTF-8"));
+
+ // Handle back-compatibility for encoder specification.
+ if (!properties.containsKey(DictionaryAttribute.ENCODER.propertyName)) {
+ boolean usesSuffixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-suffixes", "true"));
+ boolean usesPrefixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-prefixes", "false"));
+ boolean usesInfixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-infixes", "false"));
+
+ if (usesInfixes) {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.INFIX.name());
+ } else if (usesPrefixes) {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.PREFIX.name());
+ } else if (usesSuffixes) {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.SUFFIX.name());
+ } else {
+ map.put(DictionaryAttribute.ENCODER, EncoderType.NONE.name());
+ }
+
+ properties.remove("fsa.dict.uses-suffixes");
+ properties.remove("fsa.dict.uses-prefixes");
+ properties.remove("fsa.dict.uses-infixes");
+ }
+
+ for (Enumeration<?> e = properties.propertyNames(); e.hasMoreElements();) {
+ String key = (String) e.nextElement();
+ map.put(DictionaryAttribute.fromPropertyName(key), properties.getProperty(key));
+ }
+ final DictionaryMetadata features = new DictionaryMetadata(map);
+ final FSA fsa = FSA.read(fsaData);
+
+ return new Dictionary(fsa, features);
+ } finally {
+ FileUtils.close(fsaData, featuresData);
+ }
+ }
+
+ /**
+ * Returns the expected name of the metadata file, based on the name of the
+ * FSA dictionary file. The expected name is resolved by truncating any
+ * suffix of <code>name</code> and appending
+ * {@link #METADATA_FILE_EXTENSION}.
+ */
+ public static String getExpectedFeaturesName(String name) {
+ final int dotIndex = name.lastIndexOf('.');
+ final String featuresName;
+ if (dotIndex >= 0) {
+ featuresName = name.substring(0, dotIndex) + "."
+ + METADATA_FILE_EXTENSION;
+ } else {
+ featuresName = name + "." + METADATA_FILE_EXTENSION;
+ }
+
+ return featuresName;
+ }
+
+ /**
+ * Return a built-in dictionary for a given ISO language code. Dictionaries
+ * are cached internally for potential reuse.
+ *
+ * @throws RuntimeException
+ * Throws a {@link RuntimeException} if the dictionary is not
+ * bundled with the library.
+ */
+ public static Dictionary getForLanguage(String languageCode) {
+ if (languageCode == null || "".equals(languageCode)) {
+ throw new IllegalArgumentException(
+ "Language code must not be empty.");
+ }
+
+ synchronized (defaultDictionaries) {
+ Dictionary dict = defaultDictionaries.get(languageCode);
+ if (dict != null)
+ return dict;
+
+ try {
+ final String dictPath = "morfologik/dictionaries/" + languageCode + ".dict";
+ final String metaPath = Dictionary
+ .getExpectedFeaturesName(dictPath);
+
+ dict = Dictionary.readAndClose(
+ ResourceUtils.openInputStream(dictPath),
+ ResourceUtils.openInputStream(metaPath));
+
+ defaultDictionaries.put(languageCode, dict);
+ return dict;
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Default dictionary resource for language '"
+ + languageCode + "not found.", e);
+ }
+ }
+ }
+
+ /**
+ * Converts the words on input or output according to conversion tables.
+ *
+ * Useful if the input words need to be normalized (i.e., ligatures,
+ * apostrophes and such).
+ *
+ * @param str - input character sequence to be converted
+ * @param conversionMap - conversion map used to convert the string (a map
+ * from String to String)
+ * @return a converted string.
+ *
+ * @since 1.9.0
+ *
+ */
+ public static CharSequence convertText(final CharSequence str, final Map<String, String> conversionMap) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(str);
+ for (final String auxKey : conversionMap.keySet()) {
+ int index = sb.indexOf(auxKey);
+ while (index != -1) {
+ sb.replace(index, index + auxKey.length(), conversionMap.get(auxKey));
+ index = sb.indexOf(auxKey);
+ }
+ }
+ return sb.toString();
+ }
+
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java
new file mode 100644
index 0000000..1249800
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java
@@ -0,0 +1,333 @@
+package morfologik.stemming;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Attributes applying to {@link Dictionary} and {@link DictionaryMetadata}.
+ */
+public enum DictionaryAttribute {
+ /**
+ * Logical fields separator inside the FSA.
+ */
+ SEPARATOR("fsa.dict.separator") {
+ @Override
+ public Character fromString(String separator) {
+ if (separator == null || separator.length() != 1) {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " must be a single character.");
+ }
+
+ char charValue = separator.charAt(0);
+ if (Character.isHighSurrogate(charValue) ||
+ Character.isLowSurrogate(charValue)) {
+ throw new IllegalArgumentException(
+ "Field separator character cannot be part of a surrogate pair: " + separator);
+ }
+
+ return charValue;
+ }
+ },
+
+ /**
+ * Character to byte encoding used for strings inside the FSA.
+ */
+ ENCODING("fsa.dict.encoding") {
+ @Override
+ public Charset fromString(String charsetName) {
+ return Charset.forName(charsetName);
+ }
+ },
+
+ /**
+ * If the FSA dictionary includes frequency data.
+ */
+ FREQUENCY_INCLUDED("fsa.dict.frequency-included") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore words containing digits
+ */
+ IGNORE_NUMBERS("fsa.dict.speller.ignore-numbers") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore punctuation.
+ */
+ IGNORE_PUNCTUATION("fsa.dict.speller.ignore-punctuation") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore CamelCase words.
+ */
+ IGNORE_CAMEL_CASE("fsa.dict.speller.ignore-camel-case") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore ALL UPPERCASE words.
+ */
+ IGNORE_ALL_UPPERCASE("fsa.dict.speller.ignore-all-uppercase") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to ignore diacritics, so that
+ * 'a' would be treated as equivalent to 'ą'.
+ */
+ IGNORE_DIACRITICS("fsa.dict.speller.ignore-diacritics") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * if the spelling dictionary is supposed to treat upper and lower case
+ * as equivalent.
+ */
+ CONVERT_CASE("fsa.dict.speller.convert-case") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /**
+ * If the spelling dictionary is supposed to split runOnWords.
+ */
+ RUN_ON_WORDS("fsa.dict.speller.runon-words") {
+ @Override
+ public Boolean fromString(String value) {
+ return booleanValue(value);
+ }
+ },
+
+ /** Locale associated with the dictionary. */
+ LOCALE("fsa.dict.speller.locale") {
+ @Override
+ public Locale fromString(String value) {
+ return new Locale(value);
+ }
+ },
+
+ /** Locale associated with the dictionary. */
+ ENCODER("fsa.dict.encoder") {
+ @Override
+ public EncoderType fromString(String value) {
+ return EncoderType.valueOf(value.toUpperCase(Locale.ROOT));
+ }
+ },
+
+ /**
+ * Input conversion pairs to replace non-standard characters before search in a speller dictionary.
+ * For example, common ligatures can be replaced here.
+ */
+ INPUT_CONVERSION("fsa.dict.input-conversion") {
+ @Override
+ public Map<String, String> fromString(String value) throws IllegalArgumentException {
+ Map<String, String> conversionPairs = new HashMap<String, String>();
+ final String[] replacements = value.split(",\\s*");
+ for (final String stringPair : replacements) {
+ final String[] twoStrings = stringPair.trim().split(" ");
+ if (twoStrings.length == 2) {
+ if (!conversionPairs.containsKey(twoStrings[0])) {
+ conversionPairs.put(twoStrings[0], twoStrings[1]);
+ } else {
+ throw new IllegalArgumentException(
+ "Input conversion cannot specify different values for the same input string: " + twoStrings[0]);
+ }
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return conversionPairs;
+ }
+ },
+
+ /**
+ * Output conversion pairs to replace non-standard characters before search in a speller dictionary.
+ * For example, standard characters can be replaced here into ligatures.
+ *
+ * Useful for dictionaries that do have certain standards imposed.
+ *
+ */
+ OUTPUT_CONVERSION ("fsa.dict.output-conversion") {
+ @Override
+ public Map<String, String> fromString(String value) throws IllegalArgumentException {
+ Map<String, String> conversionPairs = new HashMap<String, String>();
+ final String[] replacements = value.split(",\\s*");
+ for (final String stringPair : replacements) {
+ final String[] twoStrings = stringPair.trim().split(" ");
+ if (twoStrings.length == 2) {
+ if (!conversionPairs.containsKey(twoStrings[0])) {
+ conversionPairs.put(twoStrings[0], twoStrings[1]);
+ } else {
+ throw new IllegalArgumentException(
+ "Input conversion cannot specify different values for the same input string: " + twoStrings[0]);
+ }
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return conversionPairs;
+ }
+ },
+
+ /**
+ * Replacement pairs for non-obvious candidate search in a speller dictionary.
+ * For example, Polish <tt>rz</tt> is phonetically equivalent to <tt>ż</tt>,
+ * and this may be specified here to allow looking for replacements of <tt>rz</tt> with <tt>ż</tt>
+ * and vice versa.
+ */
+ REPLACEMENT_PAIRS("fsa.dict.speller.replacement-pairs") {
+ @Override
+ public Map<String, List<String>> fromString(String value) throws IllegalArgumentException {
+ Map<String, List<String>> replacementPairs = new HashMap<String, List<String>>();
+ final String[] replacements = value.split(",\\s*");
+ for (final String stringPair : replacements) {
+ final String[] twoStrings = stringPair.trim().split(" ");
+ if (twoStrings.length == 2) {
+ if (!replacementPairs.containsKey(twoStrings[0])) {
+ List<String> strList = new ArrayList<String>();
+ strList.add(twoStrings[1]);
+ replacementPairs.put(twoStrings[0], strList);
+ } else {
+ replacementPairs.get(twoStrings[0]).add(twoStrings[1]);
+ }
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return replacementPairs;
+ }
+ },
+
+ /**
+ * Equivalent characters (treated similarly as equivalent chars with and without
+ * diacritics). For example, Polish <tt>ł</tt> can be specified as equivalent to <tt>l</tt>.
+ *
+ * <p>This implements a feature similar to hunspell MAP in the affix file.
+ */
+ EQUIVALENT_CHARS("fsa.dict.speller.equivalent-chars") {
+ @Override
+ public Map<Character, List<Character>> fromString(String value) throws IllegalArgumentException {
+ Map<Character, List<Character>> equivalentCharacters =
+ new HashMap<Character, List<Character>>();
+ final String[] eqChars = value.split(",\\s*");
+ for (final String characterPair : eqChars) {
+ final String[] twoChars = characterPair.trim().split(" ");
+ if (twoChars.length == 2
+ && twoChars[0].length() == 1
+ && twoChars[1].length() == 1) {
+ char fromChar = twoChars[0].charAt(0);
+ char toChar = twoChars[1].charAt(0);
+ if (!equivalentCharacters.containsKey(fromChar)) {
+ List<Character> chList = new ArrayList<Character>();
+ equivalentCharacters.put(fromChar, chList);
+ }
+ equivalentCharacters.get(fromChar).add(toChar);
+ } else {
+ throw new IllegalArgumentException("Attribute " + propertyName
+ + " is not in the proper format: " + value);
+ }
+ }
+ return equivalentCharacters;
+ }
+ },
+
+ /**
+ * Dictionary license attribute.
+ */
+ LICENSE("fsa.dict.license"),
+
+ /**
+ * Dictionary author.
+ */
+ AUTHOR("fsa.dict.author"),
+
+ /**
+ * Dictionary creation date.
+ */
+ CREATION_DATE("fsa.dict.created");
+
+ /**
+ * Property name for this attribute.
+ */
+ public final String propertyName;
+
+ /**
+ * Converts a string to the given attribute's value (covariants used).
+ *
+ * @throws IllegalArgumentException
+ * If the input string cannot be converted to the attribute's
+ * value.
+ */
+ public Object fromString(String value) throws IllegalArgumentException {
+ return value;
+ }
+
+ /**
+ * Return an {@link DictionaryAttribute} by its {@link #propertyName}.
+ */
+ public static DictionaryAttribute fromPropertyName(String propertyName) {
+ DictionaryAttribute value = attrsByPropertyName.get(propertyName);
+ if (value == null) {
+ throw new IllegalArgumentException("No attribute for property: " + propertyName);
+ }
+ return value;
+ }
+
+ private static final Map<String,DictionaryAttribute> attrsByPropertyName;
+ static {
+ attrsByPropertyName = new HashMap<String,DictionaryAttribute>();
+ for (DictionaryAttribute attr : DictionaryAttribute.values()) {
+ if (attrsByPropertyName.put(attr.propertyName, attr) != null) {
+ throw new RuntimeException("Duplicate property key for: " + attr);
+ }
+ }
+ }
+
+ /**
+ * Private enum instance constructor.
+ */
+ private DictionaryAttribute(String propertyName) {
+ this.propertyName = propertyName;
+ }
+
+ private static Boolean booleanValue(String value) {
+ value = value.toLowerCase();
+ if ("true".equals(value) || "yes".equals(value) || "on".equals(value)) {
+ return Boolean.TRUE;
+ }
+ if ("false".equals(value) || "no".equals(value) || "off".equals(value)) {
+ return Boolean.FALSE;
+ }
+ throw new IllegalArgumentException("Not a boolean value: " + value);
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java
new file mode 100644
index 0000000..104ff58
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java
@@ -0,0 +1,143 @@
+package morfologik.stemming;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.util.Iterator;
+
+import morfologik.util.BufferUtils;
+
+/**
+ * An iterator over {@link WordData} entries of a {@link Dictionary}. The stems
+ * can be decoded from compressed format or the compressed form can be
+ * preserved.
+ */
+public final class DictionaryIterator implements Iterator<WordData> {
+ private final CharsetDecoder decoder;
+ private final Iterator<ByteBuffer> entriesIter;
+ private final WordData entry;
+ private final byte separator;
+ private final DictionaryMetadata dictionaryMetadata;
+ private final boolean decodeStems;
+
+ private ByteBuffer inflectedBuffer = ByteBuffer.allocate(0);
+ private CharBuffer inflectedCharBuffer = CharBuffer.allocate(0);
+ private ByteBuffer temp = ByteBuffer.allocate(0);
+
+ public DictionaryIterator(Dictionary dictionary, CharsetDecoder decoder,
+ boolean decodeStems) {
+ this.entriesIter = dictionary.fsa.iterator();
+ this.separator = dictionary.metadata.getSeparator();
+ this.dictionaryMetadata = dictionary.metadata;
+ this.decoder = decoder;
+ this.entry = new WordData(decoder);
+ this.decodeStems = decodeStems;
+ }
+
+ public boolean hasNext() {
+ return entriesIter.hasNext();
+ }
+
+ public WordData next() {
+ final ByteBuffer entryBuffer = entriesIter.next();
+ entry.reset();
+
+ /*
+ * Entries are typically: inflected<SEP>codedBase<SEP>tag so try to find
+ * this split.
+ */
+ byte[] ba = entryBuffer.array();
+ int bbSize = entryBuffer.remaining();
+
+ int sepPos;
+ for (sepPos = 0; sepPos < bbSize; sepPos++) {
+ if (ba[sepPos] == separator)
+ break;
+ }
+
+ if (sepPos == bbSize) {
+ throw new RuntimeException("Invalid dictionary "
+ + "entry format (missing separator).");
+ }
+
+ inflectedBuffer.clear();
+ inflectedBuffer = BufferUtils.ensureCapacity(inflectedBuffer, sepPos);
+ inflectedBuffer.put(ba, 0, sepPos);
+ inflectedBuffer.flip();
+
+ inflectedCharBuffer = bytesToChars(inflectedBuffer, inflectedCharBuffer);
+ entry.wordBuffer = inflectedBuffer;
+ entry.wordCharSequence = inflectedCharBuffer;
+
+ temp.clear();
+ temp = BufferUtils.ensureCapacity(temp, bbSize - sepPos);
+ sepPos++;
+ temp.put(ba, sepPos, bbSize - sepPos);
+ temp.flip();
+
+ ba = temp.array();
+ bbSize = temp.remaining();
+
+ /*
+ * Find the next separator byte's position splitting word form and tag.
+ */
+ sepPos = 0;
+ for (; sepPos < bbSize; sepPos++) {
+ if (ba[sepPos] == separator)
+ break;
+ }
+
+ /*
+ * Decode the stem into stem buffer.
+ */
+ entry.stemBuffer.clear();
+ if (decodeStems) {
+ entry.stemBuffer = DictionaryLookup.decodeBaseForm(entry.stemBuffer,
+ ba, sepPos, inflectedBuffer, dictionaryMetadata);
+ } else {
+ entry.stemBuffer = BufferUtils.ensureCapacity(entry.stemBuffer, sepPos);
+ entry.stemBuffer.put(ba, 0, sepPos);
+ }
+ entry.stemBuffer.flip();
+
+ // Skip separator character, if present.
+ if (sepPos + 1 <= bbSize) {
+ sepPos++;
+ }
+
+ /*
+ * Decode the tag data.
+ */
+ entry.tagBuffer = BufferUtils.ensureCapacity(entry.tagBuffer, bbSize
+ - sepPos);
+ entry.tagBuffer.clear();
+ entry.tagBuffer.put(ba, sepPos, bbSize - sepPos);
+ entry.tagBuffer.flip();
+
+ return entry;
+ }
+
+ /**
+ * Decode the byte buffer, optionally expanding the char buffer.
+ */
+ private CharBuffer bytesToChars(ByteBuffer bytes, CharBuffer chars) {
+ chars.clear();
+ final int maxCapacity = (int) (bytes.remaining() * decoder
+ .maxCharsPerByte());
+ if (chars.capacity() <= maxCapacity) {
+ chars = CharBuffer.allocate(maxCapacity);
+ }
+
+ bytes.mark();
+ decoder.reset();
+ decoder.decode(bytes, chars, true);
+ chars.flip();
+ bytes.reset();
+
+ return chars;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java
new file mode 100644
index 0000000..5bb0716
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java
@@ -0,0 +1,403 @@
+package morfologik.stemming;
+
+import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSAFinalStatesIterator;
+import morfologik.fsa.FSATraversal;
+import morfologik.fsa.MatchResult;
+import morfologik.util.BufferUtils;
+
+/**
+ * This class implements a dictionary lookup over an FSA dictionary. The
+ * dictionary for this class should be prepared from a text file using Jan
+ * Daciuk's FSA package (see link below).
+ *
+ * <p>
+ * <b>Important:</b> finite state automatons in Jan Daciuk's implementation use
+ * <em>bytes</em> not unicode characters. Therefore objects of this class always
+ * have to be constructed with an encoding used to convert Java strings to byte
+ * arrays and the other way around. You <b>can</b> use UTF-8 encoding, as it
+ * should not conflict with any control sequences and separator characters.
+ *
+ * @see <a href="http://www.eti.pg.gda.pl/~jandac/fsa.html">FSA package Web
+ * site</a>
+ */
+public final class DictionaryLookup implements IStemmer, Iterable<WordData> {
+ private static final int REMOVE_EVERYTHING = 255;
+
+ /** An FSA used for lookups. */
+ private final FSATraversal matcher;
+
+ /** An iterator for walking along the final states of {@link #fsa}. */
+ private final FSAFinalStatesIterator finalStatesIterator;
+
+ /** FSA's root node. */
+ private final int rootNode;
+
+ /** Expand buffers and arrays by this constant. */
+ private final static int EXPAND_SIZE = 10;
+
+ /** Private internal array of reusable word data objects. */
+ private WordData[] forms = new WordData[0];
+
+	/** A "view" over a prefix of {@link #forms}, exposed to callers as a {@link java.util.List}. */
+ private final ArrayViewList<WordData> formsList = new ArrayViewList<WordData>(
+ forms, 0, forms.length);
+
+ /**
+ * Features of the compiled dictionary.
+ *
+ * @see DictionaryMetadata
+ */
+ private final DictionaryMetadata dictionaryMetadata;
+
+ /**
+ * Charset encoder for the FSA.
+ */
+ private final CharsetEncoder encoder;
+
+ /**
+ * Charset decoder for the FSA.
+ */
+ private final CharsetDecoder decoder;
+
+ /**
+ * The FSA we are using.
+ */
+ private final FSA fsa;
+
+ /**
+ * @see #getSeparatorChar()
+ */
+ private final char separatorChar;
+
+ /**
+ * Internal reusable buffer for encoding words into byte arrays using
+ * {@link #encoder}.
+ */
+ private ByteBuffer byteBuffer = ByteBuffer.allocate(0);
+
+ /**
+ * Internal reusable buffer for encoding words into byte arrays using
+ * {@link #encoder}.
+ */
+ private CharBuffer charBuffer = CharBuffer.allocate(0);
+
+ /**
+ * Reusable match result.
+ */
+ private final MatchResult matchResult = new MatchResult();
+
+ /**
+ * The {@link Dictionary} this lookup is using.
+ */
+ private final Dictionary dictionary;
+
+ /**
+ * <p>
+ * Creates a new object of this class using the given FSA for word lookups
+ * and encoding for converting characters to bytes.
+ *
+ * @throws IllegalArgumentException
+ * if FSA's root node cannot be acquired (dictionary is empty).
+ */
+ public DictionaryLookup(Dictionary dictionary)
+ throws IllegalArgumentException {
+ this.dictionary = dictionary;
+ this.dictionaryMetadata = dictionary.metadata;
+ this.rootNode = dictionary.fsa.getRootNode();
+ this.fsa = dictionary.fsa;
+ this.matcher = new FSATraversal(fsa);
+ this.finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode());
+
+ if (rootNode == 0) {
+ throw new IllegalArgumentException(
+ "Dictionary must have at least the root node.");
+ }
+
+ if (dictionaryMetadata == null) {
+ throw new IllegalArgumentException(
+ "Dictionary metadata must not be null.");
+ }
+
+ decoder = dictionary.metadata.getDecoder();
+ encoder = dictionary.metadata.getEncoder();
+ separatorChar = dictionary.metadata.getSeparatorAsChar();
+ }
+
+ /**
+ * Searches the automaton for a symbol sequence equal to <code>word</code>,
+ * followed by a separator. The result is a stem (decompressed accordingly
+ * to the dictionary's specification) and an optional tag data.
+ */
+ @Override
+ public List<WordData> lookup(CharSequence word) {
+ final byte separator = dictionaryMetadata.getSeparator();
+
+ if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
+ word = Dictionary.convertText(word, dictionaryMetadata.getInputConversionPairs());
+ }
+
+ // Reset the output list to zero length.
+ formsList.wrap(forms, 0, 0);
+
+ // Encode word characters into bytes in the same encoding as the FSA's.
+ charBuffer.clear();
+ charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
+ for (int i = 0; i < word.length(); i++) {
+ char chr = word.charAt(i);
+ if (chr == separatorChar)
+ return formsList;
+ charBuffer.put(chr);
+ }
+ charBuffer.flip();
+ byteBuffer = charsToBytes(charBuffer, byteBuffer);
+
+ // Try to find a partial match in the dictionary.
+ final MatchResult match = matcher.match(matchResult, byteBuffer
+ .array(), 0, byteBuffer.remaining(), rootNode);
+
+ if (match.kind == SEQUENCE_IS_A_PREFIX) {
+ /*
+ * The entire sequence exists in the dictionary. A separator should
+ * be the next symbol.
+ */
+ final int arc = fsa.getArc(match.node, separator);
+
+ /*
+ * The situation when the arc points to a final node should NEVER
+ * happen. After all, we want the word to have SOME base form.
+ */
+ if (arc != 0 && !fsa.isArcFinal(arc)) {
+ // There is such a word in the dictionary. Return its base forms.
+ int formsCount = 0;
+
+ finalStatesIterator.restartFrom(fsa.getEndNode(arc));
+ while (finalStatesIterator.hasNext()) {
+ final ByteBuffer bb = finalStatesIterator.next();
+ final byte[] ba = bb.array();
+ final int bbSize = bb.remaining();
+
+ if (formsCount >= forms.length) {
+ forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE);
+ for (int k = 0; k < forms.length; k++) {
+ if (forms[k] == null)
+ forms[k] = new WordData(decoder);
+ }
+ }
+
+ /*
+ * Now, expand the prefix/ suffix 'compression' and store
+ * the base form.
+ */
+ final WordData wordData = forms[formsCount++];
+ wordData.reset();
+
+ wordData.wordBuffer = byteBuffer;
+ if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) {
+ wordData.wordCharSequence = word;
+ } else {
+ wordData.wordCharSequence = Dictionary.convertText(word,
+ dictionaryMetadata.getOutputConversionPairs());
+ }
+
+ /*
+ * Find the separator byte's position splitting the inflection instructions
+ * from the tag.
+ */
+ int sepPos;
+ for (sepPos = 0; sepPos < bbSize; sepPos++) {
+ if (ba[sepPos] == separator)
+ break;
+ }
+
+ /*
+ * Decode the stem into stem buffer.
+ */
+ wordData.stemBuffer.clear();
+ wordData.stemBuffer = decodeBaseForm(wordData.stemBuffer, ba,
+ sepPos, byteBuffer, dictionaryMetadata);
+ wordData.stemBuffer.flip();
+
+ // Skip separator character.
+ sepPos++;
+
+ /*
+ * Decode the tag data.
+ */
+ final int tagSize = bbSize - sepPos;
+ if (tagSize > 0) {
+ wordData.tagBuffer = BufferUtils.ensureCapacity(
+ wordData.tagBuffer, tagSize);
+ wordData.tagBuffer.clear();
+ wordData.tagBuffer.put(ba, sepPos, tagSize);
+ wordData.tagBuffer.flip();
+ }
+ }
+
+ formsList.wrap(forms, 0, formsCount);
+ }
+ } else {
+ /*
+ * this case is somewhat confusing: we should have hit the separator
+ * first... I don't really know how to deal with it at the time
+ * being.
+ */
+ }
+ return formsList;
+ }
+
+ /**
+ * Decode the base form of an inflected word and save its decoded form into
+ * a byte buffer.
+ *
+ * @param output
+ * The byte buffer to save the result to. A new buffer may be
+ * allocated if the capacity of <code>bb</code> is not large
+ * enough to store the result. The buffer is not flipped upon
+ * return.
+ *
+ * @param inflectedForm
+ * Inflected form's bytes (decoded properly).
+ *
+ * @param encoded
+ * Bytes of the encoded base form, starting at 0 index.
+ *
+ * @param encodedLen
+ * Length of the encode base form.
+ *
+ * @return Returns either <code>bb</code> or a new buffer whose capacity is
+ * large enough to store the output of the decoded data.
+ */
+ public static ByteBuffer decodeBaseForm(
+ ByteBuffer output,
+ byte[] encoded,
+ int encodedLen,
+ ByteBuffer inflectedForm,
+ DictionaryMetadata metadata) {
+
+ // FIXME: We should eventually get rid of this method and use
+ // each encoder's #decode method. The problem is that we'd have to include
+ // HPPC or roundtrip via HPPC to a ByteBuffer, which would slow things down.
+ // Since this is performance-crucial routine, I leave it for now.
+
+ // Prepare the buffer.
+ output.clear();
+
+ assert inflectedForm.position() == 0;
+
+ // Increase buffer size (overallocating), if needed.
+ final byte[] src = inflectedForm.array();
+ final int srcLen = inflectedForm.remaining();
+ if (output.capacity() < srcLen + encodedLen) {
+ output = ByteBuffer.allocate(srcLen + encodedLen);
+ }
+
+ switch (metadata.getEncoderType()) {
+ case SUFFIX:
+ int suffixTrimCode = encoded[0];
+ int truncateBytes = suffixTrimCode - 'A' & 0xFF;
+ if (truncateBytes == REMOVE_EVERYTHING) {
+ truncateBytes = srcLen;
+ }
+ output.put(src, 0, srcLen - truncateBytes);
+ output.put(encoded, 1, encodedLen - 1);
+ break;
+
+ case PREFIX:
+ int truncatePrefixBytes = encoded[0] - 'A' & 0xFF;
+ int truncateSuffixBytes = encoded[1] - 'A' & 0xFF;
+ if (truncatePrefixBytes == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) {
+ truncatePrefixBytes = srcLen;
+ truncateSuffixBytes = 0;
+ }
+ output.put(src, truncatePrefixBytes, srcLen - (truncateSuffixBytes + truncatePrefixBytes));
+ output.put(encoded, 2, encodedLen - 2);
+ break;
+
+ case INFIX:
+ int infixIndex = encoded[0] - 'A' & 0xFF;
+ int infixLength = encoded[1] - 'A' & 0xFF;
+ truncateSuffixBytes = encoded[2] - 'A' & 0xFF;
+ if (infixLength == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) {
+ infixIndex = 0;
+ infixLength = srcLen;
+ truncateSuffixBytes = 0;
+ }
+ output.put(src, 0, infixIndex);
+ output.put(src, infixIndex + infixLength, srcLen - (infixIndex + infixLength + truncateSuffixBytes));
+ output.put(encoded, 3, encodedLen - 3);
+ break;
+
+ case NONE:
+ output.put(encoded, 0, encodedLen);
+ break;
+
+ default:
+ throw new RuntimeException("Unhandled switch/case: " + metadata.getEncoderType());
+ }
+
+ return output;
+ }
+
+ /**
+ * Encode a character sequence into a byte buffer, optionally expanding
+ * buffer.
+ */
+ private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) {
+ bytes.clear();
+ final int maxCapacity = (int) (chars.remaining() * encoder
+ .maxBytesPerChar());
+ if (bytes.capacity() <= maxCapacity) {
+ bytes = ByteBuffer.allocate(maxCapacity);
+ }
+
+ chars.mark();
+ encoder.reset();
+ if (encoder.encode(chars, bytes, true).isError()) {
+ // remove everything, we don't want to accept malformed input
+ bytes.clear();
+ }
+ bytes.flip();
+ chars.reset();
+
+ return bytes;
+ }
+
+ /**
+ * Return an iterator over all {@link WordData} entries available in the
+ * embedded {@link Dictionary}.
+ */
+ @Override
+ public Iterator<WordData> iterator() {
+ return new DictionaryIterator(dictionary, decoder, true);
+ }
+
+ /**
+ * @return Return the {@link Dictionary} used by this object.
+ */
+ public Dictionary getDictionary() {
+ return dictionary;
+ }
+
+ /**
+ * @return Returns the logical separator character splitting inflected form,
+ * lemma correction token and a tag. Note that this character is a best-effort
+ * conversion from a byte in {@link DictionaryMetadata#separator} and
+ * may not be valid in the target encoding (although this is highly unlikely).
+ */
+ public char getSeparatorChar() {
+ return separatorChar;
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java
new file mode 100644
index 0000000..1475de6
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java
@@ -0,0 +1,298 @@
+package morfologik.stemming;
+
+import static morfologik.stemming.DictionaryAttribute.CONVERT_CASE;
+import static morfologik.stemming.DictionaryAttribute.ENCODING;
+import static morfologik.stemming.DictionaryAttribute.FREQUENCY_INCLUDED;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_ALL_UPPERCASE;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_CAMEL_CASE;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_DIACRITICS;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_NUMBERS;
+import static morfologik.stemming.DictionaryAttribute.IGNORE_PUNCTUATION;
+import static morfologik.stemming.DictionaryAttribute.RUN_ON_WORDS;
+import static morfologik.stemming.DictionaryAttribute.SEPARATOR;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Description of attributes, their types and default values.
+ *
+ * @see Dictionary
+ */
+public final class DictionaryMetadata {
+ /**
+ * Default attribute values.
+ */
+ private static Map<DictionaryAttribute, String> DEFAULT_ATTRIBUTES = new DictionaryMetadataBuilder()
+ .separator('+')
+ .encoder(EncoderType.SUFFIX)
+ .frequencyIncluded()
+ .ignorePunctuation()
+ .ignoreNumbers()
+ .ignoreCamelCase()
+ .ignoreAllUppercase()
+ .ignoreDiacritics()
+ .convertCase()
+ .supportRunOnWords()
+ .toMap();
+
+ /**
+ * Required attributes.
+ */
+ private static EnumSet<DictionaryAttribute> REQUIRED_ATTRIBUTES = EnumSet.of(
+ SEPARATOR,
+ ENCODING);
+
+ /**
+ * A separator character between fields (stem, lemma, form). The character
+ * must be within byte range (FSA uses bytes internally).
+ */
+ private byte separator;
+ private char separatorChar;
+
+ /**
+ * Encoding used for converting bytes to characters and vice versa.
+ */
+ private String encoding;
+
+ private Charset charset;
+ private Locale locale = Locale.getDefault();
+
+ /**
+ * Replacement pairs for non-obvious candidate search in a speller dictionary.
+ */
+ private Map<String, List<String>> replacementPairs = Collections.emptyMap();
+
+ /**
+ * Conversion pairs for input conversion, for example to replace ligatures.
+ */
+ private Map<String, String> inputConversion = Collections.emptyMap();
+
+ /**
+ * Conversion pairs for output conversion, for example to replace ligatures.
+ */
+ private Map<String, String> outputConversion = Collections.emptyMap();
+
+ /**
+ * Equivalent characters (treated similarly as equivalent chars with and without
+ * diacritics). For example, Polish <tt>ł</tt> can be specified as equivalent to <tt>l</tt>.
+ *
+ * This implements a feature similar to hunspell MAP in the affix file.
+ */
+ private Map<Character, List<Character>> equivalentChars = Collections.emptyMap();
+
+ /**
+ * All attributes.
+ */
+ private final EnumMap<DictionaryAttribute, String> attributes;
+
+ /**
+ * All "enabled" boolean attributes.
+ */
+ private final EnumMap<DictionaryAttribute,Boolean> boolAttributes;
+
+ /**
+ * Sequence encoder.
+ */
+ private EncoderType encoderType;
+
+ /**
+ * Return all attributes.
+ */
+ public Map<DictionaryAttribute, String> getAttributes() {
+ return Collections.unmodifiableMap(attributes);
+ }
+
+ // Cached attrs.
+ public String getEncoding() { return encoding; }
+ public byte getSeparator() { return separator; }
+ public Locale getLocale() { return locale; }
+
+ public Map<String, String> getInputConversionPairs() { return inputConversion; }
+ public Map<String, String> getOutputConversionPairs() { return outputConversion; }
+
+ public Map<String, List<String>> getReplacementPairs() { return replacementPairs; }
+ public Map<Character, List<Character>> getEquivalentChars() { return equivalentChars; }
+
+ // Dynamically fetched.
+ public boolean isFrequencyIncluded() { return boolAttributes.get(FREQUENCY_INCLUDED); }
+ public boolean isIgnoringPunctuation() { return boolAttributes.get(IGNORE_PUNCTUATION); }
+ public boolean isIgnoringNumbers() { return boolAttributes.get(IGNORE_NUMBERS); }
+ public boolean isIgnoringCamelCase() { return boolAttributes.get(IGNORE_CAMEL_CASE); }
+ public boolean isIgnoringAllUppercase() { return boolAttributes.get(IGNORE_ALL_UPPERCASE); }
+ public boolean isIgnoringDiacritics() { return boolAttributes.get(IGNORE_DIACRITICS); }
+ public boolean isConvertingCase() { return boolAttributes.get(CONVERT_CASE); }
+ public boolean isSupportingRunOnWords() { return boolAttributes.get(RUN_ON_WORDS); }
+
+ /**
+ * Create an instance from an attribute map.
+ *
+ * @see DictionaryMetadataBuilder
+ */
+ public DictionaryMetadata(Map<DictionaryAttribute, String> userAttrs) {
+ this.boolAttributes = new EnumMap<DictionaryAttribute,Boolean>(DictionaryAttribute.class);
+ this.attributes = new EnumMap<DictionaryAttribute, String>(DictionaryAttribute.class);
+ this.attributes.putAll(userAttrs);
+
+ EnumMap<DictionaryAttribute, String> attrs = new EnumMap<DictionaryAttribute, String>(DEFAULT_ATTRIBUTES);
+ attrs.putAll(userAttrs);
+
+ // Convert some attrs from the map to local fields for performance reasons.
+ EnumSet<DictionaryAttribute> requiredAttributes = EnumSet.copyOf(REQUIRED_ATTRIBUTES);
+
+ for (Map.Entry<DictionaryAttribute,String> e : attrs.entrySet()) {
+ requiredAttributes.remove(e.getKey());
+
+ // Run validation and conversion on all of them.
+ Object value = e.getKey().fromString(e.getValue());
+ switch (e.getKey()) {
+ case ENCODING:
+ this.encoding = e.getValue();
+ if (!Charset.isSupported(encoding)) {
+ throw new IllegalArgumentException("Encoding not supported on this JVM: "
+ + encoding);
+ }
+ this.charset = (Charset) value;
+ break;
+
+ case SEPARATOR:
+ this.separatorChar = (Character) value;
+ break;
+
+ case LOCALE:
+ this.locale = (Locale) value;
+ break;
+
+ case ENCODER:
+ this.encoderType = (EncoderType) value;
+ break;
+
+ case INPUT_CONVERSION:
+ {
+ @SuppressWarnings("unchecked")
+ Map<String, String> gvalue = (Map<String, String>) value;
+ this.inputConversion = gvalue;
+ }
+ break;
+
+ case OUTPUT_CONVERSION:
+ {
+ @SuppressWarnings("unchecked")
+ Map<String, String> gvalue = (Map<String, String>) value;
+ this.outputConversion = gvalue;
+ }
+ break;
+
+ case REPLACEMENT_PAIRS:
+ {
+ @SuppressWarnings("unchecked")
+ Map<String, List<String>> gvalue = (Map<String, List<String>>) value;
+ this.replacementPairs = gvalue;
+ }
+ break;
+
+ case EQUIVALENT_CHARS:
+ {
+ @SuppressWarnings("unchecked")
+ Map<Character, List<Character>> gvalue = (Map<Character, List<Character>>) value;
+ this.equivalentChars = gvalue;
+ }
+ break;
+
+ case IGNORE_PUNCTUATION:
+ case IGNORE_NUMBERS:
+ case IGNORE_CAMEL_CASE:
+ case IGNORE_ALL_UPPERCASE:
+ case IGNORE_DIACRITICS:
+ case CONVERT_CASE:
+ case RUN_ON_WORDS:
+ case FREQUENCY_INCLUDED:
+ this.boolAttributes.put(e.getKey(), (Boolean) value);
+ break;
+
+ case AUTHOR:
+ case LICENSE:
+ case CREATION_DATE:
+ // Just run validation.
+ e.getKey().fromString(e.getValue());
+ break;
+
+ default:
+ throw new RuntimeException("Unexpected code path (attribute should be handled but is not): " + e.getKey());
+ }
+ }
+
+ if (!requiredAttributes.isEmpty()) {
+ throw new IllegalArgumentException("At least one the required attributes was not provided: "
+ + requiredAttributes.toString());
+ }
+
+ // Sanity check.
+ CharsetEncoder encoder = getEncoder();
+ try {
+ ByteBuffer encoded = encoder.encode(CharBuffer.wrap(new char [] { separatorChar }));
+ if (encoded.remaining() > 1) {
+ throw new IllegalArgumentException("Separator character is not a single byte in encoding "
+ + encoding + ": " + separatorChar);
+ }
+ this.separator = encoded.get();
+ } catch (CharacterCodingException e) {
+ throw new IllegalArgumentException("Separator character cannot be converted to a byte in "
+ + encoding + ": " + separatorChar, e);
+ }
+ }
+
+ /**
+ * Returns a new {@link CharsetDecoder} for the {@link #encoding}.
+ */
+ public CharsetDecoder getDecoder() {
+ try {
+ return charset.newDecoder().onMalformedInput(
+ CodingErrorAction.REPORT).onUnmappableCharacter(
+ CodingErrorAction.REPORT);
+ } catch (UnsupportedCharsetException e) {
+ throw new RuntimeException(
+ "FSA's encoding charset is not supported: " + encoding);
+ }
+ }
+
+ /**
+ * Returns a new {@link CharsetEncoder} for the {@link #encoding}.
+ */
+ public CharsetEncoder getEncoder() {
+ try {
+ return charset.newEncoder();
+ } catch (UnsupportedCharsetException e) {
+ throw new RuntimeException(
+ "FSA's encoding charset is not supported: " + encoding);
+ }
+ }
+
+ /**
+ * Return sequence encoder type.
+ */
+ public EncoderType getEncoderType() {
+ return encoderType;
+ }
+
+ /**
+ * Returns the {@link #separator} byte converted to a single <code>char</code>. Throws
+ * a {@link RuntimeException} if this conversion is for some reason impossible
+ * (the byte is a surrogate pair, FSA's {@link #encoding} is not available).
+ */
+ public char getSeparatorAsChar() {
+ return separatorChar;
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java
new file mode 100644
index 0000000..7e85ecb
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java
@@ -0,0 +1,139 @@
+package morfologik.stemming;
+
+import java.nio.charset.Charset;
+import java.util.EnumMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Helper class to build {@link DictionaryMetadata} instances.
+ */
+public final class DictionaryMetadataBuilder {
+ private final EnumMap<DictionaryAttribute, String> attrs
+ = new EnumMap<DictionaryAttribute, String>(DictionaryAttribute.class);
+
+ public DictionaryMetadataBuilder separator(char c) {
+ this.attrs.put(DictionaryAttribute.SEPARATOR, Character.toString(c));
+ return this;
+ }
+
+ public DictionaryMetadataBuilder encoding(Charset charset) {
+ return encoding(charset.name());
+ }
+
+ public DictionaryMetadataBuilder encoding(String charsetName) {
+ this.attrs.put(DictionaryAttribute.ENCODING, charsetName);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder frequencyIncluded() { return frequencyIncluded(false); }
+ public DictionaryMetadataBuilder frequencyIncluded(boolean v) { this.attrs.put(DictionaryAttribute.FREQUENCY_INCLUDED, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignorePunctuation() { return ignorePunctuation(true); }
+ public DictionaryMetadataBuilder ignorePunctuation(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_PUNCTUATION, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreNumbers() { return ignoreNumbers(true); }
+ public DictionaryMetadataBuilder ignoreNumbers(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_NUMBERS, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreCamelCase() { return ignoreCamelCase(true); }
+ public DictionaryMetadataBuilder ignoreCamelCase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_CAMEL_CASE, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreAllUppercase() { return ignoreAllUppercase(true); }
+ public DictionaryMetadataBuilder ignoreAllUppercase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_ALL_UPPERCASE, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder ignoreDiacritics() { return ignoreDiacritics(true); }
+ public DictionaryMetadataBuilder ignoreDiacritics(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_DIACRITICS, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder convertCase() { return convertCase(true); }
+ public DictionaryMetadataBuilder convertCase(boolean v) { this.attrs.put(DictionaryAttribute.CONVERT_CASE, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder supportRunOnWords() { return supportRunOnWords(true); }
+ public DictionaryMetadataBuilder supportRunOnWords(boolean v) { this.attrs.put(DictionaryAttribute.RUN_ON_WORDS, Boolean.valueOf(v).toString()); return this; }
+
+ public DictionaryMetadataBuilder encoder(EncoderType type) {
+ this.attrs.put(DictionaryAttribute.ENCODER, type.name());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder locale(Locale locale) {
+ return locale(locale.toString());
+ }
+
+ public DictionaryMetadataBuilder locale(String localeName) {
+ this.attrs.put(DictionaryAttribute.LOCALE, localeName);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withReplacementPairs(Map<String, List<String>> replacementPairs) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<String,List<String>> e : replacementPairs.entrySet()) {
+ String k = e.getKey();
+ for (String v : e.getValue()) {
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(v);
+ }
+ }
+ this.attrs.put(DictionaryAttribute.REPLACEMENT_PAIRS, builder.toString());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withEquivalentChars(Map<Character, List<Character>> equivalentChars) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<Character,List<Character>> e : equivalentChars.entrySet()) {
+ Character k = e.getKey();
+ for (Character v : e.getValue()) {
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(v);
+ }
+ }
+ this.attrs.put(DictionaryAttribute.EQUIVALENT_CHARS, builder.toString());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withInputConversionPairs(Map<String, String> conversionPairs) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<String,String> e : conversionPairs.entrySet()) {
+ String k = e.getKey();
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(conversionPairs.get(k));
+ }
+ this.attrs.put(DictionaryAttribute.INPUT_CONVERSION, builder.toString());
+ return this;
+ }
+
+ public DictionaryMetadataBuilder withOutputConversionPairs(Map<String, String> conversionPairs) {
+ StringBuilder builder = new StringBuilder();
+ for (Map.Entry<String,String> e : conversionPairs.entrySet()) {
+ String k = e.getKey();
+ if (builder.length() > 0) builder.append(", ");
+ builder.append(k).append(" ").append(conversionPairs.get(k));
+ }
+ this.attrs.put(DictionaryAttribute.OUTPUT_CONVERSION, builder.toString());
+ return this;
+ }
+
+
+ public DictionaryMetadataBuilder author(String author) {
+ this.attrs.put(DictionaryAttribute.AUTHOR, author);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder creationDate(String creationDate) {
+ this.attrs.put(DictionaryAttribute.CREATION_DATE, creationDate);
+ return this;
+ }
+
+ public DictionaryMetadataBuilder license(String license) {
+ this.attrs.put(DictionaryAttribute.LICENSE, license);
+ return this;
+ }
+
+ public DictionaryMetadata build() {
+ return new DictionaryMetadata(attrs);
+ }
+
+ public EnumMap<DictionaryAttribute, String> toMap() {
+ return new EnumMap<DictionaryAttribute, String>(attrs);
+ }
+}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java b/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java
new file mode 100644
index 0000000..093cfbb
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java
@@ -0,0 +1,11 @@
+package morfologik.stemming;
+
/**
 * Sequence encoder type: the scheme used to store an entry's base form
 * relative to its inflected form (see the decoding counterpart in
 * {@code DictionaryLookup#decodeBaseForm}).
 */
public enum EncoderType {
    /** A trailing suffix of the inflected form is replaced by stored bytes. */
    SUFFIX,
    /** Both a leading prefix and a trailing suffix are replaced. */
    PREFIX,
    /** An internal infix (plus an optional trailing suffix) is replaced. */
    INFIX,
    /** The base form is stored verbatim, with no relative encoding. */
    NONE;
}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java b/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java
new file mode 100644
index 0000000..6e59526
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java
@@ -0,0 +1,20 @@
+package morfologik.stemming;
+
+import java.util.List;
+
/**
 * A generic &quot;stemmer&quot; interface in Morfologik.
 */
public interface IStemmer {
    /**
     * Returns a list of {@link WordData} entries for a given word. The returned
     * list is never <code>null</code>. Depending on the stemmer's
     * implementation the {@link WordData} may carry the stem and additional
     * information (tag) or just the stem.
     * <p>
     * The returned list and any object it contains are not usable after a
     * subsequent call to this method. Any data that should be stored in between
     * must be copied by the caller.
     *
     * @param word The inflected (surface) form to look up.
     * @return A never-<code>null</code>, possibly empty list of entries whose
     *         contents are only valid until the next call to this method.
     */
    public List<WordData> lookup(CharSequence word);
}
diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java b/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java
new file mode 100644
index 0000000..a1bdaa0
--- /dev/null
+++ b/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java
@@ -0,0 +1,254 @@
+package morfologik.stemming;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.*;
+
+import morfologik.util.BufferUtils;
+
+/**
+ * Stem and tag data associated with a given word.
+ *
+ * <p>
+ * <b>Important notes:</b>
+ * <ul>
+ * <li>Objects of this class are <i>volatile</i> (their content changes on
+ * subsequent calls to {@link DictionaryLookup} class. If you need a copy of the
+ * stem or tag data for a given word, you have to create a custom buffer
+ * yourself and copy the associated data, perform {@link #clone()} or create
+ * strings (they are immutable) using {@link #getStem()} and then
+ * {@link CharSequence#toString()}.</li>
+ * <li>Objects of this class must not be used in any Java collections. In fact
+ * both equals and hashCode methods are overridden and throw exceptions to
+ * prevent accidental damage.</li>
+ * </ul>
+ */
+public final class WordData implements Cloneable {
+ /**
+ * Error information if somebody puts us in a Java collection.
+ */
+ private static final String COLLECTIONS_ERROR_MESSAGE = "Not suitable for use"
+ + " in Java collections framework (volatile content). Refer to documentation.";
+
+ /** Character encoding in internal buffers. */
+ private final CharsetDecoder decoder;
+
+ /**
+ * Inflected word form data.
+ */
+ CharSequence wordCharSequence;
+
+ /**
+ * Character sequence after converting {@link #stemBuffer} using
+ * {@link #decoder}.
+ */
+ private CharBuffer stemCharSequence;
+
+ /**
+ * Character sequence after converting {@link #tagBuffer} using
+ * {@link #decoder}.
+ */
+ private CharBuffer tagCharSequence;
+
+ /** Byte buffer holding the inflected word form data. */
+ ByteBuffer wordBuffer;
+
+ /** Byte buffer holding stem data. */
+ ByteBuffer stemBuffer;
+
+ /** Byte buffer holding tag data. */
+ ByteBuffer tagBuffer;
+
+ /**
+ * Package scope constructor.
+ */
+ WordData(CharsetDecoder decoder) {
+ this.decoder = decoder;
+
+ stemBuffer = ByteBuffer.allocate(0);
+ tagBuffer = ByteBuffer.allocate(0);
+ stemCharSequence = CharBuffer.allocate(0);
+ tagCharSequence = CharBuffer.allocate(0);
+ }
+
+ /**
+ * A constructor for tests only.
+ */
+ WordData(String stem, String tag, String encoding) {
+ this(Charset.forName(encoding).newDecoder());
+
+ try {
+ if (stem != null)
+ stemBuffer.put(stem.getBytes(encoding));
+ if (tag != null)
+ tagBuffer.put(tag.getBytes(encoding));
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Copy the stem's binary data (no charset decoding) to a custom byte
+ * buffer. If the buffer is null or not large enough to hold the result, a
+ * new buffer is allocated.
+ *
+ * @param target
+ * Target byte buffer to copy the stem buffer to or
+ * <code>null</code> if a new buffer should be allocated.
+ *
+ * @return Returns <code>target</code> or the new reallocated buffer.
+ */
+ public ByteBuffer getStemBytes(ByteBuffer target) {
+ target = BufferUtils.ensureCapacity(target, stemBuffer.remaining());
+ stemBuffer.mark();
+ target.put(stemBuffer);
+ stemBuffer.reset();
+ target.flip();
+ return target;
+ }
+
+ /**
+ * Copy the tag's binary data (no charset decoding) to a custom byte buffer.
+ * If the buffer is null or not large enough to hold the result, a new
+ * buffer is allocated.
+ *
+ * @param target
+ * Target byte buffer to copy the tag buffer to or
+ * <code>null</code> if a new buffer should be allocated.
+ *
+ * @return Returns <code>target</code> or the new reallocated buffer.
+ */
+ public ByteBuffer getTagBytes(ByteBuffer target) {
+ target = BufferUtils.ensureCapacity(target, tagBuffer.remaining());
+ tagBuffer.mark();
+ target.put(tagBuffer);
+ tagBuffer.reset();
+ target.flip();
+ return target;
+ }
+
+ /**
+ * Copy the inflected word's binary data (no charset decoding) to a custom
+ * byte buffer. If the buffer is null or not large enough to hold the
+ * result, a new buffer is allocated.
+ *
+ * @param target
+ * Target byte buffer to copy the word buffer to or
+ * <code>null</code> if a new buffer should be allocated.
+ *
+ * @return Returns <code>target</code> or the new reallocated buffer.
+ */
+ public ByteBuffer getWordBytes(ByteBuffer target) {
+ target = BufferUtils.ensureCapacity(target, wordBuffer.remaining());
+ wordBuffer.mark();
+ target.put(wordBuffer);
+ wordBuffer.reset();
+ target.flip();
+ return target;
+ }
+
+ /**
+ * @return Return tag data decoded to a character sequence or
+ * <code>null</code> if no associated tag data exists.
+ */
+ public CharSequence getTag() {
+ tagCharSequence = decode(tagBuffer, tagCharSequence);
+ return tagCharSequence.remaining() == 0 ? null : tagCharSequence;
+ }
+
+ /**
+ * @return Return stem data decoded to a character sequence or
+ * <code>null</code> if no associated stem data exists.
+ */
+ public CharSequence getStem() {
+ stemCharSequence = decode(stemBuffer, stemCharSequence);
+ return stemCharSequence.remaining() == 0 ? null : stemCharSequence;
+ }
+
+ /**
+ * @return Return inflected word form data. Usually the parameter passed to
+ * {@link DictionaryLookup#lookup(CharSequence)}.
+ */
+ public CharSequence getWord() {
+ return wordCharSequence;
+ }
+
+ /*
+ *
+ */
+ @Override
+ public boolean equals(Object obj) {
+ throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE);
+ }
+
+ /*
+ *
+ */
+ @Override
+ public int hashCode() {
+ throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE);
+ }
+
+ @Override
+ public String toString() {
+ return "WordData["
+ + this.getWord() + ","
+ + this.getStem() + ","
+ + this.getTag() + "]";
+ }
+
+ /**
+ * Declare a covariant of {@link Object#clone()} that returns a deep copy of
+ * this object. The content of all internal buffers is copied.
+ */
+ @Override
+ protected WordData clone() {
+ final WordData clone = new WordData(this.decoder);
+ clone.wordCharSequence = cloneCharSequence(wordCharSequence);
+ clone.wordBuffer = getWordBytes(null);
+ clone.stemBuffer = getStemBytes(null);
+ clone.tagBuffer = getTagBytes(null);
+ return clone;
+ }
+
+ /**
+ * Clone char sequences only if not immutable.
+ */
+ private CharSequence cloneCharSequence(CharSequence chs) {
+ if (chs instanceof String)
+ return chs;
+ return chs.toString();
+ }
+
+ /**
+ * Reset internal structures for storing another word's data.
+ */
+ void reset() {
+ this.wordCharSequence = null;
+ this.wordBuffer = null;
+ this.stemCharSequence.clear();
+ this.tagCharSequence.clear();
+ this.stemBuffer.clear();
+ this.tagBuffer.clear();
+ }
+
+ /**
+ * Decode byte buffer, optionally expanding the char buffer to.
+ */
+ private CharBuffer decode(ByteBuffer bytes, CharBuffer chars) {
+ chars.clear();
+ final int maxCapacity = (int) (bytes.remaining() * decoder.maxCharsPerByte());
+ if (chars.capacity() <= maxCapacity) {
+ chars = CharBuffer.allocate(maxCapacity);
+ }
+
+ bytes.mark();
+ decoder.reset();
+ decoder.decode(bytes, chars, true);
+ chars.flip();
+ bytes.reset();
+
+ return chars;
+ }
+}