summaryrefslogtreecommitdiff
path: root/src/morfologik/stemming/DictionaryLookup.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/morfologik/stemming/DictionaryLookup.java')
-rw-r--r-- src/morfologik/stemming/DictionaryLookup.java 355
1 files changed, 0 insertions, 355 deletions
diff --git a/src/morfologik/stemming/DictionaryLookup.java b/src/morfologik/stemming/DictionaryLookup.java
deleted file mode 100644
index ac90107..0000000
--- a/src/morfologik/stemming/DictionaryLookup.java
+++ /dev/null
@@ -1,355 +0,0 @@
-package morfologik.stemming;
-
-import static morfologik.fsa.MatchResult.*;
-
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.*;
-import java.util.*;
-
-import morfologik.fsa.*;
-import morfologik.util.BufferUtils;
-
-/**
- * This class implements a dictionary lookup over an FSA dictionary. The
- * dictionary for this class should be prepared from a text file using Jan
- * Daciuk's FSA package (see link below).
- *
- * <p>
- * <b>Important:</b> finite state automata in Jan Daciuk's implementation use
- * <em>bytes</em>, not Unicode characters. Therefore objects of this class always
- * have to be constructed with an encoding used to convert Java strings to byte
- * arrays and the other way around. You <b>can</b> use UTF-8 encoding, as it
- * should not conflict with any control sequences and separator characters.
- *
- * @see <a href="http://www.eti.pg.gda.pl/~jandac/fsa.html">FSA package Web
- * site</a>
- */
-public final class DictionaryLookup implements IStemmer, Iterable<WordData> {
- /** Traversal helper over {@link #fsa}, used to match encoded words. */
- private final FSATraversal matcher;
-
- /** An iterator for walking along the final states of {@link #fsa}. */
- private final FSAFinalStatesIterator finalStatesIterator;
-
- /** Root node of {@link #fsa}; every word match starts here. */
- private final int rootNode;
-
- /** Number of slots added to {@link #forms} each time it runs out of space. */
- private final static int EXPAND_SIZE = 10;
-
- /**
- * Private internal array of reusable word data objects. The objects are
- * reset and refilled on every call to {@link #lookup(CharSequence)}.
- */
- private WordData[] forms = new WordData[0];
-
- /**
- * A list "view" over {@link #forms}. It is re-wrapped around the filled
- * part of the array and returned by {@link #lookup(CharSequence)}.
- */
- private ArrayViewList<WordData> formsList = new ArrayViewList<WordData>(
- forms, 0, forms.length);
-
- /**
- * Features of the compiled dictionary.
- *
- * @see DictionaryMetadata
- */
- private final DictionaryMetadata dictionaryMetadata;
-
- /**
- * Charset encoder for the FSA, created from the encoding named in
- * {@link #dictionaryMetadata}.
- */
- private final CharsetEncoder encoder;
-
- /**
- * Charset decoder for the FSA, configured to report malformed input and
- * unmappable characters.
- */
- private final CharsetDecoder decoder;
-
- /**
- * The FSA we are using.
- */
- private final FSA fsa;
-
- /**
- * Internal reusable buffer receiving the bytes of the looked-up word after
- * it has been encoded with {@link #encoder}.
- */
- private ByteBuffer byteBuffer = ByteBuffer.allocate(0);
-
- /**
- * Internal reusable buffer holding the characters of the looked-up word
- * before they are encoded with {@link #encoder}.
- */
- private CharBuffer charBuffer = CharBuffer.allocate(0);
-
- /**
- * Reusable match result, passed to {@link #matcher} on every lookup.
- */
- private final MatchResult matchResult = new MatchResult();
-
- /**
- * The {@link Dictionary} this lookup is using.
- */
- private final Dictionary dictionary;
-
- /**
- * <p>
- * Creates a new object of this class using the given FSA for word lookups
- * and encoding for converting characters to bytes.
- *
- * @throws IllegalArgumentException
- * if FSA's root node cannot be acquired (dictionary is empty).
- */
- public DictionaryLookup(Dictionary dictionary)
- throws IllegalArgumentException {
- this.dictionary = dictionary;
- this.dictionaryMetadata = dictionary.metadata;
- this.rootNode = dictionary.fsa.getRootNode();
- this.fsa = dictionary.fsa;
- this.matcher = new FSATraversal(fsa);
- this.finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode());
-
- if (rootNode == 0) {
- throw new IllegalArgumentException(
- "Dictionary must have at least the root node.");
- }
-
- if (dictionaryMetadata == null) {
- throw new IllegalArgumentException(
- "Dictionary metadata must not be null.");
- }
-
- try {
- Charset charset = Charset.forName(dictionaryMetadata.encoding);
- encoder = charset.newEncoder();
- decoder = charset.newDecoder().onMalformedInput(
- CodingErrorAction.REPORT).onUnmappableCharacter(
- CodingErrorAction.REPORT);
- } catch (UnsupportedCharsetException e) {
- throw new RuntimeException(
- "FSA's encoding charset is not supported: "
- + dictionaryMetadata.encoding);
- }
- }
-
- /**
-  * Searches the automaton for a symbol sequence equal to <code>word</code>,
-  * followed by a separator. Each entry found yields a stem (decompressed
-  * according to the dictionary's specification) and optional tag data.
-  *
-  * <p>
-  * The returned list and the {@link WordData} objects inside it are reused
-  * internally: they are overwritten by the next call to this method.
-  *
-  * @param word
-  *            The inflected form to look up.
-  *
-  * @return The entries found for <code>word</code>, or an empty list if
-  *         there are none.
-  */
- public List<WordData> lookup(CharSequence word) {
-     final byte separator = dictionaryMetadata.separator;
-
-     // Encode word characters into bytes in the same encoding as the FSA's.
-     charBuffer.clear();
-     charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
-     for (int i = 0; i < word.length(); i++) {
-         charBuffer.put(word.charAt(i));
-     }
-     charBuffer.flip();
-     byteBuffer = charsToBytes(charBuffer, byteBuffer);
-
-     // Try to find a partial match in the dictionary.
-     final MatchResult match = matcher.match(matchResult,
-             byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);
-
-     if (match.kind != SEQUENCE_IS_A_PREFIX) {
-         // Any other kind of match is not handled: report no forms.
-         return Collections.emptyList();
-     }
-
-     /*
-      * The entire sequence exists in the dictionary. A separator should
-      * be the next symbol.
-      */
-     final int arc = fsa.getArc(match.node, separator);
-
-     /*
-      * The situation when the arc points to a final node should NEVER
-      * happen. After all, we want the word to have SOME base form.
-      */
-     if (arc == 0 || fsa.isArcFinal(arc)) {
-         return Collections.emptyList();
-     }
-
-     // There is such a word in the dictionary. Return its base forms.
-     int formsCount = 0;
-
-     finalStatesIterator.restartFrom(fsa.getEndNode(arc));
-     while (finalStatesIterator.hasNext()) {
-         final ByteBuffer bb = finalStatesIterator.next();
-         final byte[] ba = bb.array();
-         final int bbSize = bb.remaining();
-
-         if (formsCount >= forms.length) {
-             // Grow the pool; only the newly added slots need objects.
-             final int oldLength = forms.length;
-             forms = Arrays.copyOf(forms, oldLength + EXPAND_SIZE);
-             for (int k = oldLength; k < forms.length; k++) {
-                 forms[k] = new WordData(decoder);
-             }
-         }
-
-         /*
-          * Now, expand the prefix/ suffix 'compression' and store the
-          * base form.
-          */
-         final WordData wordData = forms[formsCount++];
-         wordData.reset();
-
-         wordData.wordBuffer = byteBuffer;
-         wordData.wordCharSequence = word;
-
-         /*
-          * Find the separator byte's position splitting word form and
-          * tag. If there is none, sepPos ends up equal to bbSize.
-          */
-         int sepPos;
-         for (sepPos = 0; sepPos < bbSize; sepPos++) {
-             if (ba[sepPos] == separator) {
-                 break;
-             }
-         }
-
-         /*
-          * Decode the stem into stem buffer.
-          */
-         wordData.stemBuffer.clear();
-         wordData.stemBuffer = decodeStem(wordData.stemBuffer, ba,
-                 sepPos, byteBuffer, dictionaryMetadata);
-         wordData.stemBuffer.flip();
-
-         /*
-          * Decode the tag data: everything after the separator. When the
-          * entry has no separator the tag start lies past the end of the
-          * data, so the size is clamped to zero; otherwise the put below
-          * would be called with a negative length and throw.
-          */
-         final int tagStart = sepPos + 1;
-         final int tagSize = Math.max(0, bbSize - tagStart);
-         wordData.tagBuffer = BufferUtils.ensureCapacity(
-                 wordData.tagBuffer, tagSize);
-         wordData.tagBuffer.clear();
-         if (tagSize > 0) {
-             wordData.tagBuffer.put(ba, tagStart, tagSize);
-         }
-         wordData.tagBuffer.flip();
-     }
-
-     formsList.wrap(forms, 0, formsCount);
-     return formsList;
- }
-
- /**
- * Decode the base form of an inflected word and save its decoded form into
- * a byte buffer.
- *
- * @param bb
- * The byte buffer to save the result to. A new buffer may be
- * allocated if the capacity of <code>bb</code> is not large
- * enough to store the result. The buffer is not flipped upon
- * return.
- *
- * @param inflectedBuffer
- * Inflected form's bytes (decoded properly).
- *
- * @param bytes
- * Bytes of the encoded base form, starting at 0 index.
- *
- * @param len
- * Length of the encode base form.
- *
- * @return Returns either <code>bb</code> or a new buffer whose capacity is
- * large enough to store the output of the decoded data.
- */
- public static ByteBuffer decodeStem(ByteBuffer bb, byte[] bytes, int len,
- ByteBuffer inflectedBuffer, DictionaryMetadata metadata) {
- bb.clear();
-
- // Empty length? Weird, but return an empty buffer.
- if (len == 0) {
- return bb;
- }
-
- // Determine inflected string's length in bytes, in the same encoding.
- final byte[] infBytes = inflectedBuffer.array();
- final int infLen = inflectedBuffer.remaining();
- final int code0 = bytes[0] - 'A';
-
- final boolean fsaPrefixes = metadata.usesPrefixes;
- final boolean fsaInfixes = metadata.usesInfixes;
-
- // Increase buffer size, if needed.
- if (bb.capacity() < infLen + len) {
- bb = ByteBuffer.allocate(infLen + len);
- }
-
- if (code0 >= 0) {
- if (!fsaPrefixes && !fsaInfixes) {
- if (code0 <= infLen) {
- bb.put(infBytes, 0, infLen - code0);
- bb.put(bytes, 1, len - 1);
- return bb;
- }
- } else if (fsaPrefixes && !fsaInfixes) {
- if (len > 1) {
- final int stripAtEnd = bytes[1] - 'A' + code0;
- if (stripAtEnd <= infLen) {
- bb.put(infBytes, code0, infLen - stripAtEnd);
- bb.put(bytes, 2, len - 2);
- return bb;
- }
- }
- } else if (fsaInfixes) {
- // Note: Prefixes are silently assumed here.
- if (len > 2) {
- final int stripAtBeginning = bytes[1] - 'A' + code0;
- final int stripAtEnd = bytes[2] - 'A' + stripAtBeginning;
- if (stripAtEnd <= infLen) {
- bb.put(infBytes, 0, code0);
- bb.put(infBytes, stripAtBeginning, infLen - stripAtEnd);
- bb.put(bytes, 3, len - 3);
- return bb;
- }
- }
- }
- }
-
- /*
- * This is a fallback in case some junk is detected above. Return the
- * base form only if this is the case.
- */
- bb.clear();
- bb.put(bytes, 0, len);
- return bb;
- }
-
- /**
-  * Encodes the remaining characters of <code>chars</code> with
-  * {@link #encoder} and returns a buffer, flipped for reading, that holds
-  * the resulting bytes. The position of <code>chars</code> is restored
-  * before returning.
-  *
-  * <p>
-  * NOTE(review): the <code>CoderResult</code> returned by the encoder is
-  * discarded, so an encoding error is not reported to the caller and the
-  * output may then contain only part of the word - confirm whether callers
-  * rely on this.
-  *
-  * @param chars
-  *            Characters to encode, from the current position to the limit.
-  * @param bytes
-  *            Buffer to reuse for the output. It is always cleared; a new
-  *            buffer is allocated unless its capacity is strictly greater
-  *            than the worst-case encoded size.
-  * @return Either <code>bytes</code> or a newly allocated buffer.
-  */
- private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) {
-     bytes.clear();
-
-     final int worstCase = (int) (chars.remaining() * encoder
-             .maxBytesPerChar());
-     final ByteBuffer target = (worstCase < bytes.capacity())
-             ? bytes
-             : ByteBuffer.allocate(worstCase);
-
-     // Remember where the input starts so it can be rewound afterwards.
-     chars.mark();
-     encoder.reset();
-     encoder.encode(chars, target, true);
-     chars.reset();
-
-     target.flip();
-     return target;
- }
-
- /**
-  * Creates an iterator over the {@link WordData} entries of the embedded
-  * {@link Dictionary}.
-  *
-  * @return A new {@link DictionaryIterator} built from this lookup's
-  *         dictionary and decoder.
-  */
- public Iterator<WordData> iterator() {
-     final Iterator<WordData> entries = new DictionaryIterator(
-             dictionary, decoder, true);
-     return entries;
- }
-
- /**
-  * Gives access to the dictionary backing this lookup.
-  *
-  * @return The {@link Dictionary} passed to the constructor.
-  */
- public Dictionary getDictionary() {
-     return this.dictionary;
- }
-}