summaryrefslogtreecommitdiff
path: root/src/morfologik
diff options
context:
space:
mode:
Diffstat (limited to 'src/morfologik')
-rw-r--r--src/morfologik/dictionaries/pl.LICENSE8
-rw-r--r--src/morfologik/dictionaries/pl.dictbin1806661 -> 0 bytes
-rw-r--r--src/morfologik/dictionaries/pl.info13
-rw-r--r--src/morfologik/fsa/CFSA.java364
-rw-r--r--src/morfologik/fsa/CFSA2.java404
-rw-r--r--src/morfologik/fsa/CFSA2Serializer.java536
-rw-r--r--src/morfologik/fsa/ConstantArcSizeFSA.java134
-rw-r--r--src/morfologik/fsa/FSA.java270
-rw-r--r--src/morfologik/fsa/FSA5.java323
-rw-r--r--src/morfologik/fsa/FSA5Serializer.java334
-rw-r--r--src/morfologik/fsa/FSABuilder.java486
-rw-r--r--src/morfologik/fsa/FSAFinalStatesIterator.java154
-rw-r--r--src/morfologik/fsa/FSAFlags.java64
-rw-r--r--src/morfologik/fsa/FSAHeader.java52
-rw-r--r--src/morfologik/fsa/FSAInfo.java157
-rw-r--r--src/morfologik/fsa/FSASerializer.java45
-rw-r--r--src/morfologik/fsa/FSATraversal.java169
-rw-r--r--src/morfologik/fsa/FSAUtils.java202
-rw-r--r--src/morfologik/fsa/MatchResult.java86
-rw-r--r--src/morfologik/fsa/NullMessageLogger.java24
-rw-r--r--src/morfologik/fsa/StateVisitor.java11
-rw-r--r--src/morfologik/stemming/ArrayViewList.java111
-rw-r--r--src/morfologik/stemming/Dictionary.java169
-rw-r--r--src/morfologik/stemming/DictionaryIterator.java144
-rw-r--r--src/morfologik/stemming/DictionaryLookup.java355
-rw-r--r--src/morfologik/stemming/DictionaryMetadata.java122
-rw-r--r--src/morfologik/stemming/IStemmer.java20
-rw-r--r--src/morfologik/stemming/PolishStemmer.java43
-rw-r--r--src/morfologik/stemming/WordData.java247
-rw-r--r--src/morfologik/tools/FSABuildTool.java486
-rw-r--r--src/morfologik/tools/FSADumpTool.java286
-rw-r--r--src/morfologik/tools/IMessageLogger.java25
-rw-r--r--src/morfologik/tools/InflectionFramesTool.java118
-rw-r--r--src/morfologik/tools/Launcher.java159
-rw-r--r--src/morfologik/tools/MorphEncoder.java399
-rw-r--r--src/morfologik/tools/MorphEncodingTool.java213
-rw-r--r--src/morfologik/tools/PolishStemmingTool.java191
-rw-r--r--src/morfologik/tools/SharedOptions.java153
-rw-r--r--src/morfologik/tools/Tool.java84
-rw-r--r--src/morfologik/tools/WriterMessageLogger.java123
-rw-r--r--src/morfologik/util/Arrays.java68
-rw-r--r--src/morfologik/util/BufferUtils.java54
-rw-r--r--src/morfologik/util/FileUtils.java137
-rw-r--r--src/morfologik/util/ResourceUtils.java58
44 files changed, 0 insertions, 7601 deletions
diff --git a/src/morfologik/dictionaries/pl.LICENSE b/src/morfologik/dictionaries/pl.LICENSE
deleted file mode 100644
index 8529618..0000000
--- a/src/morfologik/dictionaries/pl.LICENSE
+++ /dev/null
@@ -1,8 +0,0 @@
-LICENCE
-
-The dictionary comes from Morfologik project. Morfologik uses data from
-Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
-is licenced on the terms of (inter alia) LGPL and Creative Commons
-ShareAlike. The part-of-speech tags were added in Morfologik project and
-are not found in the data from sjp.pl. The tagset is similar to IPI PAN
-tagset.
diff --git a/src/morfologik/dictionaries/pl.dict b/src/morfologik/dictionaries/pl.dict
deleted file mode 100644
index 1ebd28a..0000000
--- a/src/morfologik/dictionaries/pl.dict
+++ /dev/null
Binary files differ
diff --git a/src/morfologik/dictionaries/pl.info b/src/morfologik/dictionaries/pl.info
deleted file mode 100644
index 0c933c9..0000000
--- a/src/morfologik/dictionaries/pl.info
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Dictionary metadata.
-#
-
-fsa.dict.author=morfologik.blogspot.com
-fsa.dict.created=19.11.2010
-fsa.dict.license=LGPL or Creative Commons ShareAlike license (pick any suitable). http://morfologik.blogspot.com
-
-fsa.dict.separator=+
-fsa.dict.encoding=iso-8859-2
-
-fsa.dict.uses-prefixes=true
-fsa.dict.uses-infixes=true
diff --git a/src/morfologik/fsa/CFSA.java b/src/morfologik/fsa/CFSA.java
deleted file mode 100644
index 695664d..0000000
--- a/src/morfologik/fsa/CFSA.java
+++ /dev/null
@@ -1,364 +0,0 @@
-package morfologik.fsa;
-
-import static morfologik.fsa.FSAFlags.*;
-import static morfologik.util.FileUtils.readFully;
-
-import java.io.*;
-import java.util.*;
-
-/**
- * CFSA (Compact Finite State Automaton) binary format implementation. This is a
- * slightly reorganized version of {@link FSA5} offering smaller automata size
- * at some (minor) performance penalty.
- *
- * <p><b>Note:</b> Serialize to {@link CFSA2} for new code.</p>
- *
- * <p>The encoding of automaton body is as follows.</p>
- *
- * <pre>
- * ---- FSA header (standard)
- * Byte Description
- * +-+-+-+-+-+-+-+-+\
- * 0 | | | | | | | | | +------ '\'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 1 | | | | | | | | | +------ 'f'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 2 | | | | | | | | | +------ 's'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 3 | | | | | | | | | +------ 'a'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 4 | | | | | | | | | +------ version (fixed 0xc5)
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 5 | | | | | | | | | +------ filler character
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 6 | | | | | | | | | +------ annot character
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 7 |C|C|C|C|G|G|G|G| +------ C - node data size (ctl), G - address size (gotoLength)
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 8-32 | | | | | | | | | +------ labels mapped for type (1) of arc encoding.
- * : : : : : : : : : |
- * +-+-+-+-+-+-+-+-+/
- *
- * ---- Start of a node; only if automaton was compiled with NUMBERS option.
- *
- * Byte
- * +-+-+-+-+-+-+-+-+\
- * 0 | | | | | | | | | \ LSB
- * +-+-+-+-+-+-+-+-+ +
- * 1 | | | | | | | | | | number of strings recognized
- * +-+-+-+-+-+-+-+-+ +----- by the automaton starting
- * : : : : : : : : : | from this node.
- * +-+-+-+-+-+-+-+-+ +
- * ctl-1 | | | | | | | | | / MSB
- * +-+-+-+-+-+-+-+-+/
- *
- * ---- A vector of node's arcs. Conditional format, depending on flags.
- *
- * 1) NEXT bit set, mapped arc label.
- *
- * +--------------- arc's label mapped in M bits if M's field value > 0
- * | +------------- node pointed to is next
- * | | +----------- the last arc of the node
- * _______| | | +--------- the arc is final
- * / | | | |
- * +-+-+-+-+-+-+-+-+\
- * 0 |M|M|M|M|M|1|L|F| +------ flags + (M) index of the mapped label.
- * +-+-+-+-+-+-+-+-+/
- *
- * 2) NEXT bit set, label separate.
- *
- * +--------------- arc's label stored separately (M's field is zero).
- * | +------------- node pointed to is next
- * | | +----------- the last arc of the node
- * | | | +--------- the arc is final
- * | | | |
- * +-+-+-+-+-+-+-+-+\
- * 0 |0|0|0|0|0|1|L|F| +------ flags
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 1 | | | | | | | | | +------ label
- * +-+-+-+-+-+-+-+-+/
- *
- * 3) NEXT bit not set. Full arc.
- *
- * +------------- node pointed to is next
- * | +----------- the last arc of the node
- * | | +--------- the arc is final
- * | | |
- * +-+-+-+-+-+-+-+-+\
- * 0 |A|A|A|A|A|0|L|F| +------ flags + (A) address field, lower bits
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 1 | | | | | | | | | +------ label
- * +-+-+-+-+-+-+-+-+/
- * : : : : : : : : :
- * +-+-+-+-+-+-+-+-+\
- * gtl-1 |A|A|A|A|A|A|A|A| +------ address, continuation (MSB)
- * +-+-+-+-+-+-+-+-+/
- * </pre>
- */
-public final class CFSA extends FSA {
- /**
- * Automaton header version value.
- */
- public static final byte VERSION = (byte) 0xC5;
-
- /**
- * Bitmask indicating that an arc corresponds to the last character of a
- * sequence available when building the automaton.
- */
- public static final int BIT_FINAL_ARC = 1 << 0;
-
- /**
- * Bitmask indicating that an arc is the last one of the node's list and the
- * following one belongs to another node.
- */
- public static final int BIT_LAST_ARC = 1 << 1;
-
- /**
- * Bitmask indicating that the target node of this arc follows it in the
- * compressed automaton structure (no goto field).
- */
- public static final int BIT_TARGET_NEXT = 1 << 2;
-
- /**
- * An array of bytes with the internal representation of the automaton.
- * Please see the documentation of this class for more information on how
- * this structure is organized.
- */
- public byte[] arcs;
-
- /**
- * The length of the node header structure (if the automaton was compiled with
- * <code>NUMBERS</code> option). Otherwise zero.
- */
- public final int nodeDataLength;
-
- /**
- * Flags for this automaton version.
- */
- private final Set<FSAFlags> flags;
-
- /**
- * Number of bytes each address takes in full, expanded form (goto length).
- */
- public final int gtl;
-
- /**
- * Label mapping for arcs of type (1) (see class documentation). The array
- * is indexed by mapped label's value and contains the original label.
- */
- public final byte[] labelMapping;
-
- /**
- * Creates a new automaton, reading it from a file in FSA format, version 5.
- */
- public CFSA(InputStream fsaStream) throws IOException {
- // Read the header first.
- final FSAHeader header = FSAHeader.read(fsaStream);
-
- // Ensure we have the correct version.
- if (header.version != VERSION) {
- throw new IOException("This class can read FSA version 5 only: " + header.version);
- }
-
- /*
- * Determine if the automaton was compiled with NUMBERS. If so, modify
- * ctl and goto fields accordingly.
- */
- flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
- if ((header.gtl & 0xf0) != 0) {
- this.nodeDataLength = (header.gtl >>> 4) & 0x0f;
- this.gtl = header.gtl & 0x0f;
- flags.add(NUMBERS);
- } else {
- this.nodeDataLength = 0;
- this.gtl = header.gtl & 0x0f;
- }
-
- /*
- * Read mapping dictionary.
- */
- labelMapping = new byte[1 << 5];
- readFully(fsaStream, labelMapping);
-
- /*
- * Read arcs' data.
- */
- arcs = readFully(fsaStream);
- }
-
- /**
- * Returns the start node of this automaton. May return <code>0</code> if
- * the start node is also an end node.
- */
- @Override
- public int getRootNode() {
- // Skip dummy node marking terminating state.
- final int epsilonNode = skipArc(getFirstArc(0));
-
- // And follow the epsilon node's first (and only) arc.
- return getDestinationNodeOffset(getFirstArc(epsilonNode));
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public final int getFirstArc(int node) {
- return nodeDataLength + node;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public final int getNextArc(int arc) {
- if (isArcLast(arc))
- return 0;
- else
- return skipArc(arc);
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getArc(int node, byte label) {
- for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
- if (getArcLabel(arc) == label)
- return arc;
- }
-
- // An arc labeled with "label" not found.
- return 0;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getEndNode(int arc) {
- final int nodeOffset = getDestinationNodeOffset(arc);
- if (0 == nodeOffset) {
- throw new RuntimeException("This is a terminal arc [" + arc + "]");
- }
- return nodeOffset;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public byte getArcLabel(int arc) {
- if (isNextSet(arc) && isLabelCompressed(arc)) {
- return this.labelMapping[(arcs[arc] >>> 3) & 0x1f];
- } else {
- return arcs[arc + 1];
- }
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getRightLanguageCount(int node) {
- assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
- return FSA5.decodeFromBytes(arcs, node, nodeDataLength);
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public boolean isArcFinal(int arc) {
- return (arcs[arc] & BIT_FINAL_ARC) != 0;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public boolean isArcTerminal(int arc) {
- return (0 == getDestinationNodeOffset(arc));
- }
-
- /**
- * Returns <code>true</code> if this arc has <code>NEXT</code> bit set.
- *
- * @see #BIT_LAST_ARC
- */
- public boolean isArcLast(int arc) {
- return (arcs[arc] & BIT_LAST_ARC) != 0;
- }
-
- /**
- * @see #BIT_TARGET_NEXT
- */
- public boolean isNextSet(int arc) {
- return (arcs[arc] & BIT_TARGET_NEXT) != 0;
- }
-
- /**
- * Returns <code>true</code> if the label is compressed inside flags byte.
- */
- public boolean isLabelCompressed(int arc) {
- assert isNextSet(arc) : "Only applicable to arcs with NEXT bit.";
- return (arcs[arc] & (-1 << 3)) != 0;
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>For this automaton version, an additional {@link FSAFlags#NUMBERS} flag
- * may be set to indicate the automaton contains extra fields for each node.</p>
- */
- public Set<FSAFlags> getFlags() {
- return Collections.unmodifiableSet(flags);
- }
-
- /**
- * Returns the address of the node pointed to by this arc.
- */
- final int getDestinationNodeOffset(int arc) {
- if (isNextSet(arc)) {
- /* The destination node follows this arc in the array. */
- return skipArc(arc);
- } else {
- /*
- * The destination node address has to be extracted from the arc's
- * goto field.
- */
- int r = 0;
- for (int i = gtl; --i >= 1;) {
- r = r << 8 | (arcs[arc + 1 + i] & 0xff);
- }
- r = r << 8 | (arcs[arc] & 0xff);
- return r >>> 3;
- }
- }
-
- /**
- * Read the arc's layout and skip as many bytes, as needed, to skip it.
- */
- private int skipArc(int offset) {
- if (isNextSet(offset)) {
- if (isLabelCompressed(offset)) {
- offset++;
- } else {
- offset += 1 + 1;
- }
- } else {
- offset += 1 + gtl;
- }
- return offset;
- }
-} \ No newline at end of file
diff --git a/src/morfologik/fsa/CFSA2.java b/src/morfologik/fsa/CFSA2.java
deleted file mode 100644
index 6955da4..0000000
--- a/src/morfologik/fsa/CFSA2.java
+++ /dev/null
@@ -1,404 +0,0 @@
-package morfologik.fsa;
-
-import static morfologik.util.FileUtils.readFully;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.EnumSet;
-import java.util.Set;
-
-import morfologik.util.FileUtils;
-
-/**
- * CFSA (Compact Finite State Automaton) binary format implementation, version 2:
- * <ul>
- * <li>{@link #BIT_TARGET_NEXT} applicable on all arcs, not necessarily the last one.</li>
- * <li>v-coded goto field</li>
- * <li>v-coded perfect hashing numbers, if any</li>
- * <li>31 most frequent labels integrated with flags byte</li>
- * </ul>
- *
- * <p>The encoding of automaton body is as follows.</p>
- *
- * <pre>
- * ---- CFSA header
- * Byte Description
- * +-+-+-+-+-+-+-+-+\
- * 0 | | | | | | | | | +------ '\'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 1 | | | | | | | | | +------ 'f'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 2 | | | | | | | | | +------ 's'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 3 | | | | | | | | | +------ 'a'
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 4 | | | | | | | | | +------ version (fixed 0xc6)
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 5 | | | | | | | | | +----\
- * +-+-+-+-+-+-+-+-+/ \ flags [MSB first]
- * +-+-+-+-+-+-+-+-+\ /
- * 6 | | | | | | | | | +----/
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 7 | | | | | | | | | +------ label lookup table size
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 8-32 | | | | | | | | | +------ label value lookup table
- * : : : : : : : : : |
- * +-+-+-+-+-+-+-+-+/
- *
- * ---- Start of a node; only if automaton was compiled with NUMBERS option.
- *
- * Byte
- * +-+-+-+-+-+-+-+-+\
- * 0 | | | | | | | | | \
- * +-+-+-+-+-+-+-+-+ +
- * 1 | | | | | | | | | | number of strings recognized
- * +-+-+-+-+-+-+-+-+ +----- by the automaton starting
- * : : : : : : : : : | from this node. v-coding
- * +-+-+-+-+-+-+-+-+ +
- * | | | | | | | | | /
- * +-+-+-+-+-+-+-+-+/
- *
- * ---- A vector of this node's arcs. An arc's layout depends on the combination of flags.
- *
- * 1) NEXT bit set, mapped arc label.
- *
- * +----------------------- node pointed to is next
- * | +--------------------- the last arc of the node
- * | | +------------------- this arc leads to a final state (acceptor)
- * | | | _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows
- * | | | / | | | |
- * +-+-+-+-+-+-+-+-+\
- * 0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label.
- * +-+-+-+-+-+-+-+-+/
- * +-+-+-+-+-+-+-+-+\
- * 1 | | | | | | | | | +------ optional label if M == 0
- * +-+-+-+-+-+-+-+-+/
- * : : : : : : : : :
- * +-+-+-+-+-+-+-+-+\
- * |A|A|A|A|A|A|A|A| +------ v-coded goto address
- * +-+-+-+-+-+-+-+-+/
- * </pre>
- */
-public final class CFSA2 extends FSA {
- /**
- * Automaton header version value.
- */
- public static final byte VERSION = (byte) 0xc6;
-
- /**
- * The target node of this arc follows the last arc of the current state
- * (no goto field).
- */
- public static final int BIT_TARGET_NEXT = 1 << 7;
-
- /**
- * The arc is the last one from the current node's arcs list.
- */
- public static final int BIT_LAST_ARC = 1 << 6;
-
- /**
- * The arc corresponds to the last character of a sequence
- * available when building the automaton (acceptor transition).
- */
- public static final int BIT_FINAL_ARC = 1 << 5;
-
- /**
- * The count of bits assigned to storing an indexed label.
- */
- static final int LABEL_INDEX_BITS = 5;
-
- /**
- * Masks only the M bits of a flag byte.
- */
- static final int LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1;
-
- /**
- * Maximum size of the labels index.
- */
- static final int LABEL_INDEX_SIZE = (1 << LABEL_INDEX_BITS) - 1;
-
- /**
- * An array of bytes with the internal representation of the automaton.
- * Please see the documentation of this class for more information on how
- * this structure is organized.
- */
- public byte[] arcs;
-
- /**
- * Flags for this automaton version.
- */
- private final EnumSet<FSAFlags> flags;
-
- /**
- * Label mapping for M-indexed labels.
- */
- public final byte[] labelMapping;
-
- /**
- * If <code>true</code> states are prepended with numbers.
- */
- private final boolean hasNumbers;
-
- /**
- * Epsilon node's offset.
- */
- private final int epsilon = 0;
-
- /**
- * Reads an automaton from a byte stream.
- */
- public CFSA2(InputStream in) throws IOException {
- // Read the header first.
- if (FSAHeader.FSA_MAGIC != FileUtils.readInt(in))
- throw new IOException("Invalid file header magic bytes.");
-
- // Ensure we have the correct version.
- final int version = FileUtils.readByte(in);
- if (version != VERSION) {
- throw new IOException("This class can only read FSA version: " + VERSION);
- }
-
- // Read flags.
- short flagBits = FileUtils.readShort(in);
- flags = EnumSet.noneOf(FSAFlags.class);
- for (FSAFlags f : FSAFlags.values()) {
- if (FSAFlags.isSet(flagBits, f))
- flags.add(f);
- }
-
- if (flagBits != FSAFlags.asShort(flags))
- throw new IOException("Unrecognized flags remained: 0x" + Integer.toHexString(flagBits));
-
- this.hasNumbers = flags.contains(FSAFlags.NUMBERS);
-
- /*
- * Read mapping dictionary.
- */
- int labelMappingSize = FileUtils.readByte(in) & 0xff;
- labelMapping = new byte[labelMappingSize];
- readFully(in, labelMapping);
-
- /*
- * Read arcs' data.
- */
- arcs = readFully(in);
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getRootNode() {
- // Skip dummy node marking terminating state.
- return getDestinationNodeOffset(getFirstArc(epsilon));
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public final int getFirstArc(int node) {
- if (hasNumbers) {
- return skipVInt(node);
- } else {
- return node;
- }
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public final int getNextArc(int arc) {
- if (isArcLast(arc))
- return 0;
- else
- return skipArc(arc);
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getArc(int node, byte label) {
- for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
- if (getArcLabel(arc) == label)
- return arc;
- }
-
- // An arc labeled with "label" not found.
- return 0;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getEndNode(int arc) {
- final int nodeOffset = getDestinationNodeOffset(arc);
- assert nodeOffset != 0 : "Can't follow a terminal arc: " + arc;
- assert nodeOffset < arcs.length : "Node out of bounds.";
- return nodeOffset;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public byte getArcLabel(int arc) {
- int index = arcs[arc] & LABEL_INDEX_MASK;
- if (index > 0) {
- return this.labelMapping[index];
- } else {
- return arcs[arc + 1];
- }
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getRightLanguageCount(int node) {
- assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
- return readVInt(arcs, node);
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public boolean isArcFinal(int arc) {
- return (arcs[arc] & BIT_FINAL_ARC) != 0;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public boolean isArcTerminal(int arc) {
- return (0 == getDestinationNodeOffset(arc));
- }
-
- /**
- * Returns <code>true</code> if this arc has <code>NEXT</code> bit set.
- *
- * @see #BIT_LAST_ARC
- */
- public boolean isArcLast(int arc) {
- return (arcs[arc] & BIT_LAST_ARC) != 0;
- }
-
- /**
- * @see #BIT_TARGET_NEXT
- */
- public boolean isNextSet(int arc) {
- return (arcs[arc] & BIT_TARGET_NEXT) != 0;
- }
-
- /**
- * {@inheritDoc}
- */
- public Set<FSAFlags> getFlags() {
- return flags;
- }
-
- /**
- * Returns the address of the node pointed to by this arc.
- */
- final int getDestinationNodeOffset(int arc) {
- if (isNextSet(arc)) {
- /* Follow until the last arc of this state. */
- while (!isArcLast(arc)) {
- arc = getNextArc(arc);
- }
-
- /* And return the byte right after it. */
- return skipArc(arc);
- } else {
- /*
- * The destination node address is v-coded. v-code starts either
- * at the next byte (label indexed) or after the next byte (label explicit).
- */
- return readVInt(arcs, arc + ((arcs[arc] & LABEL_INDEX_MASK) == 0 ? 2 : 1));
- }
- }
-
- /**
- * Read the arc's layout and skip as many bytes, as needed, to skip it.
- */
- private int skipArc(int offset) {
- int flag = arcs[offset++];
-
- // Explicit label?
- if ((flag & LABEL_INDEX_MASK) == 0) {
- offset++;
- }
-
- // Explicit goto?
- if ((flag & BIT_TARGET_NEXT) == 0) {
- offset = skipVInt(offset);
- }
-
- assert offset < this.arcs.length;
- return offset;
- }
-
- /**
- * Read a v-int.
- */
- static int readVInt(byte[] array, int offset) {
- byte b = array[offset];
- int value = b & 0x7F;
-
- for (int shift = 7; b < 0; shift += 7) {
- b = array[++offset];
- value |= (b & 0x7F) << shift;
- }
-
- return value;
- }
-
- /**
- * Write a v-int to a byte array.
- */
- static int writeVInt(byte[] array, int offset, int value) {
- assert value >= 0 : "Can't v-code negative ints.";
-
- while (value > 0x7F) {
- array[offset++] = (byte) (0x80 | (value & 0x7F));
- value >>= 7;
- }
- array[offset++] = (byte) value;
-
- return offset;
- }
-
- /**
- * Return the byte-length of a v-coded int.
- */
- static int vIntLength(int value) {
- assert value >= 0 : "Can't v-code negative ints.";
-
- int bytes;
- for (bytes = 1; value >= 0x80; bytes++) {
- value >>= 7;
- }
-
- return bytes;
- }
-
- /**
- * Skip a v-int.
- */
- private int skipVInt(int offset) {
- while (arcs[offset++] < 0);
- return offset;
- }
-} \ No newline at end of file
diff --git a/src/morfologik/fsa/CFSA2Serializer.java b/src/morfologik/fsa/CFSA2Serializer.java
deleted file mode 100644
index 11c7b13..0000000
--- a/src/morfologik/fsa/CFSA2Serializer.java
+++ /dev/null
@@ -1,536 +0,0 @@
-package morfologik.fsa;
-
-import static morfologik.fsa.CFSA2.BIT_FINAL_ARC;
-import static morfologik.fsa.CFSA2.BIT_LAST_ARC;
-import static morfologik.fsa.CFSA2.BIT_TARGET_NEXT;
-import static morfologik.fsa.FSAFlags.FLEXIBLE;
-import static morfologik.fsa.FSAFlags.NEXTBIT;
-import static morfologik.fsa.FSAFlags.NUMBERS;
-import static morfologik.fsa.FSAFlags.STOPBIT;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayDeque;
-import java.util.Comparator;
-import java.util.EnumSet;
-import java.util.PriorityQueue;
-import java.util.Set;
-
-import morfologik.fsa.FSAUtils.IntIntHolder;
-import morfologik.tools.IMessageLogger;
-import morfologik.util.FileUtils;
-
-import com.carrotsearch.hppc.BitSet;
-import com.carrotsearch.hppc.BoundedProportionalArraySizingStrategy;
-import com.carrotsearch.hppc.IntArrayList;
-import com.carrotsearch.hppc.IntIntOpenHashMap;
-import com.carrotsearch.hppc.IntStack;
-import com.carrotsearch.hppc.cursors.IntCursor;
-import com.carrotsearch.hppc.cursors.IntIntCursor;
-
-/**
- * Serializes in-memory {@link FSA} graphs to {@link CFSA2}.
- *
- * <p>
- * It is possible to serialize the automaton with numbers required for perfect
- * hashing. See {@link #withNumbers()} method.
- * </p>
- *
- * @see CFSA2
- * @see FSA#read(java.io.InputStream)
- */
-public final class CFSA2Serializer implements FSASerializer {
- /**
- * Supported flags.
- */
- private final static EnumSet<FSAFlags> flags = EnumSet.of(NUMBERS, FLEXIBLE, STOPBIT, NEXTBIT);
-
- /**
- * No-state id.
- */
- private final static int NO_STATE = -1;
-
- /**
- * <code>true</code> if we should serialize with numbers.
- *
- * @see #withNumbers()
- */
- private boolean withNumbers;
-
- /**
- * A hash map of [state, offset] pairs.
- */
- private IntIntOpenHashMap offsets = new IntIntOpenHashMap();
-
- /**
- * A hash map of [state, right-language-count] pairs.
- */
- private IntIntOpenHashMap numbers = new IntIntOpenHashMap();
-
- /**
- * Scratch array for serializing vints.
- */
- private final byte [] scratch = new byte [5];
-
- /**
- * The most frequent labels for integrating with the flags field.
- */
- private byte [] labelsIndex;
-
- /**
- * Inverted index of labels to be integrated with flags field. A label
- * at index <code>i<code> has the index or zero (no integration).
- */
- private int [] labelsInvIndex;
-
- /**
- * Logger for progress.
- */
- private IMessageLogger logger = new NullMessageLogger();
-
- /**
- * Serialize the automaton with the number of right-language sequences in
- * each node. This is required to implement perfect hashing. The numbering
- * also preserves the order of input sequences.
- *
- * @return Returns the same object for easier call chaining.
- */
- public CFSA2Serializer withNumbers() {
- withNumbers = true;
- return this;
- }
-
- /**
- * Serializes any {@link FSA} to {@link CFSA2} stream.
- *
- * @see #withNumbers
- * @return Returns <code>os</code> for chaining.
- */
- @Override
- public <T extends OutputStream> T serialize(final FSA fsa, T os) throws IOException {
- /*
- * Calculate the most frequent labels and build indexed labels dictionary.
- */
- computeLabelsIndex(fsa);
-
- /*
- * Calculate the number of bytes required for the node data, if
- * serializing with numbers.
- */
- if (withNumbers) {
- this.numbers = FSAUtils.rightLanguageForAllStates(fsa);
- }
-
- /*
- * Linearize all the states, optimizing their layout.
- */
- IntArrayList linearized = linearize(fsa);
-
- /*
- * Emit the header.
- */
- FileUtils.writeInt(os, FSAHeader.FSA_MAGIC);
- os.write(CFSA2.VERSION);
-
- EnumSet<FSAFlags> fsaFlags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
- if (withNumbers) fsaFlags.add(NUMBERS);
- FileUtils.writeShort(os, FSAFlags.asShort(fsaFlags));
-
- /*
- * Emit labels index.
- */
- os.write(labelsIndex.length);
- os.write(labelsIndex);
-
- /*
- * Emit the automaton.
- */
- int size = emitNodes(fsa, os, linearized);
- assert size == 0 : "Size changed in the final pass?";
-
- return os;
- }
-
- /**
- * Compute a set of labels to be integrated with the flags field.
- */
- private void computeLabelsIndex(final FSA fsa) {
- // Compute labels count.
- final int [] countByValue = new int [256];
-
- fsa.visitAllStates(new StateVisitor() {
- public boolean accept(int state) {
- for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc))
- countByValue[fsa.getArcLabel(arc) & 0xff]++;
- return true;
- }
- });
-
- // Order by descending frequency of counts and increasing label value.
- Comparator<IntIntHolder> comparator = new Comparator<IntIntHolder>() {
- public int compare(IntIntHolder o1, IntIntHolder o2) {
- int countDiff = o2.b - o1.b;
- if (countDiff == 0) {
- countDiff = o1.a - o2.a;
- }
- return countDiff;
- }
- };
-
- PriorityQueue<IntIntHolder> labelAndCount = new PriorityQueue<IntIntHolder>(countByValue.length, comparator);
- for (int label = 0; label < countByValue.length; label++) {
- if (countByValue[label] > 0) {
- labelAndCount.add(new IntIntHolder(label, countByValue[label]));
- }
- }
-
- labelsIndex = new byte [1 + Math.min(labelAndCount.size(), CFSA2.LABEL_INDEX_SIZE)];
- labelsInvIndex = new int [256];
- for (int i = labelsIndex.length - 1; i > 0 && !labelAndCount.isEmpty(); i--) {
- IntIntHolder p = labelAndCount.remove();
- labelsInvIndex[p.a] = i;
- labelsIndex[i] = (byte) p.a;
- }
- }
-
- /**
- * Return supported flags.
- */
- @Override
- public Set<FSAFlags> getFlags() {
- return flags;
- }
-
- /**
- * Linearization of states.
- */
- private IntArrayList linearize(final FSA fsa) throws IOException {
- /*
- * Compute the states with most inlinks. These should be placed as close to the
- * start of the automaton, as possible so that v-coded addresses are tiny.
- */
- final IntIntOpenHashMap inlinkCount = computeInlinkCount(fsa);
-
- /*
- * An array of ordered states for serialization.
- */
- final IntArrayList linearized = new IntArrayList(0,
- new BoundedProportionalArraySizingStrategy(1000, 10000, 1.5f));
-
- /*
- * Determine which states should be linearized first (at fixed positions) so as to
- * minimize the place occupied by goto fields.
- */
- int maxStates = Integer.MAX_VALUE;
- int minInlinkCount = 2;
- ArrayDeque<Integer> statesQueue = computeFirstStates(inlinkCount, maxStates, minInlinkCount);
- IntArrayList states = new IntArrayList();
- while (!statesQueue.isEmpty())
- states.add(statesQueue.pop());
-
- /*
- * Compute initial addresses, without node rearrangements.
- */
- int serializedSize = linearizeAndCalculateOffsets(fsa, new IntArrayList(), linearized, offsets);
-
- /*
- * Probe for better node arrangements by selecting between [lower, upper]
- * nodes from the potential candidate nodes list.
- */
- IntArrayList sublist = new IntArrayList();
- sublist.buffer = states.buffer;
- sublist.elementsCount = states.elementsCount;
-
- /*
- * Probe the initial region a little bit, looking for optimal cut. It can't be binary search
- * because the result isn't monotonic.
- */
- logger.startPart("Compacting");
- logger.log("Initial output size", serializedSize);
- int cutAt = 0;
- for (int cut = Math.min(25, states.size()); cut <= Math.min(150, states.size()); cut += 25) {
- sublist.elementsCount = cut;
- int newSize = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);
- logger.log("Moved " + sublist.size() + " states, output size", newSize);
- if (newSize >= serializedSize) {
- break;
- }
- cutAt = cut;
- }
-
- /*
- * Cut at the calculated point and repeat linearization.
- */
- sublist.elementsCount = cutAt;
- int size = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);
-
- logger.log("Will move " + sublist.size() + " states, final size", size);
- logger.endPart();
-
- return linearized;
- }
-
- /**
- * Linearize all states, putting <code>states</code> in front of the automaton and
- * calculating stable state offsets.
- */
- private int linearizeAndCalculateOffsets(FSA fsa, IntArrayList states,
- IntArrayList linearized, IntIntOpenHashMap offsets) throws IOException
- {
- final BitSet visited = new BitSet();
- final IntStack nodes = new IntStack();
- linearized.clear();
-
- /*
- * Linearize states with most inlinks first.
- */
- for (int i = 0; i < states.size(); i++) {
- linearizeState(fsa, nodes, linearized, visited, states.get(i));
- }
-
- /*
- * Linearize the remaining states by chaining them one after another, in depth-order.
- */
- nodes.push(fsa.getRootNode());
- while (!nodes.isEmpty()) {
- final int node = nodes.pop();
- if (visited.get(node))
- continue;
-
- linearizeState(fsa, nodes, linearized, visited, node);
- }
-
- /*
- * Calculate new state offsets. This is iterative. We start with
- * maximum potential offsets and recalculate until converged.
- */
- int MAX_OFFSET = Integer.MAX_VALUE;
- for (IntCursor c : linearized) {
- offsets.put(c.value, MAX_OFFSET);
- }
-
- int i, j = 0;
- while ((i = emitNodes(fsa, null, linearized)) > 0) {
- j = i;
- }
- return j;
- }
-
- /**
- * Add a state to linearized list.
- */
- private void linearizeState(final FSA fsa,
- IntStack nodes,
- IntArrayList linearized,
- BitSet visited, int node)
- {
- linearized.add(node);
- visited.set(node);
- for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
- if (!fsa.isArcTerminal(arc)) {
- final int target = fsa.getEndNode(arc);
- if (!visited.get(target))
- nodes.push(target);
- }
- }
- }
-
- /**
- * Compute the set of states that should be linearized first to minimize other
- * states goto length.
- */
- private ArrayDeque<Integer> computeFirstStates(IntIntOpenHashMap inlinkCount,
- int maxStates,
- int minInlinkCount)
- {
- Comparator<IntIntHolder> comparator = new Comparator<FSAUtils.IntIntHolder>() {
- public int compare(IntIntHolder o1, IntIntHolder o2) {
- int v = o1.a - o2.a;
- return v == 0 ? (o1.b - o2.b) : v;
- }
- };
-
- PriorityQueue<IntIntHolder> stateInlink = new PriorityQueue<IntIntHolder>(1, comparator);
- IntIntHolder scratch = new IntIntHolder();
- for (IntIntCursor c : inlinkCount) {
- if (c.value > minInlinkCount) {
- scratch.a = c.value;
- scratch.b = c.key;
-
- if (stateInlink.size() < maxStates || comparator.compare(scratch, stateInlink.peek()) > 0) {
- stateInlink.add(new IntIntHolder(c.value, c.key));
- if (stateInlink.size() > maxStates) stateInlink.remove();
- }
- }
- }
-
- ArrayDeque<Integer> states = new ArrayDeque<Integer>();
- while (!stateInlink.isEmpty()) {
- IntIntHolder i = stateInlink.remove();
- states.addFirst(i.b);
- }
- return states;
- }
-
- /**
- * Compute in-link count for each state.
- */
- private IntIntOpenHashMap computeInlinkCount(final FSA fsa) {
- IntIntOpenHashMap inlinkCount = new IntIntOpenHashMap();
- BitSet visited = new BitSet();
- IntStack nodes = new IntStack();
- nodes.push(fsa.getRootNode());
-
- while (!nodes.isEmpty()) {
- final int node = nodes.pop();
- if (visited.get(node))
- continue;
-
- visited.set(node);
-
- for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
- if (!fsa.isArcTerminal(arc)) {
- final int target = fsa.getEndNode(arc);
- inlinkCount.putOrAdd(target, 1, 1);
- if (!visited.get(target))
- nodes.push(target);
- }
- }
- }
-
- return inlinkCount;
- }
-
- /**
- * Update arc offsets assuming the given goto length.
- */
- private int emitNodes(FSA fsa, OutputStream os, IntArrayList linearized) throws IOException {
- int offset = 0;
-
- // Add epsilon state.
- offset += emitNodeData(os, 0);
- if (fsa.getRootNode() != 0)
- offset += emitArc(os, BIT_LAST_ARC, (byte) '^', offsets.get(fsa.getRootNode()));
- else
- offset += emitArc(os, BIT_LAST_ARC, (byte) '^', 0);
-
- boolean offsetsChanged = false;
- final int max = linearized.size();
- for (IntCursor c : linearized) {
- final int state = c.value;
- final int nextState = c.index + 1 < max ? linearized.get(c.index + 1) : NO_STATE;
-
- if (os == null) {
- offsetsChanged |= (offsets.get(state) != offset);
- offsets.put(state, offset);
- } else {
- assert offsets.get(state) == offset : state + " " + offsets.get(state) + " " + offset;
- }
-
- offset += emitNodeData(os, withNumbers ? numbers.get(state) : 0);
- offset += emitNodeArcs(fsa, os, state, nextState);
- }
-
- return offsetsChanged ? offset : 0;
- }
-
- /**
- * Emit all arcs of a single node.
- */
- private int emitNodeArcs(FSA fsa, OutputStream os,
- final int state, final int nextState) throws IOException {
- int offset = 0;
- for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
- int targetOffset;
- final int target;
-
- if (fsa.isArcTerminal(arc)) {
- target = 0;
- targetOffset = 0;
- } else {
- target = fsa.getEndNode(arc);
- targetOffset = offsets.get(target);
- }
-
- int flags = 0;
-
- if (fsa.isArcFinal(arc)) {
- flags |= BIT_FINAL_ARC;
- }
-
- if (fsa.getNextArc(arc) == 0) {
- flags |= BIT_LAST_ARC;
- }
-
- if (targetOffset != 0 && target == nextState) {
- flags |= BIT_TARGET_NEXT;
- targetOffset = 0;
- }
-
- offset += emitArc(os, flags, fsa.getArcLabel(arc), targetOffset);
- }
-
- return offset;
- }
-
- /** */
- private int emitArc(OutputStream os, int flags, byte label, int targetOffset)
- throws IOException
- {
- int length = 0;
-
- int labelIndex = labelsInvIndex[label & 0xff];
- if (labelIndex > 0) {
- if (os != null) os.write(flags | labelIndex);
- length++;
- } else {
- if (os != null) {
- os.write(flags);
- os.write(label);
- }
- length += 2;
- }
-
- if ((flags & BIT_TARGET_NEXT) == 0) {
- int len = CFSA2.writeVInt(scratch, 0, targetOffset);
- if (os != null) {
- os.write(scratch, 0, len);
- }
- length += len;
- }
-
- return length;
- }
-
- /** */
- private int emitNodeData(OutputStream os, int number) throws IOException {
- int size = 0;
-
- if (withNumbers) {
- size = CFSA2.writeVInt(scratch, 0, number);
- if (os != null) {
- os.write(scratch, 0, size);
- }
- }
-
- return size;
- }
-
- /** */
- @Override
- public CFSA2Serializer withFiller(byte filler) {
- throw new UnsupportedOperationException("CFSA2 does not support filler. Use .info file.");
- }
-
- /** */
- @Override
- public CFSA2Serializer withAnnotationSeparator(byte annotationSeparator) {
- throw new UnsupportedOperationException("CFSA2 does not support separator. Use .info file.");
- }
-
- @Override
- public CFSA2Serializer withLogger(IMessageLogger logger) {
- this.logger = logger;
- return this;
- }
-}
diff --git a/src/morfologik/fsa/ConstantArcSizeFSA.java b/src/morfologik/fsa/ConstantArcSizeFSA.java
deleted file mode 100644
index 2f6d412..0000000
--- a/src/morfologik/fsa/ConstantArcSizeFSA.java
+++ /dev/null
@@ -1,134 +0,0 @@
-package morfologik.fsa;
-
-import java.util.Collections;
-import java.util.Set;
-
-/**
- * An FSA with constant-size arc representation produced directly
- * by {@link FSABuilder}.
- *
- * @see FSABuilder
- */
-public final class ConstantArcSizeFSA extends FSA {
- /** Size of the target address field (constant for the builder). */
- public final static int TARGET_ADDRESS_SIZE = 4;
-
- /** Size of the flags field (constant for the builder). */
- public final static int FLAGS_SIZE = 1;
-
- /** Size of the label field (constant for the builder). */
- public final static int LABEL_SIZE = 1;
-
- /**
- * Size of a single arc structure.
- */
- public final static int ARC_SIZE = FLAGS_SIZE + LABEL_SIZE + TARGET_ADDRESS_SIZE;
-
- /** Offset of the flags field inside an arc. */
- public final static int FLAGS_OFFSET = 0;
-
- /** Offset of the label field inside an arc. */
- public final static int LABEL_OFFSET = FLAGS_SIZE;
-
- /** Offset of the address field inside an arc. */
- public final static int ADDRESS_OFFSET = LABEL_OFFSET + LABEL_SIZE;
-
- /** A dummy address of the terminal state. */
- final static int TERMINAL_STATE = 0;
-
- /**
- * An arc flag indicating the target node of an arc corresponds to a final
- * state.
- */
- public final static int BIT_ARC_FINAL = 1 << 1;
-
- /** An arc flag indicating the arc is last within its state. */
- public final static int BIT_ARC_LAST = 1 << 0;
-
- /**
- * An epsilon state. The first and only arc of this state points either
- * to the root or to the terminal state, indicating an empty automaton.
- */
- private final int epsilon;
-
- /**
- * FSA data, serialized as a byte array.
- */
- private final byte[] data;
-
- /**
- * @param data FSA data. There must be no trailing bytes after the last state.
- */
- ConstantArcSizeFSA(byte[] data, int epsilon) {
- assert epsilon == 0 : "Epsilon is not zero?";
-
- this.epsilon = epsilon;
- this.data = data;
- }
-
- @Override
- public int getRootNode() {
- return getEndNode(getFirstArc(epsilon));
- }
-
- @Override
- public int getFirstArc(int node) {
- return node;
- }
-
- @Override
- public int getArc(int node, byte label) {
- for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
- if (getArcLabel(arc) == label)
- return arc;
- }
- return 0;
- }
-
- @Override
- public int getNextArc(int arc) {
- if (isArcLast(arc))
- return 0;
- return arc + ARC_SIZE;
- }
-
- @Override
- public byte getArcLabel(int arc) {
- return data[arc + LABEL_OFFSET];
- }
-
- /**
- * Fills the target state address of an arc.
- */
- private int getArcTarget(int arc) {
- arc += ADDRESS_OFFSET;
- return (data[arc]) << 24 |
- (data[arc + 1] & 0xff) << 16 |
- (data[arc + 2] & 0xff) << 8 |
- (data[arc + 3] & 0xff);
- }
-
- @Override
- public boolean isArcFinal(int arc) {
- return (data[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
- }
-
- @Override
- public boolean isArcTerminal(int arc) {
- return getArcTarget(arc) == 0;
- }
-
- private boolean isArcLast(int arc) {
- return (data[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
- }
-
- @Override
- public int getEndNode(int arc) {
- return getArcTarget(arc);
- }
-
- @Override
- public Set<FSAFlags> getFlags() {
- return Collections.emptySet();
- }
-}
diff --git a/src/morfologik/fsa/FSA.java b/src/morfologik/fsa/FSA.java
deleted file mode 100644
index 28b44a2..0000000
--- a/src/morfologik/fsa/FSA.java
+++ /dev/null
@@ -1,270 +0,0 @@
-package morfologik.fsa;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-/**
- * This is a top abstract class for handling finite state automata. These
- * automata are arc-based, a design described in Jan Daciuk's <i>Incremental
- * Construction of Finite-State Automata and Transducers, and Their Use in the
- * Natural Language Processing</i> (PhD thesis, Technical University of Gdansk).
- *
- * <p>
- * Concrete subclasses (implementations) provide varying tradeoffs and features:
- * traversal speed vs. memory size, for example.
- * </p>
- *
- * @see FSABuilder
- */
-public abstract class FSA implements Iterable<ByteBuffer> {
- /**
- * @return Returns the identifier of the root node of this automaton.
- * Returns 0 if the start node is also the end node (the automaton
- * is empty).
- */
- public abstract int getRootNode();
-
- /**
- * @return Returns the identifier of the first arc leaving <code>node</code>
- * or 0 if the node has no outgoing arcs.
- */
- public abstract int getFirstArc(int node);
-
- /**
- * @return Returns the identifier of the next arc after <code>arc</code> and
- * leaving <code>node</code>. Zero is returned if no more arcs are
- * available for the node.
- */
- public abstract int getNextArc(int arc);
-
- /**
- * @return Returns the identifier of an arc leaving <code>node</code> and
- * labeled with <code>label</code>. An identifier equal to 0 means
- * the node has no outgoing arc labeled <code>label</code>.
- */
- public abstract int getArc(int node, byte label);
-
- /**
- * Return the label associated with a given <code>arc</code>.
- */
- public abstract byte getArcLabel(int arc);
-
- /**
- * Returns <code>true</code> if the destination node at the end of this
- * <code>arc</code> corresponds to an input sequence created when building
- * this automaton.
- */
- public abstract boolean isArcFinal(int arc);
-
- /**
- * Returns <code>true</code> if this <code>arc</code> does not have a
- * terminating node (@link {@link #getEndNode(int)} will throw an
- * exception). Implies {@link #isArcFinal(int)}.
- */
- public abstract boolean isArcTerminal(int arc);
-
- /**
- * Return the end node pointed to by a given <code>arc</code>. Terminal arcs
- * (those that point to a terminal state) have no end node representation
- * and throw a runtime exception.
- */
- public abstract int getEndNode(int arc);
-
- /**
- * Returns a set of flags for this FSA instance.
- */
- public abstract Set<FSAFlags> getFlags();
-
- /**
- * Calculates the number of arcs of a given node. Unless really required,
- * use the following idiom for looping through all arcs:
- * <pre>
- * for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
- * }
- * </pre>
- */
- public int getArcCount(int node) {
- int count = 0;
- for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
- count++;
- }
- return count;
- }
-
- /**
- * @return Returns the number of sequences reachable from the given state if
- * the automaton was compiled with {@link FSAFlags#NUMBERS}. The size of
- * the right language of the state, in other words.
- *
- * @throws UnsupportedOperationException If the automaton was not compiled with
- * {@link FSAFlags#NUMBERS}. The value can then be computed by manual count
- * of {@link #getSequences(int)}.
- */
- public int getRightLanguageCount(int node) {
- throw new UnsupportedOperationException("Automaton not compiled with " + FSAFlags.NUMBERS);
- }
-
- /**
- * Returns an iterator over all binary sequences starting at the given FSA
- * state (node) and ending in final nodes. This corresponds to a set of
- * suffixes of a given prefix from all sequences stored in the automaton.
- *
- * <p>
- * The returned iterator is a {@link ByteBuffer} whose contents changes on
- * each call to {@link Iterator#next()}. The keep the contents between calls
- * to {@link Iterator#next()}, one must copy the buffer to some other
- * location.
- * </p>
- *
- * <p>
- * <b>Important.</b> It is guaranteed that the returned byte buffer is
- * backed by a byte array and that the content of the byte buffer starts at
- * the array's index 0.
- * </p>
- *
- * @see Iterable
- */
- public Iterable<ByteBuffer> getSequences(final int node) {
- if (node == 0) {
- return Collections.<ByteBuffer> emptyList();
- }
-
- return new Iterable<ByteBuffer>() {
- public Iterator<ByteBuffer> iterator() {
- return new FSAFinalStatesIterator(FSA.this, node);
- }
- };
- }
-
- /**
- * An alias of calling {@link #iterator} directly ({@link FSA} is also
- * {@link Iterable}).
- */
- public final Iterable<ByteBuffer> getSequences() {
- return getSequences(getRootNode());
- }
-
- /**
- * Returns an iterator over all binary sequences starting from the initial
- * FSA state (node) and ending in final nodes. The returned iterator is a
- * {@link ByteBuffer} whose contents changes on each call to
- * {@link Iterator#next()}. The keep the contents between calls to
- * {@link Iterator#next()}, one must copy the buffer to some other location.
- *
- * <p>
- * <b>Important.</b> It is guaranteed that the returned byte buffer is
- * backed by a byte array and that the content of the byte buffer starts at
- * the array's index 0.
- * </p>
- *
- * @see Iterable
- */
- public final Iterator<ByteBuffer> iterator() {
- return getSequences().iterator();
- }
-
- /**
- * Visit all states. The order of visiting is undefined. This method may be faster
- * than traversing the automaton in post or preorder since it can scan states
- * linearly. Returning false from {@link StateVisitor#accept(int)}
- * immediately terminates the traversal.
- */
- public <T extends StateVisitor> T visitAllStates(T v) {
- return visitInPostOrder(v);
- }
-
- /**
- * Same as {@link #visitInPostOrder(StateVisitor, int)},
- * starting from root automaton node.
- */
- public <T extends StateVisitor> T visitInPostOrder(T v) {
- return visitInPostOrder(v, getRootNode());
- }
-
- /**
- * Visits all states reachable from <code>node</code> in postorder.
- * Returning false from {@link StateVisitor#accept(int)}
- * immediately terminates the traversal.
- */
- public <T extends StateVisitor> T visitInPostOrder(T v, int node) {
- visitInPostOrder(v, node, new BitSet());
- return v;
- }
-
- /** Private recursion. */
- private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) {
- if (visited.get(node))
- return true;
- visited.set(node);
-
- for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
- if (!isArcTerminal(arc)) {
- if (!visitInPostOrder(v, getEndNode(arc), visited))
- return false;
- }
- }
-
- return v.accept(node);
- }
-
- /**
- * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from root automaton node.
- */
- public <T extends StateVisitor> T visitInPreOrder(T v) {
- return visitInPreOrder(v, getRootNode());
- }
-
- /**
- * Visits all states in preorder. Returning false from {@link StateVisitor#accept(int)}
- * skips traversal of all sub-states of a given state.
- */
- public <T extends StateVisitor> T visitInPreOrder(T v, int node) {
- visitInPreOrder(v, node, new BitSet());
- return v;
- }
-
- /** Private recursion. */
- private void visitInPreOrder(StateVisitor v, int node, BitSet visited) {
- if (visited.get(node))
- return;
- visited.set(node);
-
- if (v.accept(node)) {
- for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
- if (!isArcTerminal(arc)) {
- visitInPreOrder(v, getEndNode(arc), visited);
- }
- }
- }
- }
-
- /**
- * A factory for reading automata in any of the supported versions. If
- * possible, explicit constructors should be used.
- *
- * @see FSA5#FSA5(InputStream)
- */
- @SuppressWarnings("unchecked")
- public static <T extends FSA> T read(InputStream in) throws IOException {
- if (!in.markSupported()) {
- in = new BufferedInputStream(in, Math.max(FSAHeader.MAX_HEADER_LENGTH + 1, 1024));
- }
-
- in.mark(FSAHeader.MAX_HEADER_LENGTH);
- FSAHeader header = FSAHeader.read(in);
- in.reset();
-
- if (header.version == FSA5.VERSION)
- return (T) new FSA5(in);
-
- if (header.version == CFSA.VERSION)
- return (T) new CFSA(in);
-
- if (header.version == CFSA2.VERSION)
- return (T) new CFSA2(in);
-
- throw new IOException("Unsupported automaton version: "
- + header.version);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/fsa/FSA5.java b/src/morfologik/fsa/FSA5.java
deleted file mode 100644
index d43f4d8..0000000
--- a/src/morfologik/fsa/FSA5.java
+++ /dev/null
@@ -1,323 +0,0 @@
-package morfologik.fsa;
-
-import static morfologik.fsa.FSAFlags.FLEXIBLE;
-import static morfologik.fsa.FSAFlags.NEXTBIT;
-import static morfologik.fsa.FSAFlags.NUMBERS;
-import static morfologik.fsa.FSAFlags.STOPBIT;
-import static morfologik.util.FileUtils.readFully;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.Set;
-
-/**
- * FSA binary format implementation for version 5.
- *
- * <p>
- * Version 5 indicates the dictionary was built with these flags:
- * {@link FSAFlags#FLEXIBLE}, {@link FSAFlags#STOPBIT} and
- * {@link FSAFlags#NEXTBIT}. The internal representation of the FSA must
- * therefore follow this description (please note this format describes only a
- * single transition (arc), not the entire dictionary file).
- *
- * <pre>
- * ---- this node header present only if automaton was compiled with NUMBERS option.
- * Byte
- * +-+-+-+-+-+-+-+-+\
- * 0 | | | | | | | | | \ LSB
- * +-+-+-+-+-+-+-+-+ +
- * 1 | | | | | | | | | | number of strings recognized
- * +-+-+-+-+-+-+-+-+ +----- by the automaton starting
- * : : : : : : : : : | from this node.
- * +-+-+-+-+-+-+-+-+ +
- * ctl-1 | | | | | | | | | / MSB
- * +-+-+-+-+-+-+-+-+/
- *
- * ---- remaining part of the node
- *
- * Byte
- * +-+-+-+-+-+-+-+-+\
- * 0 | | | | | | | | | +------ label
- * +-+-+-+-+-+-+-+-+/
- *
- * +------------- node pointed to is next
- * | +----------- the last arc of the node
- * | | +--------- the arc is final
- * | | |
- * +-----------+
- * | | | | |
- * ___+___ | | | |
- * / \ | | | |
- * MSB LSB |
- * 7 6 5 4 3 2 1 0 |
- * +-+-+-+-+-+-+-+-+ |
- * 1 | | | | | | | | | \ \
- * +-+-+-+-+-+-+-+-+ \ \ LSB
- * +-+-+-+-+-+-+-+-+ +
- * 2 | | | | | | | | | |
- * +-+-+-+-+-+-+-+-+ |
- * 3 | | | | | | | | | +----- target node address (in bytes)
- * +-+-+-+-+-+-+-+-+ | (not present except for the byte
- * : : : : : : : : : | with flags if the node pointed to
- * +-+-+-+-+-+-+-+-+ + is next)
- * gtl | | | | | | | | | / MSB
- * +-+-+-+-+-+-+-+-+ /
- * gtl+1 (gtl = gotoLength)
- * </pre>
- */
-public final class FSA5 extends FSA {
- /**
- * Default filler byte.
- */
- public final static byte DEFAULT_FILLER = '_';
-
- /**
- * Default annotation byte.
- */
- public final static byte DEFAULT_ANNOTATION = '+';
-
- /**
- * Automaton version as in the file header.
- */
- public static final byte VERSION = 5;
-
- /**
- * Bit indicating that an arc corresponds to the last character of a
- * sequence available when building the automaton.
- */
- public static final int BIT_FINAL_ARC = 1 << 0;
-
- /**
- * Bit indicating that an arc is the last one of the node's list and the
- * following one belongs to another node.
- */
- public static final int BIT_LAST_ARC = 1 << 1;
-
- /**
- * Bit indicating that the target node of this arc follows it in the
- * compressed automaton structure (no goto field).
- */
- public static final int BIT_TARGET_NEXT = 1 << 2;
-
- /**
- * An offset in the arc structure, where the address and flags field begins.
- * In version 5 of FSA automata, this value is constant (1, skip label).
- */
- public final static int ADDRESS_OFFSET = 1;
-
- /**
- * An array of bytes with the internal representation of the automaton.
- * Please see the documentation of this class for more information on how
- * this structure is organized.
- */
- public final byte[] arcs;
-
- /**
- * The length of the node header structure (if the automaton was compiled with
- * <code>NUMBERS</code> option). Otherwise zero.
- */
- public final int nodeDataLength;
-
- /**
- * Flags for this automaton version.
- */
- private final Set<FSAFlags> flags;
-
- /**
- * Number of bytes each address takes in full, expanded form (goto length).
- */
- public final int gtl;
-
- /** Filler character. */
- public final byte filler;
-
- /** Annotation character. */
- public final byte annotation;
-
- /**
- * Read and wrap a binary automaton in FSA version 5.
- */
- public FSA5(InputStream fsaStream) throws IOException {
- // Read the header first.
- final FSAHeader header = FSAHeader.read(fsaStream);
-
- // Ensure we have version 5.
- if (header.version != VERSION) {
- throw new IOException("This class can read FSA version 5 only: " + header.version);
- }
-
- /*
- * Determine if the automaton was compiled with NUMBERS. If so, modify
- * ctl and goto fields accordingly.
- */
- flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
- if ((header.gtl & 0xf0) != 0) {
- flags.add(NUMBERS);
- }
-
- this.nodeDataLength = (header.gtl >>> 4) & 0x0f;
- this.gtl = header.gtl & 0x0f;
-
- this.filler = header.filler;
- this.annotation = header.annotation;
-
- arcs = readFully(fsaStream);
- }
-
- /**
- * Returns the start node of this automaton.
- */
- @Override
- public int getRootNode() {
- // Skip dummy node marking terminating state.
- final int epsilonNode = skipArc(getFirstArc(0));
-
- // And follow the epsilon node's first (and only) arc.
- return getDestinationNodeOffset(getFirstArc(epsilonNode));
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public final int getFirstArc(int node) {
- return nodeDataLength + node;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public final int getNextArc(int arc) {
- if (isArcLast(arc))
- return 0;
- else
- return skipArc(arc);
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getArc(int node, byte label) {
- for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
- if (getArcLabel(arc) == label)
- return arc;
- }
-
- // An arc labeled with "label" not found.
- return 0;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public int getEndNode(int arc) {
- final int nodeOffset = getDestinationNodeOffset(arc);
- assert nodeOffset != 0 : "No target node for terminal arcs.";
- return nodeOffset;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public byte getArcLabel(int arc) {
- return arcs[arc];
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public boolean isArcFinal(int arc) {
- return (arcs[arc + ADDRESS_OFFSET] & BIT_FINAL_ARC) != 0;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public boolean isArcTerminal(int arc) {
- return (0 == getDestinationNodeOffset(arc));
- }
-
- /**
- * Returns the number encoded at the given node. The number equals the count
- * of the set of suffixes reachable from <code>node</code> (called its right
- * language).
- */
- @Override
- public int getRightLanguageCount(int node) {
- assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
- return decodeFromBytes(arcs, node, nodeDataLength);
- }
-
- /**
- * {@inheritDoc}
- *
- * <p>For this automaton version, an additional {@link FSAFlags#NUMBERS} flag
- * may be set to indicate the automaton contains extra fields for each node.</p>
- */
- @Override
- public Set<FSAFlags> getFlags() {
- return Collections.unmodifiableSet(flags);
- }
-
- /**
- * Returns <code>true</code> if this arc has <code>LAST</code> bit set.
- *
- * @see #BIT_LAST_ARC
- */
- public boolean isArcLast(int arc) {
- return (arcs[arc + ADDRESS_OFFSET] & BIT_LAST_ARC) != 0;
- }
-
- /**
- * @see #BIT_TARGET_NEXT
- */
- public boolean isNextSet(int arc) {
- return (arcs[arc + ADDRESS_OFFSET] & BIT_TARGET_NEXT) != 0;
- }
-
- /**
- * Returns an n-byte integer encoded in byte-packed representation.
- */
- static final int decodeFromBytes(
- final byte[] arcs, final int start, final int n)
- {
- int r = 0;
- for (int i = n; --i >= 0;) {
- r = r << 8 | (arcs[start + i] & 0xff);
- }
- return r;
- }
-
- /**
- * Returns the address of the node pointed to by this arc.
- */
- final int getDestinationNodeOffset(int arc) {
- if (isNextSet(arc)) {
- /* The destination node follows this arc in the array. */
- return skipArc(arc);
- } else {
- /*
- * The destination node address has to be extracted from the arc's
- * goto field.
- */
- return decodeFromBytes(arcs, arc + ADDRESS_OFFSET, gtl) >>> 3;
- }
- }
-
- /**
- * Read the arc's layout and skip as many bytes, as needed.
- */
- private int skipArc(int offset) {
- return offset + (isNextSet(offset)
- ? 1 + 1 /* label + flags */
- : 1 + gtl /* label + flags/address */);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/fsa/FSA5Serializer.java b/src/morfologik/fsa/FSA5Serializer.java
deleted file mode 100644
index be017a4..0000000
--- a/src/morfologik/fsa/FSA5Serializer.java
+++ /dev/null
@@ -1,334 +0,0 @@
-package morfologik.fsa;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import morfologik.tools.IMessageLogger;
-
-import com.carrotsearch.hppc.*;
-import com.carrotsearch.hppc.BitSet;
-
-import static morfologik.fsa.FSAFlags.*;
-
-/**
- * Serializes in-memory {@link FSA} graphs to a binary format compatible with
- * Jan Daciuk's <code>fsa</code>'s package <code>FSA5</code> format.
- *
- * <p>
- * It is possible to serialize the automaton with numbers required for perfect
- * hashing. See {@link #withNumbers()} method.
- * </p>
- *
- * @see FSA5
- * @see FSA#read(java.io.InputStream)
- */
-public final class FSA5Serializer implements FSASerializer {
- /**
- * Maximum number of bytes for a serialized arc.
- */
- private final static int MAX_ARC_SIZE = 1 + 5;
-
- /**
- * Maximum number of bytes for per-node data.
- */
- private final static int MAX_NODE_DATA_SIZE = 16;
-
- /**
- * Number of bytes for the arc's flags header (arc representation without
- * the goto address).
- */
- private final static int SIZEOF_FLAGS = 1;
-
- /**
- * Supported flags.
- */
- private final static EnumSet<FSAFlags> flags = EnumSet.of(NUMBERS, SEPARATORS, FLEXIBLE, STOPBIT, NEXTBIT);
-
- /**
- * @see FSA5#filler
- */
- public byte fillerByte = FSA5.DEFAULT_FILLER;
-
- /**
- * @see FSA5#annotation
- */
- public byte annotationByte = FSA5.DEFAULT_ANNOTATION;
-
- /**
- * <code>true</code> if we should serialize with numbers.
- *
- * @see #withNumbers()
- */
- private boolean withNumbers;
-
- /**
- * A hash map of [state, offset] pairs.
- */
- private IntIntOpenHashMap offsets = new IntIntOpenHashMap();
-
- /**
- * A hash map of [state, right-language-count] pairs.
- */
- private IntIntOpenHashMap numbers = new IntIntOpenHashMap();
-
- /**
- * Serialize the automaton with the number of right-language sequences in
- * each node. This is required to implement perfect hashing. The numbering
- * also preserves the order of input sequences.
- *
- * @return Returns the same object for easier call chaining.
- */
- public FSA5Serializer withNumbers() {
- withNumbers = true;
- return this;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public FSA5Serializer withFiller(byte filler) {
- this.fillerByte = filler;
- return this;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public FSA5Serializer withAnnotationSeparator(byte annotationSeparator) {
- this.annotationByte = annotationSeparator;
- return this;
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public FSASerializer withLogger(IMessageLogger logger) {
- return this;
- }
-
- /**
- * Serialize root state <code>s</code> to an output stream in
- * <code>FSA5</code> format.
- *
- * @see #withNumbers
- * @return Returns <code>os</code> for chaining.
- */
- @Override
- public <T extends OutputStream> T serialize(final FSA fsa, T os)
- throws IOException {
-
- // Prepare space for arc offsets and linearize all the states.
- int[] linearized = linearize(fsa);
-
- /*
- * Calculate the number of bytes required for the node data, if
- * serializing with numbers.
- */
- int nodeDataLength = 0;
- if (withNumbers) {
- this.numbers = FSAUtils.rightLanguageForAllStates(fsa);
- int maxNumber = numbers.get(fsa.getRootNode());
- while (maxNumber > 0) {
- nodeDataLength++;
- maxNumber >>>= 8;
- }
- }
-
- // Calculate minimal goto length.
- int gtl = 1;
- while (true) {
- // First pass: calculate offsets of states.
- if (!emitArcs(fsa, null, linearized, gtl, nodeDataLength)) {
- gtl++;
- continue;
- }
-
- // Second pass: check if goto overflows anywhere.
- if (emitArcs(fsa, null, linearized, gtl, nodeDataLength))
- break;
-
- gtl++;
- }
-
- /*
- * Emit the header.
- */
- os.write(new byte[] { '\\', 'f', 's', 'a' });
- os.write(FSA5.VERSION);
- os.write(fillerByte);
- os.write(annotationByte);
- os.write((nodeDataLength << 4) | gtl);
-
- /*
- * Emit the automaton.
- */
- boolean gtlUnchanged = emitArcs(fsa, os, linearized, gtl, nodeDataLength);
- assert gtlUnchanged : "gtl changed in the final pass.";
-
- return os;
- }
-
- /**
- * Return supported flags.
- */
- @Override
- public Set<FSAFlags> getFlags() {
- return flags;
- }
-
- /**
- * Linearization of states.
- */
- private int[] linearize(final FSA fsa) {
- int[] linearized = new int[0];
- int last = 0;
-
- BitSet visited = new BitSet();
- IntStack nodes = new IntStack();
- nodes.push(fsa.getRootNode());
-
- while (!nodes.isEmpty()) {
- final int node = nodes.pop();
- if (visited.get(node))
- continue;
-
- if (last >= linearized.length) {
- linearized = Arrays.copyOf(linearized, linearized.length + 100000);
- }
-
- visited.set(node);
- linearized[last++] = node;
-
- for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
- if (!fsa.isArcTerminal(arc)) {
- int target = fsa.getEndNode(arc);
- if (!visited.get(target))
- nodes.push(target);
- }
- }
- }
-
- return Arrays.copyOf(linearized, last);
- }
-
- /**
- * Update arc offsets assuming the given goto length.
- */
- private boolean emitArcs(FSA fsa, OutputStream os, int[] linearized,
- int gtl, int nodeDataLength) throws IOException {
- final ByteBuffer bb = ByteBuffer.allocate(Math.max(MAX_NODE_DATA_SIZE,
- MAX_ARC_SIZE));
-
- int offset = 0;
-
- // Add dummy terminal state.
- offset += emitNodeData(bb, os, nodeDataLength, 0);
- offset += emitArc(bb, os, gtl, 0, (byte) 0, 0);
-
- // Add epsilon state.
- offset += emitNodeData(bb, os, nodeDataLength, 0);
- if (fsa.getRootNode() != 0)
- offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC | FSA5.BIT_TARGET_NEXT, (byte) '^', 0);
- else
- offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC , (byte) '^', 0);
-
- int maxStates = linearized.length;
- for (int j = 0; j < maxStates; j++) {
- final int s = linearized[j];
-
- if (os == null) {
- offsets.put(s, offset);
- } else {
- assert offsets.get(s) == offset : s + " " + offsets.get(s) + " " + offset;
- }
-
- offset += emitNodeData(bb, os, nodeDataLength, withNumbers ? numbers.get(s) : 0);
-
- for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
- int targetOffset;
- final int target;
- if (fsa.isArcTerminal(arc)) {
- targetOffset = 0;
- target = 0;
- } else {
- target = fsa.getEndNode(arc);
- targetOffset = offsets.get(target);
- }
-
- int flags = 0;
- if (fsa.isArcFinal(arc)) {
- flags |= FSA5.BIT_FINAL_ARC;
- }
-
- if (fsa.getNextArc(arc) == 0) {
- flags |= FSA5.BIT_LAST_ARC;
-
- if (j + 1 < maxStates && target == linearized[j + 1]
- && targetOffset != 0) {
- flags |= FSA5.BIT_TARGET_NEXT;
- targetOffset = 0;
- }
- }
-
- int bytes = emitArc(bb, os, gtl, flags, fsa.getArcLabel(arc), targetOffset);
- if (bytes < 0)
- // gtl too small. interrupt eagerly.
- return false;
-
- offset += bytes;
- }
- }
-
- return true;
- }
-
- /** */
- private int emitArc(ByteBuffer bb, OutputStream os, int gtl, int flags, byte label, int targetOffset)
- throws IOException
- {
- int arcBytes = (flags & FSA5.BIT_TARGET_NEXT) != 0 ? SIZEOF_FLAGS : gtl;
-
- flags |= (targetOffset << 3);
- bb.put(label);
- for (int b = 0; b < arcBytes; b++) {
- bb.put((byte) flags);
- flags >>>= 8;
- }
-
- if (flags != 0) {
- // gtl too small. interrupt eagerly.
- return -1;
- }
-
- bb.flip();
- int bytes = bb.remaining();
- if (os != null) {
- os.write(bb.array(), bb.position(), bb.remaining());
- }
- bb.clear();
-
- return bytes;
- }
-
- /** */
- private int emitNodeData(ByteBuffer bb, OutputStream os,
- int nodeDataLength, int number) throws IOException {
- if (nodeDataLength > 0 && os != null) {
- for (int i = 0; i < nodeDataLength; i++) {
- bb.put((byte) number);
- number >>>= 8;
- }
-
- bb.flip();
- os.write(bb.array(), bb.position(), bb.remaining());
- bb.clear();
- }
-
- return nodeDataLength;
- }
-}
diff --git a/src/morfologik/fsa/FSABuilder.java b/src/morfologik/fsa/FSABuilder.java
deleted file mode 100644
index 0cf7cc0..0000000
--- a/src/morfologik/fsa/FSABuilder.java
+++ /dev/null
@@ -1,486 +0,0 @@
-package morfologik.fsa;
-
-import java.util.*;
-
-import morfologik.util.Arrays;
-
-import static morfologik.fsa.ConstantArcSizeFSA.*;
-
-/**
- * Fast, memory-conservative finite state automaton builder, returning a
- * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction
- * speed and memory consumption).
- */
-public final class FSABuilder {
- /**
- * Debug and information constants.
- *
- * @see FSABuilder#getInfo()
- */
- public enum InfoEntry {
- SERIALIZATION_BUFFER_SIZE("Serialization buffer size"),
- SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"),
- CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"),
- MAX_ACTIVE_PATH_LENGTH("Max active path"),
- STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"),
- STATE_REGISTRY_SIZE("Registry hash entries"),
- ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)");
-
- private final String stringified;
-
- InfoEntry(String stringified) {
- this.stringified = stringified;
- }
-
- @Override
- public String toString() {
- return stringified;
- }
- }
-
- /** A megabyte. */
- private final static int MB = 1024 * 1024;
-
- /**
- * Internal serialized FSA buffer expand ratio.
- */
- private final static int BUFFER_GROWTH_SIZE = 5 * MB;
-
- /**
- * Maximum number of labels from a single state.
- */
- private final static int MAX_LABELS = 256;
-
- /**
- * Comparator comparing full byte arrays consistently with
- * {@link #compare(byte[], int, int, byte[], int, int)}.
- */
- public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() {
- public int compare(byte[] o1, byte[] o2) {
- return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length);
- }
- };
-
- /**
- * Internal serialized FSA buffer expand ratio.
- */
- private final int bufferGrowthSize;
-
- /**
- * Holds serialized and mutable states. Each state is a sequential list of
- * arcs, the last arc is marked with {@link #BIT_ARC_LAST}.
- */
- private byte[] serialized = new byte[0];
-
- /**
- * Number of bytes already taken in {@link #serialized}. Start from 1 to
- * keep 0 a sentinel value (for the hash set and final state).
- */
- private int size;
-
- /**
- * States on the "active path" (still mutable). Values are addresses of each
- * state's first arc.
- */
- private int[] activePath = new int[0];
-
- /**
- * Current length of the active path.
- */
- private int activePathLen;
-
- /**
- * The next offset at which an arc will be added to the given state on
- * {@link #activePath}.
- */
- private int[] nextArcOffset = new int[0];
-
- /**
- * Root state. If negative, the automaton has been built already and cannot be extended.
- */
- private int root;
-
- /**
- * An epsilon state. The first and only arc of this state points either
- * to the root or to the terminal state, indicating an empty automaton.
- */
- private int epsilon;
-
- /**
- * Hash set of state addresses in {@link #serialized}, hashed by
- * {@link #hash(int, int)}. Zero reserved for an unoccupied slot.
- */
- private int[] hashSet = new int[2];
-
- /**
- * Number of entries currently stored in {@link #hashSet}.
- */
- private int hashSize = 0;
-
- /**
- * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions only.
- */
- private byte [] previous;
-
- /**
- * Information about the automaton and its compilation.
- */
- private TreeMap<InfoEntry, Object> info;
-
- /**
- * {@link #previous} sequence's length, used in assertions only.
- */
- private int previousLength;
-
- /** */
- public FSABuilder() {
- this(BUFFER_GROWTH_SIZE);
- }
-
- /** */
- public FSABuilder(int bufferGrowthSize) {
- this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS);
-
- // Allocate epsilon state.
- epsilon = allocateState(1);
- serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST;
-
- // Allocate root, with an initial empty set of output arcs.
- expandActivePath(1);
- root = activePath[0];
- }
-
- /**
- * Add a single sequence of bytes to the FSA. The input must be lexicographically greater
- * than any previously added sequence.
- */
- public void add(byte[] sequence, int start, int len) {
- assert serialized != null : "Automaton already built.";
- assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 :
- "Input must be sorted: "
- + Arrays.toString(previous, 0, previousLength) + " >= "
- + Arrays.toString(sequence, start, len);
- assert setPrevious(sequence, start, len);
-
- // Determine common prefix length.
- final int commonPrefix = commonPrefix(sequence, start, len);
-
- // Make room for extra states on active path, if needed.
- expandActivePath(len);
-
- // Freeze all the states after the common prefix.
- for (int i = activePathLen - 1; i > commonPrefix; i--) {
- final int frozenState = freezeState(i);
- setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState);
- nextArcOffset[i] = activePath[i];
- }
-
- // Create arcs to new suffix states.
- for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) {
- final int p = nextArcOffset[i - 1];
-
- serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0);
- serialized[p + LABEL_OFFSET] = sequence[j++];
- setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]);
-
- nextArcOffset[i - 1] = p + ARC_SIZE;
- }
-
- // Save last sequence's length so that we don't need to calculate it again.
- this.activePathLen = len;
- }
-
- /** Number of serialization buffer reallocations. */
- private int serializationBufferReallocations;
-
- /**
- * Complete the automaton.
- */
- public FSA complete() {
- add(new byte[0], 0, 0);
-
- if (nextArcOffset[0] - activePath[0] == 0) {
- // An empty FSA.
- setArcTarget(epsilon, TERMINAL_STATE);
- } else {
- // An automaton with at least a single arc from root.
- root = freezeState(0);
- setArcTarget(epsilon, root);
- }
-
- info = new TreeMap<InfoEntry, Object>();
- info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length);
- info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations);
- info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size);
- info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length);
- info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length);
- info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize);
- info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB,
- (this.serialized.length + this.hashSet.length * 4) / (double) MB);
-
- final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon);
- this.serialized = null;
- this.hashSet = null;
- return fsa;
- }
-
- /**
- * Build a minimal, deterministic automaton from a sorted list of byte sequences.
- */
- public static FSA build(byte[][] input) {
- final FSABuilder builder = new FSABuilder();
-
- for (byte [] chs : input)
- builder.add(chs, 0, chs.length);
-
- return builder.complete();
- }
-
- /**
- * Build a minimal, deterministic automaton from an iterable list of byte sequences.
- */
- public static FSA build(Iterable<byte[]> input) {
- final FSABuilder builder = new FSABuilder();
-
- for (byte [] chs : input)
- builder.add(chs, 0, chs.length);
-
- return builder.complete();
- }
-
- /**
- * Return various statistics concerning the FSA and its compilation.
- */
- public Map<InfoEntry, Object> getInfo() {
- return info;
- }
-
- /** Is this arc the state's last? */
- private boolean isArcLast(int arc) {
- return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
- }
-
- /** Is this arc final? */
- private boolean isArcFinal(int arc) {
- return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
- }
-
- /** Get label's arc. */
- private byte getArcLabel(int arc) {
- return serialized[arc + LABEL_OFFSET];
- }
-
- /**
- * Fills the target state address of an arc.
- */
- private void setArcTarget(int arc, int state) {
- arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE;
- for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) {
- serialized[--arc] = (byte) state;
- state >>>= 8;
- }
- }
-
- /**
- * Returns the address of an arc.
- */
- private int getArcTarget(int arc) {
- arc += ADDRESS_OFFSET;
- return (serialized[arc]) << 24 |
- (serialized[arc + 1] & 0xff) << 16 |
- (serialized[arc + 2] & 0xff) << 8 |
- (serialized[arc + 3] & 0xff);
- }
-
- /**
- * @return The number of common prefix characters with the previous
- * sequence.
- */
- private int commonPrefix(byte[] sequence, int start, int len) {
- // Empty root state case.
- final int max = Math.min(len, activePathLen);
- int i;
- for (i = 0; i < max; i++) {
- final int lastArc = nextArcOffset[i] - ARC_SIZE;
- if (sequence[start++] != getArcLabel(lastArc)) {
- break;
- }
- }
-
- return i;
- }
-
- /**
- * Freeze a state: try to find an equivalent state in the interned states
- * dictionary first, if found, return it, otherwise, serialize the mutable
- * state at <code>activePathIndex</code> and return it.
- */
- private int freezeState(final int activePathIndex) {
- final int start = activePath[activePathIndex];
- final int end = nextArcOffset[activePathIndex];
- final int len = end - start;
-
- // Set the last arc flag on the current active path's state.
- serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST;
-
- // Try to locate a state with an identical content in the hash set.
- final int bucketMask = (hashSet.length - 1);
- int slot = hash(start, len) & bucketMask;
- for (int i = 0;;) {
- int state = hashSet[slot];
- if (state == 0) {
- state = hashSet[slot] = serialize(activePathIndex);
- if (++hashSize > hashSet.length / 2)
- expandAndRehash();
- return state;
- } else if (equivalent(state, start, len)) {
- return state;
- }
-
- slot = (slot + (++i)) & bucketMask;
- }
- }
-
- /**
- * Reallocate and rehash the hash set.
- */
- private void expandAndRehash() {
- final int[] newHashSet = new int[hashSet.length * 2];
- final int bucketMask = (newHashSet.length - 1);
-
- for (int j = 0; j < hashSet.length; j++) {
- final int state = hashSet[j];
- if (state > 0) {
- int slot = hash(state, stateLength(state)) & bucketMask;
- for (int i = 0; newHashSet[slot] > 0;) {
- slot = (slot + (++i)) & bucketMask;
- }
- newHashSet[slot] = state;
- }
- }
- this.hashSet = newHashSet;
- }
-
- /**
- * The total length of the serialized state data (all arcs).
- */
- private int stateLength(int state) {
- int arc = state;
- while (!isArcLast(arc)) {
- arc += ARC_SIZE;
- }
- return arc - state + ARC_SIZE;
- }
-
- /** Return <code>true</code> if two regions in {@link #serialized} are identical. */
- private boolean equivalent(int start1, int start2, int len) {
- if (start1 + len > size || start2 + len > size)
- return false;
-
- while (len-- > 0)
- if (serialized[start1++] != serialized[start2++])
- return false;
-
- return true;
- }
-
- /**
- * Serialize a given state on the active path.
- */
- private int serialize(final int activePathIndex) {
- expandBuffers();
-
- final int newState = size;
- final int start = activePath[activePathIndex];
- final int len = nextArcOffset[activePathIndex] - start;
- System.arraycopy(serialized, start, serialized, newState, len);
-
- size += len;
- return newState;
- }
-
- /**
- * Hash code of a fragment of {@link #serialized} array.
- */
- private int hash(int start, int byteCount) {
- assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?";
-
- int h = 0;
- for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) {
- h = 17 * h + getArcLabel(start);
- h = 17 * h + getArcTarget(start);
- if (isArcFinal(start)) h += 17;
- }
-
- return h;
- }
-
- /**
- * Append a new mutable state to the active path.
- */
- private void expandActivePath(int size) {
- if (activePath.length < size) {
- final int p = activePath.length;
- activePath = java.util.Arrays.copyOf(activePath, size);
- nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size);
-
- for (int i = p; i < size; i++) {
- nextArcOffset[i] = activePath[i] =
- allocateState(/* assume max labels count */ MAX_LABELS);
- }
- }
- }
-
- /**
- * Expand internal buffers for the next state.
- */
- private void expandBuffers() {
- if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) {
- serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
- serializationBufferReallocations++;
- }
- }
-
- /**
- * Allocate space for a state with the given number of outgoing labels.
- *
- * @return state offset
- */
- private int allocateState(int labels) {
- expandBuffers();
- final int state = size;
- size += labels * ARC_SIZE;
- return state;
- }
-
- /**
- * Copy <code>current</code> into an internal buffer.
- */
- private boolean setPrevious(byte [] sequence, int start, int length) {
- if (previous == null || previous.length < length) {
- previous = new byte [length];
- }
-
- System.arraycopy(sequence, start, previous, 0, length);
- previousLength = length;
- return true;
- }
-
- /**
- * Lexicographic order of input sequences. By default, consistent with the "C" sort
- * (absolute value of bytes, 0-255).
- */
- public static int compare(byte [] s1, int start1, int lens1,
- byte [] s2, int start2, int lens2) {
- final int max = Math.min(lens1, lens2);
-
- for (int i = 0; i < max; i++) {
- final byte c1 = s1[start1++];
- final byte c2 = s2[start2++];
- if (c1 != c2)
- return (c1 & 0xff) - (c2 & 0xff);
- }
-
- return lens1 - lens2;
- }
-}
diff --git a/src/morfologik/fsa/FSAFinalStatesIterator.java b/src/morfologik/fsa/FSAFinalStatesIterator.java
deleted file mode 100644
index 9e381f4..0000000
--- a/src/morfologik/fsa/FSAFinalStatesIterator.java
+++ /dev/null
@@ -1,154 +0,0 @@
-package morfologik.fsa;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-/**
- * An iterator that traverses the right language of a given node (all sequences
- * reachable from a given node).
- */
-public final class FSAFinalStatesIterator implements Iterator<ByteBuffer> {
- /**
- * Default expected depth of the recursion stack (estimated longest sequence
- * in the automaton). Buffers expand by the same value if exceeded.
- */
- private final static int EXPECTED_MAX_STATES = 15;
-
- /** The FSA to which this iterator belongs. */
- private final FSA fsa;
-
- /** An internal cache for the next element in the FSA */
- private ByteBuffer nextElement;
-
- /**
- * A buffer for the current sequence of bytes from the current node to the
- * root.
- */
- private byte[] buffer = new byte[EXPECTED_MAX_STATES];
-
- /** Reusable byte buffer wrapper around {@link #buffer}. */
- private ByteBuffer bufferWrapper = ByteBuffer.wrap(buffer);
-
- /** An arc stack for DFS when processing the automaton. */
- private int[] arcs = new int[EXPECTED_MAX_STATES];
-
- /** Current processing depth in {@link #arcs}. */
- private int position;
-
- /**
- * Create an instance of the iterator for a given node.
- */
- public FSAFinalStatesIterator(FSA fsa, int node) {
- this.fsa = fsa;
-
- if (fsa.getFirstArc(node) != 0) {
- restartFrom(node);
- }
- }
-
- /**
- * Restart walking from <code>node</code>. Allows iterator reuse.
- */
- public void restartFrom(int node) {
- position = 0;
- bufferWrapper.clear();
- nextElement = null;
-
- pushNode(node);
- }
-
- /** Returns <code>true</code> if there are still elements in this iterator. */
- @Override
- public boolean hasNext() {
- if (nextElement == null) {
- nextElement = advance();
- }
-
- return nextElement != null;
- }
-
- /**
- * @return Returns a {@link ByteBuffer} with the sequence corresponding to
- * the next final state in the automaton.
- */
- @Override
- public ByteBuffer next() {
- if (nextElement != null) {
- final ByteBuffer cache = nextElement;
- nextElement = null;
- return cache;
- } else {
- final ByteBuffer cache = advance();
- if (cache == null) {
- throw new NoSuchElementException();
- }
- return cache;
- }
- }
-
- /**
- * Advances to the next available final state.
- */
- private final ByteBuffer advance() {
- if (position == 0) {
- return null;
- }
-
- while (position > 0) {
- final int lastIndex = position - 1;
- final int arc = arcs[lastIndex];
-
- if (arc == 0) {
- // Remove the current node from the queue.
- position--;
- continue;
- }
-
- // Go to the next arc, but leave it on the stack
- // so that we keep the recursion depth level accurate.
- arcs[lastIndex] = fsa.getNextArc(arc);
-
- // Expand buffer if needed.
- final int bufferLength = this.buffer.length;
- if (lastIndex >= bufferLength) {
- this.buffer = Arrays.copyOf(buffer, bufferLength
- + EXPECTED_MAX_STATES);
- this.bufferWrapper = ByteBuffer.wrap(buffer);
- }
- buffer[lastIndex] = fsa.getArcLabel(arc);
-
- if (!fsa.isArcTerminal(arc)) {
- // Recursively descend into the arc's node.
- pushNode(fsa.getEndNode(arc));
- }
-
- if (fsa.isArcFinal(arc)) {
- bufferWrapper.clear();
- bufferWrapper.limit(lastIndex + 1);
- return bufferWrapper;
- }
- }
-
- return null;
- }
-
- /**
- * Not implemented in this iterator.
- */
- @Override
- public void remove() {
- throw new UnsupportedOperationException("Read-only iterator.");
- }
-
- /**
- * Descends to a given node, adds its arcs to the stack to be traversed.
- */
- private void pushNode(int node) {
- // Expand buffers if needed.
- if (position == arcs.length) {
- arcs = Arrays.copyOf(arcs, arcs.length + EXPECTED_MAX_STATES);
- }
-
- arcs[position++] = fsa.getFirstArc(node);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/fsa/FSAFlags.java b/src/morfologik/fsa/FSAFlags.java
deleted file mode 100644
index 7b9a730..0000000
--- a/src/morfologik/fsa/FSAFlags.java
+++ /dev/null
@@ -1,64 +0,0 @@
-package morfologik.fsa;
-
-import java.util.Set;
-
-/**
- * FSA automaton flags. Where applicable, flags follow Daciuk's <code>fsa</code> package.
- */
-public enum FSAFlags {
- /** Daciuk: flexible FSA encoding. */
- FLEXIBLE(1 << 0),
-
- /** Daciuk: stop bit in use. */
- STOPBIT(1 << 1),
-
- /** Daciuk: next bit in use. */
- NEXTBIT(1 << 2),
-
- /** Daciuk: tails compression. */
- TAILS(1 << 3),
-
- /*
- * These flags are outside of byte range (never occur in Daciuk's FSA).
- */
-
- /**
- * The FSA contains right-language count numbers on states.
- *
- * @see FSA#getRightLanguageCount(int)
- */
- NUMBERS(1 << 8),
-
- /**
- * The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package
- * compatibility).
- */
- SEPARATORS(1 << 9);
-
- /**
- * Bit mask for the corresponding flag.
- */
- public final int bits;
-
- /** */
- private FSAFlags(int bits) {
- this.bits = bits;
- }
-
- /**
- * Returns <code>true</code> if the corresponding flag is set in the bit set.
- */
- public static boolean isSet(int flags, FSAFlags flag) {
- return (flags & flag.bits) != 0;
- }
-
- /**
- * Returns the set of flags encoded in a single short.
- */
- public static short asShort(Set<FSAFlags> flags) {
- short value = 0;
- for (FSAFlags f : flags)
- value |= f.bits;
- return value;
- }
-} \ No newline at end of file
diff --git a/src/morfologik/fsa/FSAHeader.java b/src/morfologik/fsa/FSAHeader.java
deleted file mode 100644
index 76fd6ff..0000000
--- a/src/morfologik/fsa/FSAHeader.java
+++ /dev/null
@@ -1,52 +0,0 @@
-package morfologik.fsa;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import morfologik.util.FileUtils;
-import static morfologik.util.FileUtils.*;
-
-/**
- * Standard FSA file header, as described in <code>fsa</code> package documentation.
- */
-final class FSAHeader {
- /**
- * FSA magic (4 bytes).
- */
- public final static int FSA_MAGIC = ('\\' << 24) | ('f' << 16) | ('s' << 8) | ('a');
-
- /**
- * Maximum length of the header block.
- */
- public static final int MAX_HEADER_LENGTH = 4 + 8;
-
- /** FSA version number. */
- public byte version;
-
- /** Filler character. */
- public byte filler;
-
- /** Annotation character. */
- public byte annotation;
-
- /** Goto field (may be a compound, depending on the automaton version). */
- public byte gtl;
-
- /**
- * Read FSA header from a stream, consuming its bytes.
- *
- * @throws IOException If the stream ends prematurely or if it contains invalid data.
- */
- public static FSAHeader read(InputStream in) throws IOException {
- if (FSA_MAGIC != FileUtils.readInt(in))
- throw new IOException("Invalid file header magic bytes.");
-
- final FSAHeader h = new FSAHeader();
- h.version = readByte(in);
- h.filler = readByte(in);
- h.annotation = readByte(in);
- h.gtl = readByte(in);
-
- return h;
- }
-}
diff --git a/src/morfologik/fsa/FSAInfo.java b/src/morfologik/fsa/FSAInfo.java
deleted file mode 100644
index dc5cb27..0000000
--- a/src/morfologik/fsa/FSAInfo.java
+++ /dev/null
@@ -1,157 +0,0 @@
-package morfologik.fsa;
-
-import java.util.BitSet;
-import java.util.HashMap;
-
-/**
- * Compute additional information about an FSA: number of arcs, nodes, etc.
- */
-public final class FSAInfo {
- /**
- * Computes the exact number of states and nodes by recursively traversing
- * the FSA.
- */
- private static class NodeVisitor {
- final BitSet visitedArcs = new BitSet();
- final BitSet visitedNodes = new BitSet();
-
- int nodes;
- int arcs;
- int totalArcs;
-
- private final FSA fsa;
-
- NodeVisitor(FSA fsa) {
- this.fsa = fsa;
- }
-
- public void visitNode(final int node) {
- if (visitedNodes.get(node)) {
- return;
- }
- visitedNodes.set(node);
-
- nodes++;
- for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa
- .getNextArc(arc)) {
- if (!visitedArcs.get(arc)) {
- arcs++;
- }
- totalArcs++;
- visitedArcs.set(arc);
-
- if (!fsa.isArcTerminal(arc)) {
- visitNode(fsa.getEndNode(arc));
- }
- }
- }
- }
-
- /**
- * Computes the exact number of final states.
- */
- private static class FinalStateVisitor {
- final HashMap<Integer, Integer> visitedNodes
- = new HashMap<Integer, Integer>();
-
- private final FSA fsa;
-
- FinalStateVisitor(FSA fsa) {
- this.fsa = fsa;
- }
-
- public int visitNode(int node) {
- Integer cached = visitedNodes.get(node);
- if (cached != null)
- return cached;
-
- int fromHere = 0;
- for (int arc = fsa.getFirstArc(node);
- arc != 0; arc = fsa.getNextArc(arc))
- {
- if (fsa.isArcFinal(arc))
- fromHere++;
-
- if (!fsa.isArcTerminal(arc)) {
- fromHere += visitNode(fsa.getEndNode(arc));
- }
- }
- visitedNodes.put(node, fromHere);
- return fromHere;
- }
- }
-
- /**
- * Number of nodes in the automaton.
- */
- public final int nodeCount;
-
- /**
- * Number of arcs in the automaton, excluding an arcs from the zero node
- * (initial) and an arc from the start node to the root node.
- */
- public final int arcsCount;
-
- /**
- * Total number of arcs, counting arcs that physically overlap due to
- * merging.
- */
- public final int arcsCountTotal;
-
- /**
- * Number of final states (number of input sequences stored in the automaton).
- */
- public final int finalStatesCount;
-
- /**
- * Arcs size (in serialized form).
- */
- public final int size;
-
- /*
- *
- */
- public FSAInfo(FSA fsa) {
- final NodeVisitor w = new NodeVisitor(fsa);
- int root = fsa.getRootNode();
- if (root > 0) {
- w.visitNode(root);
- }
-
- this.nodeCount = 1 + w.nodes;
- this.arcsCount = 1 + w.arcs;
- this.arcsCountTotal = 1 + w.totalArcs;
-
- final FinalStateVisitor fsv = new FinalStateVisitor(fsa);
- this.finalStatesCount = fsv.visitNode(fsa.getRootNode());
-
- if (fsa instanceof FSA5) {
- this.size = ((FSA5) fsa).arcs.length;
- } else {
- this.size = 0;
- }
- }
-
- /*
- *
- */
- public FSAInfo(int nodeCount, int arcsCount, int arcsCountTotal, int finalStatesCount) {
- this.nodeCount = nodeCount;
- this.arcsCount = arcsCount;
- this.arcsCountTotal = arcsCountTotal;
- this.finalStatesCount = finalStatesCount;
- this.size = 0;
- }
-
- /*
- *
- */
- @Override
- public String toString() {
- return "Nodes: " + nodeCount
- + ", arcs visited: " + arcsCount
- + ", arcs total: " + arcsCountTotal
- + ", final states: " + finalStatesCount
- + ", size: " + size;
- }
-}
diff --git a/src/morfologik/fsa/FSASerializer.java b/src/morfologik/fsa/FSASerializer.java
deleted file mode 100644
index 414640e..0000000
--- a/src/morfologik/fsa/FSASerializer.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package morfologik.fsa;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.Set;
-
-import morfologik.tools.IMessageLogger;
-
-/**
- * All FSA serializers to binary formats will implement this interface.
- */
-public interface FSASerializer {
- /**
- * Serialize a finite state automaton to an output stream.
- */
- public <T extends OutputStream> T serialize(FSA fsa, T os) throws IOException;
-
- /**
- * Returns the set of flags supported by the serializer (and the output automaton).
- */
- public Set<FSAFlags> getFlags();
-
- /**
- * Log extra messages during construction.
- */
- public FSASerializer withLogger(IMessageLogger logger);
-
- /**
- * Supports built-in filler separator. Only if {@link #getFlags()} returns
- * {@link FSAFlags#SEPARATORS}.
- */
- public FSASerializer withFiller(byte filler);
-
- /**
- * Supports built-in annotation separator. Only if {@link #getFlags()} returns
- * {@link FSAFlags#SEPARATORS}.
- */
- public FSASerializer withAnnotationSeparator(byte annotationSeparator);
-
- /**
- * Supports built-in right language count on nodes, speeding up perfect hash counts.
- * Only if {@link #getFlags()} returns {@link FSAFlags#NUMBERS}.
- */
- public FSASerializer withNumbers();
-}
diff --git a/src/morfologik/fsa/FSATraversal.java b/src/morfologik/fsa/FSATraversal.java
deleted file mode 100644
index 9e59003..0000000
--- a/src/morfologik/fsa/FSATraversal.java
+++ /dev/null
@@ -1,169 +0,0 @@
-package morfologik.fsa;
-
-import static morfologik.fsa.MatchResult.*;
-
-/**
- * This class implements some common matching and scanning operations on a
- * generic FSA.
- */
-public final class FSATraversal {
- /**
- * Target automaton.
- */
- private final FSA fsa;
-
- /**
- * Traversals of the given FSA.
- */
- public FSATraversal(FSA fsa) {
- this.fsa = fsa;
- }
-
- /**
- * Calculate perfect hash for a given input sequence of bytes. The perfect hash requires
- * that {@link FSA} is built with {@link FSAFlags#NUMBERS} and corresponds to the sequential
- * order of input sequences used at automaton construction time.
- *
- * @param start Start index in the sequence array.
- * @param length Length of the byte sequence, must be at least 1.
- *
- * @return Returns a unique integer assigned to the input sequence in the automaton (reflecting
- * the number of that sequence in the input used to build the automaton). Returns a negative
- * integer if the input sequence was not part of the input from which the automaton was created.
- * The type of mismatch is a constant defined in {@link MatchResult}.
- */
- public int perfectHash(byte[] sequence, int start, int length, int node) {
- assert fsa.getFlags().contains(FSAFlags.NUMBERS) : "FSA not built with NUMBERS option.";
- assert length > 0 : "Must be a non-empty sequence.";
-
- int hash = 0;
- final int end = start + length - 1;
-
- int seqIndex = start;
- byte label = sequence[seqIndex];
-
- // Seek through the current node's labels, looking for 'label', update hash.
- for (int arc = fsa.getFirstArc(node); arc != 0;) {
- if (fsa.getArcLabel(arc) == label) {
- if (fsa.isArcFinal(arc)) {
- if (seqIndex == end)
- return hash;
-
- hash++;
- }
-
- if (fsa.isArcTerminal(arc)) {
- /* The automaton contains a prefix of the input sequence. */
- return AUTOMATON_HAS_PREFIX;
- }
-
- // The sequence is a prefix of one of the sequences stored in the automaton.
- if (seqIndex == end) {
- return SEQUENCE_IS_A_PREFIX;
- }
-
- // Make a transition along the arc, go the target node's first arc.
- arc = fsa.getFirstArc(fsa.getEndNode(arc));
- label = sequence[++seqIndex];
- continue;
- } else {
- if (fsa.isArcFinal(arc))
- hash++;
- if (!fsa.isArcTerminal(arc))
- hash += fsa.getRightLanguageCount(fsa.getEndNode(arc));
- }
-
- arc = fsa.getNextArc(arc);
- }
-
- // Labels of this node ended without a match on the sequence.
- // Perfect hash does not exist.
- return NO_MATCH;
- }
-
- /**
- * @see #perfectHash(byte[], int, int, int)
- */
- public int perfectHash(byte[] sequence) {
- return perfectHash(sequence, 0, sequence.length, fsa.getRootNode());
- }
-
- /**
- * Same as {@link #match(byte[], int, int, int)}, but allows passing
- * a reusable {@link MatchResult} object so that no intermediate garbage is
- * produced.
- *
- * @return The same object as <code>result</code>, but with reset internal
- * type and other fields.
- */
- public MatchResult match(MatchResult result,
- byte[] sequence, int start, int length, int node)
- {
- if (node == 0) {
- result.reset(NO_MATCH, start, node);
- return result;
- }
-
- final FSA fsa = this.fsa;
- final int end = start + length;
- for (int i = start; i < end; i++) {
- final int arc = fsa.getArc(node, sequence[i]);
- if (arc != 0) {
- if (fsa.isArcFinal(arc) && i + 1 == end) {
- /* The automaton has an exact match of the input sequence. */
- result.reset(EXACT_MATCH, i, node);
- return result;
- }
-
- if (fsa.isArcTerminal(arc)) {
- /* The automaton contains a prefix of the input sequence. */
- result.reset(AUTOMATON_HAS_PREFIX, i + 1, 0);
- return result;
- }
-
- // Make a transition along the arc.
- node = fsa.getEndNode(arc);
- } else {
- result.reset(NO_MATCH, i, node);
- return result;
- }
- }
-
- /* The sequence is a prefix of at least one sequence in the automaton. */
- result.reset(SEQUENCE_IS_A_PREFIX, 0, node);
- return result;
- }
-
- /**
- * Finds a matching path in the dictionary for a given sequence of labels
- * from <code>sequence</code> and starting at node <code>node</code>.
- *
- * @param sequence
- * An array of labels to follow in the FSA.
- * @param start
- * Starting index in <code>sequence</code>.
- * @param length
- * How many symbols to consider from <code>sequence</code>?
- * @param node
- * Start node identifier in the FSA.
- *
- * @see #match(byte [], int)
- */
- public MatchResult match(byte[] sequence, int start, int length, int node) {
- return match(new MatchResult(), sequence, start, length, node);
- }
-
- /**
- * @see #match(byte[], int, int, int)
- */
- public MatchResult match(byte[] sequence, int node) {
- return match(sequence, 0, sequence.length, node);
- }
-
- /**
- * @see #match(byte[], int, int, int)
- */
- public MatchResult match(byte[] sequence) {
- return match(sequence, fsa.getRootNode());
- }
-} \ No newline at end of file
diff --git a/src/morfologik/fsa/FSAUtils.java b/src/morfologik/fsa/FSAUtils.java
deleted file mode 100644
index cad611e..0000000
--- a/src/morfologik/fsa/FSAUtils.java
+++ /dev/null
@@ -1,202 +0,0 @@
-package morfologik.fsa;
-
-import java.io.IOException;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.BitSet;
-import java.util.TreeMap;
-
-import com.carrotsearch.hppc.IntIntOpenHashMap;
-
-/**
- * Other FSA-related utilities not directly associated with the class hierarchy.
- */
-public final class FSAUtils {
- public final static class IntIntHolder {
- public int a;
- public int b;
-
- public IntIntHolder(int a, int b) {
- this.a = a;
- this.b = b;
- }
-
- public IntIntHolder() {
- }
- }
-
- /**
- * Returns the right-language reachable from a given FSA node, formatted
- * as an input for the graphviz package (expressed in the <code>dot</code>
- * language).
- */
- public static String toDot(FSA fsa, int node) {
- try {
- StringWriter w = new StringWriter();
- toDot(w, fsa, node);
- return w.toString();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Saves the right-language reachable from a given FSA node, formatted
- * as an input for the graphviz package (expressed in the <code>dot</code>
- * language), to the given writer.
- */
- public static void toDot(Writer w, FSA fsa, int node) throws IOException {
- w.write("digraph Automaton {\n");
- w.write(" rankdir = LR;\n");
-
- final BitSet visited = new BitSet();
-
- w.write(" stop [shape=doublecircle,label=\"\"];\n");
- w.write(" initial [shape=plaintext,label=\"\"];\n");
- w.write(" initial -> " + node + "\n\n");
-
- visitNode(w, 0, fsa, node, visited);
- w.write("}\n");
- }
-
- private static void visitNode(Writer w, int d, FSA fsa, int s, BitSet visited) throws IOException {
- visited.set(s);
- w.write(" "); w.write(Integer.toString(s));
-
- if (fsa.getFlags().contains(FSAFlags.NUMBERS)) {
- int nodeNumber = fsa.getRightLanguageCount(s);
- w.write(" [shape=circle,label=\"" + nodeNumber + "\"];\n");
- } else {
- w.write(" [shape=circle,label=\"\"];\n");
- }
-
- for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
- w.write(" ");
- w.write(Integer.toString(s));
- w.write(" -> ");
- if (fsa.isArcTerminal(arc)) {
- w.write("stop");
- } else {
- w.write(Integer.toString(fsa.getEndNode(arc)));
- }
-
- final byte label = fsa.getArcLabel(arc);
- w.write(" [label=\"");
- if (Character.isLetterOrDigit(label))
- w.write((char) label);
- else {
- w.write("0x");
- w.write(Integer.toHexString(label & 0xFF));
- }
- w.write("\"");
- if (fsa.isArcFinal(arc)) w.write(" arrowhead=\"tee\"");
- if (fsa instanceof FSA5) {
- if (((FSA5) fsa).isNextSet(arc)) {
- w.write(" color=\"blue\"");
- }
- }
-
- w.write("]\n");
- }
-
- for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
- if (!fsa.isArcTerminal(arc)) {
- int endNode = fsa.getEndNode(arc);
- if (!visited.get(endNode)) {
- visitNode(w, d + 1, fsa, endNode, visited);
- }
- }
- }
- }
-
- /**
- * All byte sequences generated as the right language of <code>state</code>.
- */
- public static ArrayList<byte[]> rightLanguage(FSA fsa, int state) {
- final ArrayList<byte[]> rl = new ArrayList<byte[]>();
- final byte [] buffer = new byte [0];
-
- descend(fsa, state, buffer, 0, rl);
-
- return rl;
- }
-
- /**
- * Recursive descend and collection of the right language.
- */
- private static byte [] descend(FSA fsa, int state, byte [] b, int position, ArrayList<byte[]> rl) {
-
- if (b.length <= position) {
- b = Arrays.copyOf(b, position + 1);
- }
-
- for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
- b[position] = fsa.getArcLabel(arc);
-
- if (fsa.isArcFinal(arc)) {
- rl.add(Arrays.copyOf(b, position + 1));
- }
-
- if (!fsa.isArcTerminal(arc))
- b = descend(fsa, fsa.getEndNode(arc), b, position + 1, rl);
- }
-
- return b;
- }
-
- /**
- * Calculate fan-out ratio.
- * @return The returned array: result[outgoing-arcs]
- */
- public static TreeMap<Integer, Integer> calculateFanOuts(final FSA fsa, int root) {
- final int [] result = new int [256];
- fsa.visitInPreOrder(new StateVisitor() {
- public boolean accept(int state) {
- int count = 0;
- for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc))
- count++;
- result[count]++;
- return true;
- }
- });
-
- TreeMap<Integer, Integer> output = new TreeMap<Integer, Integer>();
-
- int low = 1; // Omit #0, there is always a single node like that (dummy).
- while (low < result.length && result[low] == 0) low++;
-
- int high = result.length - 1;
- while (high >= 0 && result[high] == 0) high--;
-
- for (int i = low; i <= high; i++) {
- output.put(i, result[i]);
- }
-
- return output;
- }
-
- /**
- * Calculate the size of right language for each state in an FSA.
- */
- public static IntIntOpenHashMap rightLanguageForAllStates(final FSA fsa) {
- final IntIntOpenHashMap numbers = new IntIntOpenHashMap();
-
- fsa.visitInPostOrder(new StateVisitor() {
- public boolean accept(int state) {
- int thisNodeNumber = 0;
- for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
- thisNodeNumber +=
- (fsa.isArcFinal(arc) ? 1 : 0) +
- (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc)));
- }
- numbers.put(state, thisNodeNumber);
-
- return true;
- }
- });
-
- return numbers;
- }
-}
diff --git a/src/morfologik/fsa/MatchResult.java b/src/morfologik/fsa/MatchResult.java
deleted file mode 100644
index 2f5cbd7..0000000
--- a/src/morfologik/fsa/MatchResult.java
+++ /dev/null
@@ -1,86 +0,0 @@
-package morfologik.fsa;
-
-/**
- * A matching result returned from {@link FSATraversal}.
- *
- * @see FSATraversal
- */
-public final class MatchResult {
- /**
- * The automaton has exactly one match for the input sequence.
- */
- public static final int EXACT_MATCH = 0;
-
- /**
- * The automaton has no match for the input sequence.
- */
- public static final int NO_MATCH = -1;
-
- /**
- * The automaton contains a prefix of the input sequence. That is:
- * one of the input sequences used to build the automaton is a
- * prefix of the input sequence that is shorter than the sequence.
- *
- * <p>{@link MatchResult#index} will contain an index of the
- * first character of the input sequence not present in the
- * dictionary.</p>
- */
- public static final int AUTOMATON_HAS_PREFIX = -3;
-
- /**
- * The sequence is a prefix of at least one sequence in the automaton.
- * {@link MatchResult#node} returns the node from which all sequences
- * with the given prefix start in the automaton.
- */
- public static final int SEQUENCE_IS_A_PREFIX = -4;
-
- /**
- * One of the match kind constants defined in this class.
- *
- * @see #NO_MATCH
- * @see #EXACT_MATCH
- * @see #AUTOMATON_HAS_PREFIX
- * @see #SEQUENCE_IS_A_PREFIX
- */
- public int kind;
-
- /**
- * Input sequence's index, interpretation depends on {@link #kind}.
- */
- public int index;
-
- /**
- * Automaton node, interpretation depends on the {@link #kind}.
- */
- public int node;
-
- /*
- *
- */
- MatchResult(int kind, int index, int node) {
- reset(kind, index, node);
- }
-
- /*
- *
- */
- MatchResult(int kind) {
- reset(kind, 0, 0);
- }
-
- /*
- *
- */
- public MatchResult() {
- reset(NO_MATCH, 0, 0);
- }
-
- /*
- *
- */
- final void reset(int kind, int index, int node) {
- this.kind = kind;
- this.index = index;
- this.node = node;
- }
-}
diff --git a/src/morfologik/fsa/NullMessageLogger.java b/src/morfologik/fsa/NullMessageLogger.java
deleted file mode 100644
index 6d326d9..0000000
--- a/src/morfologik/fsa/NullMessageLogger.java
+++ /dev/null
@@ -1,24 +0,0 @@
-package morfologik.fsa;
-
-import morfologik.tools.IMessageLogger;
-
-/*
- * Do-nothing logger.
- */
-final class NullMessageLogger implements IMessageLogger {
- @Override
- public void log(String msg) {
- }
-
- @Override
- public void startPart(String header) {
- }
-
- @Override
- public void endPart() {
- }
-
- @Override
- public void log(String header, Object v) {
- }
-}
diff --git a/src/morfologik/fsa/StateVisitor.java b/src/morfologik/fsa/StateVisitor.java
deleted file mode 100644
index 8ced239..0000000
--- a/src/morfologik/fsa/StateVisitor.java
+++ /dev/null
@@ -1,11 +0,0 @@
-package morfologik.fsa;
-
-/**
- * State visitor.
- *
- * @see FSA#visitInPostOrder(StateVisitor)
- * @see FSA#visitInPreOrder(StateVisitor)
- */
-public interface StateVisitor {
- public boolean accept(int state);
-} \ No newline at end of file
diff --git a/src/morfologik/stemming/ArrayViewList.java b/src/morfologik/stemming/ArrayViewList.java
deleted file mode 100644
index 230aef6..0000000
--- a/src/morfologik/stemming/ArrayViewList.java
+++ /dev/null
@@ -1,111 +0,0 @@
-package morfologik.stemming;
-
-import java.util.*;
-
-/**
- * A view over a range of an array.
- */
-@SuppressWarnings("serial")
-final class ArrayViewList<E> extends AbstractList<E>
- implements RandomAccess, java.io.Serializable
-{
- /** Backing array. */
- private E[] a;
- private int start;
- private int length;
-
- /*
- *
- */
- ArrayViewList(E[] array, int start, int length) {
- if (array == null)
- throw new IllegalArgumentException();
- wrap(a, start, length);
- }
-
- /*
- *
- */
- public int size() {
- return length;
- }
-
- /*
- *
- */
- public E get(int index) {
- return a[start + index];
- }
-
- /*
- *
- */
- public E set(int index, E element) {
- throw new UnsupportedOperationException();
- }
-
- /*
- *
- */
- public void add(int index, E element) {
- throw new UnsupportedOperationException();
- }
-
- /*
- *
- */
- public E remove(int index) {
- throw new UnsupportedOperationException();
- }
-
- /*
- *
- */
- public boolean addAll(int index, Collection<? extends E> c) {
- throw new UnsupportedOperationException();
- }
-
- /*
- *
- */
- public int indexOf(Object o) {
- if (o == null) {
- for (int i = start; i < start + length; i++)
- if (a[i] == null)
- return i - start;
- } else {
- for (int i = start; i < start + length; i++)
- if (o.equals(a[i]))
- return i - start;
- }
- return -1;
- }
-
- public ListIterator<E> listIterator() {
- return listIterator(0);
- }
-
- /*
- *
- */
- public ListIterator<E> listIterator(final int index) {
- return Arrays.asList(a).subList(start, start + length).listIterator(
- index);
- }
-
- /*
- *
- */
- public boolean contains(Object o) {
- return indexOf(o) != -1;
- }
-
- /*
- *
- */
- void wrap(E[] array, int start, int length) {
- this.a = array;
- this.start = start;
- this.length = length;
- }
-}
diff --git a/src/morfologik/stemming/Dictionary.java b/src/morfologik/stemming/Dictionary.java
deleted file mode 100644
index 7441d5e..0000000
--- a/src/morfologik/stemming/Dictionary.java
+++ /dev/null
@@ -1,169 +0,0 @@
-package morfologik.stemming;
-
-import java.io.*;
-import java.net.URL;
-import java.util.*;
-
-import morfologik.fsa.FSA;
-import morfologik.util.FileUtils;
-import morfologik.util.ResourceUtils;
-
-/**
- * A dictionary combines {@link FSA} automaton and metadata describing the
- * internals of dictionary entries' coding ({@link DictionaryMetadata}.
- *
- * <p>
- * A dictionary consists of two files:
- * <ul>
- * <li>an actual compressed FSA file,
- * <li>a metadata file, describing the dictionary.
- * </ul>
- * Use static methods in this class to read dictionaries and their metadata.
- */
-public final class Dictionary {
- /**
- * Expected metadata file extension.
- */
- public final static String METADATA_FILE_EXTENSION = "info";
-
- /**
- * {@link FSA} automaton with the compiled dictionary data.
- */
- public final FSA fsa;
-
- /**
- * Metadata associated with the dictionary.
- */
- public final DictionaryMetadata metadata;
-
- /**
- * Default loaded dictionaries.
- */
- public static final WeakHashMap<String, Dictionary> defaultDictionaries = new WeakHashMap<String, Dictionary>();
-
- /**
- * It is strongly recommended to use static methods in this class for
- * reading dictionaries.
- *
- * @param fsa
- * An instantiated {@link FSA} instance.
- *
- * @param metadata
- * A map of attributes describing the compression format and
- * other settings not contained in the FSA automaton. For an
- * explanation of available attributes and their possible values,
- * see {@link DictionaryMetadata}.
- */
- public Dictionary(FSA fsa, DictionaryMetadata metadata) {
- this.fsa = fsa;
- this.metadata = metadata;
- }
-
- /**
- * Attempts to load a dictionary using the path to the FSA file and the
- * expected metadata extension.
- */
- public static Dictionary read(File fsaFile) throws IOException {
- final File featuresFile = new File(fsaFile.getParent(),
- getExpectedFeaturesName(fsaFile.getName()));
-
- FileUtils.assertExists(featuresFile, true, false);
-
- return readAndClose(new FileInputStream(fsaFile), new FileInputStream(
- featuresFile));
- }
-
- /**
- * <p>
- * Attempts to load a dictionary using the URL to the FSA file and the
- * expected metadata extension.
- *
- * <p>
- * This method can be used to load resource-based dictionaries, but be aware
- * of JAR resource-locking issues that arise from resource URLs.
- */
- public static Dictionary read(URL fsaURL) throws IOException {
- final String fsa = fsaURL.toExternalForm();
- final String features = getExpectedFeaturesName(fsa);
-
- return readAndClose(ResourceUtils.openInputStream(fsa), ResourceUtils
- .openInputStream(features));
- }
-
- /**
- * Attempts to load a dictionary from opened streams of FSA dictionary data
- * and associated metadata.
- */
- public static Dictionary readAndClose(InputStream fsaData,
- InputStream featuresData) throws IOException {
- try {
- final Properties properties = new Properties();
- properties.load(featuresData);
-
- final DictionaryMetadata features = DictionaryMetadata
- .fromMap(properties);
- final FSA fsa = FSA.read(fsaData);
-
- return new Dictionary(fsa, features);
- } finally {
- FileUtils.close(fsaData, featuresData);
- }
- }
-
- /**
- * Returns the expected name of the metadata file, based on the name of the
- * FSA dictionary file. The expected name is resolved by truncating any
- * suffix of <code>name</code> and appending
- * {@link #METADATA_FILE_EXTENSION}.
- */
- public static String getExpectedFeaturesName(String name) {
- final int dotIndex = name.lastIndexOf('.');
- final String featuresName;
- if (dotIndex >= 0) {
- featuresName = name.substring(0, dotIndex) + "."
- + METADATA_FILE_EXTENSION;
- } else {
- featuresName = name + "." + METADATA_FILE_EXTENSION;
- }
-
- return featuresName;
- }
-
- /**
- * Return a built-in dictionary for a given ISO language code. Dictionaries
- * are cached internally for potential reuse.
- *
- * @throws RuntimeException
- * Throws a {@link RuntimeException} if the dictionary is not
- * bundled with the library.
- */
- public static Dictionary getForLanguage(String languageCode) {
- if (languageCode == null || "".equals(languageCode)) {
- throw new IllegalArgumentException(
- "Language code must not be empty.");
- }
-
- synchronized (defaultDictionaries) {
- Dictionary dict = defaultDictionaries.get(languageCode);
- if (dict != null)
- return dict;
-
- try {
- final String dictPath = "morfologik/dictionaries/" + languageCode + ".dict";
- final String metaPath = Dictionary
- .getExpectedFeaturesName(dictPath);
-
- dict = Dictionary.readAndClose(
- ResourceUtils.openInputStream(dictPath),
- ResourceUtils.openInputStream(metaPath));
-
- defaultDictionaries.put(languageCode, dict);
- return dict;
- } catch (IOException e) {
- throw new RuntimeException(
- "Default dictionary resource for language '"
- + languageCode + "not found.", e);
- }
- }
- }
-}
diff --git a/src/morfologik/stemming/DictionaryIterator.java b/src/morfologik/stemming/DictionaryIterator.java
deleted file mode 100644
index 19b6dad..0000000
--- a/src/morfologik/stemming/DictionaryIterator.java
+++ /dev/null
@@ -1,144 +0,0 @@
-package morfologik.stemming;
-
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.CharsetDecoder;
-import java.util.Iterator;
-
-import morfologik.util.BufferUtils;
-
-/**
- * An iterator over {@link WordData} entries of a {@link Dictionary}. The stems
- * can be decoded from compressed format or the compressed form can be
- * preserved.
- */
-public final class DictionaryIterator implements Iterator<WordData> {
- private final CharsetDecoder decoder;
- private final Iterator<ByteBuffer> entriesIter;
- private final WordData entry;
- private final byte separator;
- private final DictionaryMetadata dictionaryMetadata;
- private final boolean decodeStems;
-
- private ByteBuffer inflectedBuffer = ByteBuffer.allocate(0);
- private CharBuffer inflectedCharBuffer = CharBuffer.allocate(0);
- private ByteBuffer temp = ByteBuffer.allocate(0);
-
- public DictionaryIterator(Dictionary dictionary, CharsetDecoder decoder,
- boolean decodeStems) {
- this.entriesIter = dictionary.fsa.iterator();
- this.separator = dictionary.metadata.separator;
- this.dictionaryMetadata = dictionary.metadata;
- this.decoder = decoder;
- this.entry = new WordData(decoder);
- this.decodeStems = decodeStems;
- }
-
- public boolean hasNext() {
- return entriesIter.hasNext();
- }
-
- public WordData next() {
- final ByteBuffer entryBuffer = entriesIter.next();
- entry.reset();
-
- /*
- * Entries are typically: inflected<SEP>codedBase<SEP>tag so try to find
- * this split.
- */
- byte[] ba = entryBuffer.array();
- int bbSize = entryBuffer.remaining();
-
- int sepPos;
- for (sepPos = 0; sepPos < bbSize; sepPos++) {
- if (ba[sepPos] == separator)
- break;
- }
-
- if (sepPos == bbSize) {
- throw new RuntimeException("Invalid dictionary "
- + "entry format (missing separator).");
- }
-
- inflectedBuffer.clear();
- inflectedBuffer = BufferUtils.ensureCapacity(inflectedBuffer, sepPos);
- inflectedBuffer.put(ba, 0, sepPos);
- inflectedBuffer.flip();
-
- inflectedCharBuffer = bytesToChars(inflectedBuffer, inflectedCharBuffer);
- entry.wordBuffer = inflectedBuffer;
- entry.wordCharSequence = inflectedCharBuffer;
-
- temp.clear();
- temp = BufferUtils.ensureCapacity(temp, bbSize - sepPos);
- sepPos++;
- temp.put(ba, sepPos, bbSize - sepPos);
- temp.flip();
-
- ba = temp.array();
- bbSize = temp.remaining();
-
- /*
- * Find the next separator byte's position splitting word form and tag.
- */
- sepPos = 0;
- for (; sepPos < bbSize; sepPos++) {
- if (ba[sepPos] == separator)
- break;
- }
-
- /*
- * Decode the stem into stem buffer.
- */
- entry.stemBuffer.clear();
- if (decodeStems) {
- entry.stemBuffer = DictionaryLookup.decodeStem(entry.stemBuffer,
- ba, sepPos, inflectedBuffer, dictionaryMetadata);
- } else {
- entry.stemBuffer = BufferUtils.ensureCapacity(entry.stemBuffer,
- sepPos);
- entry.stemBuffer.put(ba, 0, sepPos);
- }
- entry.stemBuffer.flip();
-
- // Skip separator character, if present.
- if (sepPos + 1 <= bbSize) {
- sepPos++;
- }
-
- /*
- * Decode the tag data.
- */
- entry.tagBuffer = BufferUtils.ensureCapacity(entry.tagBuffer, bbSize
- - sepPos);
- entry.tagBuffer.clear();
- entry.tagBuffer.put(ba, sepPos, bbSize - sepPos);
- entry.tagBuffer.flip();
-
- return entry;
- }
-
- /**
- * Decode the byte buffer, optionally expanding the char buffer.
- */
- private CharBuffer bytesToChars(ByteBuffer bytes, CharBuffer chars) {
- chars.clear();
- final int maxCapacity = (int) (bytes.remaining() * decoder
- .maxCharsPerByte());
- if (chars.capacity() <= maxCapacity) {
- chars = CharBuffer.allocate(maxCapacity);
- }
-
- bytes.mark();
- decoder.reset();
- decoder.decode(bytes, chars, true);
- chars.flip();
- bytes.reset();
-
- return chars;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
-}
diff --git a/src/morfologik/stemming/DictionaryLookup.java b/src/morfologik/stemming/DictionaryLookup.java
deleted file mode 100644
index ac90107..0000000
--- a/src/morfologik/stemming/DictionaryLookup.java
+++ /dev/null
@@ -1,355 +0,0 @@
-package morfologik.stemming;
-
-import static morfologik.fsa.MatchResult.*;
-
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.*;
-import java.util.*;
-
-import morfologik.fsa.*;
-import morfologik.util.BufferUtils;
-
-/**
- * This class implements a dictionary lookup over an FSA dictionary. The
- * dictionary for this class should be prepared from a text file using Jan
- * Daciuk's FSA package (see link below).
- *
- * <p>
- * <b>Important:</b> finite state automatons in Jan Daciuk's implementation use
- * <em>bytes</em> not unicode characters. Therefore objects of this class always
- * have to be constructed with an encoding used to convert Java strings to byte
- * arrays and the other way around. You <b>can</b> use UTF-8 encoding, as it
- * should not conflict with any control sequences and separator characters.
- *
- * @see <a href="http://www.eti.pg.gda.pl/~jandac/fsa.html">FSA package Web
- * site</a>
- */
-public final class DictionaryLookup implements IStemmer, Iterable<WordData> {
- /** An FSA used for lookups. */
- private final FSATraversal matcher;
-
- /** An iterator for walking along the final states of {@link #fsa}. */
- private final FSAFinalStatesIterator finalStatesIterator;
-
- /** FSA's root node. */
- private final int rootNode;
-
- /** Expand buffers and arrays by this constant. */
- private final static int EXPAND_SIZE = 10;
-
- /** Private internal array of reusable word data objects. */
- private WordData[] forms = new WordData[0];
-
- /** A "view" over an array implementing */
- private ArrayViewList<WordData> formsList = new ArrayViewList<WordData>(
- forms, 0, forms.length);
-
- /**
- * Features of the compiled dictionary.
- *
- * @see DictionaryMetadata
- */
- private final DictionaryMetadata dictionaryMetadata;
-
- /**
- * Charset encoder for the FSA.
- */
- private final CharsetEncoder encoder;
-
- /**
- * Charset decoder for the FSA.
- */
- private final CharsetDecoder decoder;
-
- /**
- * The FSA we are using.
- */
- private final FSA fsa;
-
- /**
- * Internal reusable buffer for encoding words into byte arrays using
- * {@link #encoder}.
- */
- private ByteBuffer byteBuffer = ByteBuffer.allocate(0);
-
- /**
- * Internal reusable buffer for encoding words into byte arrays using
- * {@link #encoder}.
- */
- private CharBuffer charBuffer = CharBuffer.allocate(0);
-
- /**
- * Reusable match result.
- */
- private final MatchResult matchResult = new MatchResult();
-
- /**
- * The {@link Dictionary} this lookup is using.
- */
- private final Dictionary dictionary;
-
- /**
- * <p>
- * Creates a new object of this class using the given FSA for word lookups
- * and encoding for converting characters to bytes.
- *
- * @throws IllegalArgumentException
- * if FSA's root node cannot be acquired (dictionary is empty).
- */
- public DictionaryLookup(Dictionary dictionary)
- throws IllegalArgumentException {
- this.dictionary = dictionary;
- this.dictionaryMetadata = dictionary.metadata;
- this.rootNode = dictionary.fsa.getRootNode();
- this.fsa = dictionary.fsa;
- this.matcher = new FSATraversal(fsa);
- this.finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode());
-
- if (rootNode == 0) {
- throw new IllegalArgumentException(
- "Dictionary must have at least the root node.");
- }
-
- if (dictionaryMetadata == null) {
- throw new IllegalArgumentException(
- "Dictionary metadata must not be null.");
- }
-
- try {
- Charset charset = Charset.forName(dictionaryMetadata.encoding);
- encoder = charset.newEncoder();
- decoder = charset.newDecoder().onMalformedInput(
- CodingErrorAction.REPORT).onUnmappableCharacter(
- CodingErrorAction.REPORT);
- } catch (UnsupportedCharsetException e) {
- throw new RuntimeException(
- "FSA's encoding charset is not supported: "
- + dictionaryMetadata.encoding);
- }
- }
-
- /**
- * Searches the automaton for a symbol sequence equal to <code>word</code>,
- * followed by a separator. The result is a stem (decompressed accordingly
- * to the dictionary's specification) and an optional tag data.
- */
- public List<WordData> lookup(CharSequence word) {
- final byte separator = dictionaryMetadata.separator;
-
- // Encode word characters into bytes in the same encoding as the FSA's.
- charBuffer.clear();
- charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
- for (int i = 0; i < word.length(); i++)
- charBuffer.put(word.charAt(i));
- charBuffer.flip();
- byteBuffer = charsToBytes(charBuffer, byteBuffer);
-
- // Try to find a partial match in the dictionary.
- final MatchResult match = matcher.match(matchResult, byteBuffer
- .array(), 0, byteBuffer.remaining(), rootNode);
-
- if (match.kind == SEQUENCE_IS_A_PREFIX) {
- /*
- * The entire sequence exists in the dictionary. A separator should
- * be the next symbol.
- */
- final int arc = fsa.getArc(match.node, separator);
-
- /*
- * The situation when the arc points to a final node should NEVER
- * happen. After all, we want the word to have SOME base form.
- */
- if (arc != 0 && !fsa.isArcFinal(arc)) {
- // There is such a word in the dictionary. Return its base forms.
- int formsCount = 0;
-
- finalStatesIterator.restartFrom(fsa.getEndNode(arc));
- while (finalStatesIterator.hasNext()) {
- final ByteBuffer bb = finalStatesIterator.next();
- final byte[] ba = bb.array();
- final int bbSize = bb.remaining();
-
- if (formsCount >= forms.length) {
- forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE);
- for (int k = 0; k < forms.length; k++) {
- if (forms[k] == null)
- forms[k] = new WordData(decoder);
- }
- }
-
- /*
- * Now, expand the prefix/ suffix 'compression' and store
- * the base form.
- */
- final WordData wordData = forms[formsCount++];
- wordData.reset();
-
- wordData.wordBuffer = byteBuffer;
- wordData.wordCharSequence = word;
-
- /*
- * Find the separator byte's position splitting word form
- * and tag.
- */
- int sepPos;
- for (sepPos = 0; sepPos < bbSize; sepPos++) {
- if (ba[sepPos] == separator)
- break;
- }
-
- /*
- * Decode the stem into stem buffer.
- */
- wordData.stemBuffer.clear();
- wordData.stemBuffer = decodeStem(wordData.stemBuffer, ba,
- sepPos, byteBuffer, dictionaryMetadata);
- wordData.stemBuffer.flip();
-
- // Skip separator character.
- sepPos++;
-
- /*
- * Decode the tag data.
- */
- wordData.tagBuffer = BufferUtils.ensureCapacity(
- wordData.tagBuffer, bbSize - sepPos);
- wordData.tagBuffer.clear();
- wordData.tagBuffer.put(ba, sepPos, bbSize - sepPos);
- wordData.tagBuffer.flip();
- }
-
- formsList.wrap(forms, 0, formsCount);
- return formsList;
- }
- } else {
- /*
- * this case is somewhat confusing: we should have hit the separator
- * first... I don't really know how to deal with it at the time
- * being.
- */
- }
-
- return Collections.emptyList();
- }
-
- /**
- * Decode the base form of an inflected word and save its decoded form into
- * a byte buffer.
- *
- * @param bb
- * The byte buffer to save the result to. A new buffer may be
- * allocated if the capacity of <code>bb</code> is not large
- * enough to store the result. The buffer is not flipped upon
- * return.
- *
- * @param inflectedBuffer
- * Inflected form's bytes (decoded properly).
- *
- * @param bytes
- * Bytes of the encoded base form, starting at 0 index.
- *
- * @param len
- * Length of the encode base form.
- *
- * @return Returns either <code>bb</code> or a new buffer whose capacity is
- * large enough to store the output of the decoded data.
- */
- public static ByteBuffer decodeStem(ByteBuffer bb, byte[] bytes, int len,
- ByteBuffer inflectedBuffer, DictionaryMetadata metadata) {
- bb.clear();
-
- // Empty length? Weird, but return an empty buffer.
- if (len == 0) {
- return bb;
- }
-
- // Determine inflected string's length in bytes, in the same encoding.
- final byte[] infBytes = inflectedBuffer.array();
- final int infLen = inflectedBuffer.remaining();
- final int code0 = bytes[0] - 'A';
-
- final boolean fsaPrefixes = metadata.usesPrefixes;
- final boolean fsaInfixes = metadata.usesInfixes;
-
- // Increase buffer size, if needed.
- if (bb.capacity() < infLen + len) {
- bb = ByteBuffer.allocate(infLen + len);
- }
-
- if (code0 >= 0) {
- if (!fsaPrefixes && !fsaInfixes) {
- if (code0 <= infLen) {
- bb.put(infBytes, 0, infLen - code0);
- bb.put(bytes, 1, len - 1);
- return bb;
- }
- } else if (fsaPrefixes && !fsaInfixes) {
- if (len > 1) {
- final int stripAtEnd = bytes[1] - 'A' + code0;
- if (stripAtEnd <= infLen) {
- bb.put(infBytes, code0, infLen - stripAtEnd);
- bb.put(bytes, 2, len - 2);
- return bb;
- }
- }
- } else if (fsaInfixes) {
- // Note: Prefixes are silently assumed here.
- if (len > 2) {
- final int stripAtBeginning = bytes[1] - 'A' + code0;
- final int stripAtEnd = bytes[2] - 'A' + stripAtBeginning;
- if (stripAtEnd <= infLen) {
- bb.put(infBytes, 0, code0);
- bb.put(infBytes, stripAtBeginning, infLen - stripAtEnd);
- bb.put(bytes, 3, len - 3);
- return bb;
- }
- }
- }
- }
-
- /*
- * This is a fallback in case some junk is detected above. Return the
- * base form only if this is the case.
- */
- bb.clear();
- bb.put(bytes, 0, len);
- return bb;
- }
-
- /**
- * Encode a character sequence into a byte buffer, optionally expanding
- * buffer.
- */
- private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) {
- bytes.clear();
- final int maxCapacity = (int) (chars.remaining() * encoder
- .maxBytesPerChar());
- if (bytes.capacity() <= maxCapacity) {
- bytes = ByteBuffer.allocate(maxCapacity);
- }
-
- chars.mark();
- encoder.reset();
- encoder.encode(chars, bytes, true);
- bytes.flip();
- chars.reset();
-
- return bytes;
- }
-
- /**
- * Return an iterator over all {@link WordData} entries available in the
- * embedded {@link Dictionary}.
- */
- public Iterator<WordData> iterator() {
- return new DictionaryIterator(dictionary, decoder, true);
- }
-
- /**
- * @return Return the {@link Dictionary} used by this object.
- */
- public Dictionary getDictionary() {
- return dictionary;
- }
-}
diff --git a/src/morfologik/stemming/DictionaryMetadata.java b/src/morfologik/stemming/DictionaryMetadata.java
deleted file mode 100644
index ce5e507..0000000
--- a/src/morfologik/stemming/DictionaryMetadata.java
+++ /dev/null
@@ -1,122 +0,0 @@
-package morfologik.stemming;
-
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.util.*;
-
-/**
- * Description of attributes, their types and default values.
- *
- * @see Dictionary
- */
-public final class DictionaryMetadata {
- /**
- * Attribute name for {@link #separator}.
- */
- public final static String ATTR_NAME_SEPARATOR = "fsa.dict.separator";
-
- /**
- * Attribute name for {@link #encoding}.
- */
- public final static String ATTR_NAME_ENCODING = "fsa.dict.encoding";
-
- /**
- * Attribute name for {@link #usesPrefixes}.
- */
- public final static String ATTR_NAME_USES_PREFIXES = "fsa.dict.uses-prefixes";
-
- /**
- * Attribute name for {@link #usesInfixes}.
- */
- public final static String ATTR_NAME_USES_INFIXES = "fsa.dict.uses-infixes";
-
- /**
- * A separator character between fields (stem, lemma, form). The character
- * must be within byte range (FSA uses bytes internally).
- */
- public final byte separator;
-
- /**
- * Encoding used for converting bytes to characters and vice versa.
- */
- public final String encoding;
-
- /**
- * True if the dictionary was compiled with prefix compression.
- */
- public final boolean usesPrefixes;
-
- /**
- * True if the dictionary was compiled with infix compression.
- */
- public final boolean usesInfixes;
-
- /**
- * Other meta data not included above.
- */
- public final Map<String, String> metadata;
-
- /**
- * Creates an immutable instance of {@link DictionaryMetadata}.
- */
- public DictionaryMetadata(char separator, String encoding,
- boolean usesPrefixes, boolean usesInfixes,
- Map<String, String> metadata) {
- this.encoding = encoding;
- this.usesPrefixes = usesPrefixes;
- this.usesInfixes = usesInfixes;
-
- try {
- final byte[] separatorBytes = new String(new char[] { separator })
- .getBytes(encoding);
- if (separatorBytes.length != 1) {
- throw new RuntimeException(
- "Separator character '"
- + separator
- + "' must be a single byte after transformation with encoding: "
- + encoding);
- }
- this.separator = separatorBytes[0];
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException("Encoding not supported on this VM: "
- + encoding);
- }
-
- this.metadata = Collections
- .unmodifiableMap(new HashMap<String, String>(metadata));
- }
-
- /**
- * Converts attributes in a {@link Map} to an instance of {@link Dictionary}
- * , validating attribute values.
- */
- static DictionaryMetadata fromMap(Properties properties) throws IOException {
- final String separator = properties.getProperty(ATTR_NAME_SEPARATOR);
- if (separator == null || separator.length() != 1) {
- throw new IOException("Attribute " + ATTR_NAME_SEPARATOR
- + " must be " + "a single character.");
- }
-
- final String encoding = properties.getProperty(ATTR_NAME_ENCODING);
- if (encoding == null || encoding.length() == 0) {
- throw new IOException("Attribute " + ATTR_NAME_ENCODING
- + " must be " + "present and non-empty.");
- }
-
- final boolean usesPrefixes = Boolean.valueOf(
- properties.getProperty(ATTR_NAME_USES_PREFIXES, "false"))
- .booleanValue();
-
- final boolean usesInfixes = Boolean.valueOf(
- properties.getProperty(ATTR_NAME_USES_INFIXES, "false"))
- .booleanValue();
-
- final HashMap<String, String> metadata = new HashMap<String, String>();
- for (Map.Entry<Object, Object> e : properties.entrySet()) {
- metadata.put(e.getKey().toString(), e.getValue().toString());
- }
-
- return new DictionaryMetadata(separator.charAt(0), encoding,
- usesPrefixes, usesInfixes, metadata);
- }
-}
diff --git a/src/morfologik/stemming/IStemmer.java b/src/morfologik/stemming/IStemmer.java
deleted file mode 100644
index 6e59526..0000000
--- a/src/morfologik/stemming/IStemmer.java
+++ /dev/null
@@ -1,20 +0,0 @@
-package morfologik.stemming;
-
-import java.util.List;
-
-/**
- * A generic &quot;stemmer&quot; interface in Morfologik.
- */
-public interface IStemmer {
- /**
- * Returns a list of {@link WordData} entries for a given word. The returned
- * list is never <code>null</code>. Depending on the stemmer's
- * implementation the {@link WordData} may carry the stem and additional
- * information (tag) or just the stem.
- * <p>
- * The returned list and any object it contains are not usable after a
- * subsequent call to this method. Any data that should be stored in between
- * must be copied by the caller.
- */
- public List<WordData> lookup(CharSequence word);
-}
diff --git a/src/morfologik/stemming/PolishStemmer.java b/src/morfologik/stemming/PolishStemmer.java
deleted file mode 100644
index 299f76a..0000000
--- a/src/morfologik/stemming/PolishStemmer.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package morfologik.stemming;
-
-import java.util.Iterator;
-import java.util.List;
-
-/**
- * A dictionary-based stemmer for the Polish language. This stemmer requires an
- * FSA-compiled dictionary to be present in classpath resources.
- *
- * <b>Objects of this class are not thread safe.</b>
- *
- * @see morfologik.stemming.DictionaryLookup
- */
-public final class PolishStemmer implements IStemmer, Iterable<WordData> {
- /**
- * Dictionary lookup delegate.
- */
- private final DictionaryLookup delegate;
-
- /**
- * This constructor is initialized with a built-in dictionary or fails with
- * a runtime exception if the dictionary is not available.
- */
- public PolishStemmer() {
- final String languageCode = "pl";
-
- this.delegate = new DictionaryLookup(Dictionary.getForLanguage(languageCode));
- }
-
- /**
- * {@inheritDoc}
- */
- public List<WordData> lookup(CharSequence word) {
- return delegate.lookup(word);
- }
-
- /**
- * Iterates over all dictionary forms stored in this stemmer.
- */
- public Iterator<WordData> iterator() {
- return delegate.iterator();
- }
-}
diff --git a/src/morfologik/stemming/WordData.java b/src/morfologik/stemming/WordData.java
deleted file mode 100644
index 4341bc4..0000000
--- a/src/morfologik/stemming/WordData.java
+++ /dev/null
@@ -1,247 +0,0 @@
-package morfologik.stemming;
-
-import java.io.UnsupportedEncodingException;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.*;
-
-import morfologik.util.BufferUtils;
-
-/**
- * Stem and tag data associated with a given word.
- *
- * <p>
- * <b>Important notes:</b>
- * <ul>
- * <li>Objects of this class are <i>volatile</i> (their content changes on
- * subsequent calls to {@link DictionaryLookup} class. If you need a copy of the
- * stem or tag data for a given word, you have to create a custom buffer
- * yourself and copy the associated data, perform {@link #clone()} or create
- * strings (they are immutable) using {@link #getStem()} and then
- * {@link CharSequence#toString()}.</li>
- * <li>Objects of this class must not be used in any Java collections. In fact
- * both equals and hashCode methods are overridden and throw exceptions to
- * prevent accidental damage.</li>
- * </ul>
- */
-public final class WordData implements Cloneable {
- /**
- * Error information if somebody puts us in a Java collection.
- */
- private static final String COLLECTIONS_ERROR_MESSAGE = "Not suitable for use"
- + " in Java collections framework (volatile content). Refer to documentation.";
-
- /** Character encoding in internal buffers. */
- private final CharsetDecoder decoder;
-
- /**
- * Inflected word form data.
- */
- CharSequence wordCharSequence;
-
- /**
- * Character sequence after converting {@link #stemBuffer} using
- * {@link #decoder}.
- */
- private CharBuffer stemCharSequence;
-
- /**
- * Character sequence after converting {@link #tagBuffer} using
- * {@link #decoder}.
- */
- private CharBuffer tagCharSequence;
-
- /** Byte buffer holding the inflected word form data. */
- ByteBuffer wordBuffer;
-
- /** Byte buffer holding stem data. */
- ByteBuffer stemBuffer;
-
- /** Byte buffer holding tag data. */
- ByteBuffer tagBuffer;
-
- /**
- * Package scope constructor.
- */
- WordData(CharsetDecoder decoder) {
- this.decoder = decoder;
-
- stemBuffer = ByteBuffer.allocate(0);
- tagBuffer = ByteBuffer.allocate(0);
- stemCharSequence = CharBuffer.allocate(0);
- tagCharSequence = CharBuffer.allocate(0);
- }
-
- /**
- * A constructor for tests only.
- */
- WordData(String stem, String tag, String encoding) {
- this(Charset.forName(encoding).newDecoder());
-
- try {
- if (stem != null)
- stemBuffer.put(stem.getBytes(encoding));
- if (tag != null)
- tagBuffer.put(tag.getBytes(encoding));
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Copy the stem's binary data (no charset decoding) to a custom byte
- * buffer. If the buffer is null or not large enough to hold the result, a
- * new buffer is allocated.
- *
- * @param target
- * Target byte buffer to copy the stem buffer to or
- * <code>null</code> if a new buffer should be allocated.
- *
- * @return Returns <code>target</code> or the new reallocated buffer.
- */
- public ByteBuffer getStemBytes(ByteBuffer target) {
- target = BufferUtils.ensureCapacity(target, stemBuffer.remaining());
- stemBuffer.mark();
- target.put(stemBuffer);
- stemBuffer.reset();
- target.flip();
- return target;
- }
-
- /**
- * Copy the tag's binary data (no charset decoding) to a custom byte buffer.
- * If the buffer is null or not large enough to hold the result, a new
- * buffer is allocated.
- *
- * @param target
- * Target byte buffer to copy the tag buffer to or
- * <code>null</code> if a new buffer should be allocated.
- *
- * @return Returns <code>target</code> or the new reallocated buffer.
- */
- public ByteBuffer getTagBytes(ByteBuffer target) {
- target = BufferUtils.ensureCapacity(target, tagBuffer.remaining());
- tagBuffer.mark();
- target.put(tagBuffer);
- tagBuffer.reset();
- target.flip();
- return target;
- }
-
- /**
- * Copy the inflected word's binary data (no charset decoding) to a custom
- * byte buffer. If the buffer is null or not large enough to hold the
- * result, a new buffer is allocated.
- *
- * @param target
- * Target byte buffer to copy the word buffer to or
- * <code>null</code> if a new buffer should be allocated.
- *
- * @return Returns <code>target</code> or the new reallocated buffer.
- */
- public ByteBuffer getWordBytes(ByteBuffer target) {
- target = BufferUtils.ensureCapacity(target, wordBuffer.remaining());
- wordBuffer.mark();
- target.put(wordBuffer);
- wordBuffer.reset();
- target.flip();
- return target;
- }
-
- /**
- * @return Return tag data decoded to a character sequence or
- * <code>null</code> if no associated tag data exists.
- */
- public CharSequence getTag() {
- tagCharSequence = decode(tagBuffer, tagCharSequence);
- return tagCharSequence.remaining() == 0 ? null : tagCharSequence;
- }
-
- /**
- * @return Return stem data decoded to a character sequence or
- * <code>null</code> if no associated stem data exists.
- */
- public CharSequence getStem() {
- stemCharSequence = decode(stemBuffer, stemCharSequence);
- return stemCharSequence.remaining() == 0 ? null : stemCharSequence;
- }
-
- /**
- * @return Return inflected word form data. Usually the parameter passed to
- * {@link DictionaryLookup#lookup(CharSequence)}.
- */
- public CharSequence getWord() {
- return wordCharSequence;
- }
-
- /*
- *
- */
- @Override
- public boolean equals(Object obj) {
- throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE);
- }
-
- /*
- *
- */
- @Override
- public int hashCode() {
- throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE);
- }
-
- /**
- * Declare a covariant of {@link Object#clone()} that returns a deep copy of
- * this object. The content of all internal buffers is copied.
- */
- @Override
- protected WordData clone() {
- final WordData clone = new WordData(this.decoder);
- clone.wordCharSequence = cloneCharSequence(wordCharSequence);
- clone.wordBuffer = getWordBytes(null);
- clone.stemBuffer = getStemBytes(null);
- clone.tagBuffer = getTagBytes(null);
- return clone;
- }
-
- /**
- * Clone char sequences only if not immutable.
- */
- private CharSequence cloneCharSequence(CharSequence chs) {
- if (chs instanceof String)
- return chs;
- return chs.toString();
- }
-
- /**
- * Reset internal structures for storing another word's data.
- */
- void reset() {
- this.wordCharSequence = null;
- this.wordBuffer = null;
- this.stemCharSequence.clear();
- this.tagCharSequence.clear();
- this.stemBuffer.clear();
- this.tagBuffer.clear();
- }
-
- /**
- * Decode byte buffer, optionally expanding the char buffer to.
- */
- private CharBuffer decode(ByteBuffer bytes, CharBuffer chars) {
- chars.clear();
- final int maxCapacity = (int) (bytes.remaining() * decoder
- .maxCharsPerByte());
- if (chars.capacity() <= maxCapacity) {
- chars = CharBuffer.allocate(maxCapacity);
- }
-
- bytes.mark();
- decoder.reset();
- decoder.decode(bytes, chars, true);
- chars.flip();
- bytes.reset();
-
- return chars;
- }
-}
diff --git a/src/morfologik/tools/FSABuildTool.java b/src/morfologik/tools/FSABuildTool.java
deleted file mode 100644
index 17ff102..0000000
--- a/src/morfologik/tools/FSABuildTool.java
+++ /dev/null
@@ -1,486 +0,0 @@
-package morfologik.tools;
-
-import java.io.*;
-import java.util.*;
-
-import morfologik.fsa.*;
-
-import org.apache.commons.cli.*;
-
-import com.carrotsearch.hppc.IntIntOpenHashMap;
-import com.carrotsearch.hppc.cursors.IntIntCursor;
-
-/**
- * Convert from plain text input to a serialized FSA in any of the
- * available {@link Format}s.
- */
-public final class FSABuildTool extends Tool {
- /**
- * One megabyte.
- */
- private final static int MB = 1024 * 1024;
-
- /**
- * The serialization format to use for the binary output.
- */
- public enum Format {
- FSA5,
- CFSA2;
-
- public FSASerializer getSerializer() {
- switch (this) {
- case FSA5:
- return new FSA5Serializer();
-
- case CFSA2:
- return new CFSA2Serializer();
-
- default:
- throw new RuntimeException();
- }
- }
- }
-
- /**
- * Be more verbose about progress.
- */
- private boolean printProgress;
-
- /**
- * Serializer used for emitting the FSA.
- */
- private FSASerializer serializer;
-
- /**
- * Output format name.
- */
- private Format format;
-
- /**
- * Warn about CR characters in the input (usually not what you want).
- */
- private boolean crWarning = false;
-
- /**
- * If <code>true</code>, the input is not buffered and sorted in-memory, but
- * must be sorted externally (using the "C" convention: unsigned byte values).
- */
- private boolean inputSorted;
-
- /**
- * Print additional statistics about the output automaton.
- */
- private boolean statistics;
-
- /**
- * The actual construction of the FSA.
- */
- private FSABuilder builder = new FSABuilder();
-
- /**
- * Start time.
- */
- private long start = System.currentTimeMillis();
-
- private IMessageLogger logger;
-
- /**
- * Gets fed with the lines read from the input.
- */
- private static interface LineConsumer {
- /**
- * Process the buffer, return the same buffer or a new buffer (for
- * swapping).
- */
- byte[] process(byte[] buffer, int pos);
- }
-
- /**
- * To help break out of the anonymous delegate on error.
- */
- @SuppressWarnings("serial")
- private static class TerminateProgramException extends RuntimeException {
- public TerminateProgramException(String msg) {
- super(msg);
- }
-
- public synchronized Throwable fillInStackTrace() {
- return null;
- }
- }
-
- /**
- * Command line entry point after parsing arguments.
- */
- protected void go(CommandLine line) throws Exception {
- String[] args = line.getArgs();
- if (args.length != 0) {
- printUsage();
- return;
- }
-
- // Parse the input options.
- parseOptions(line);
-
- logger = new WriterMessageLogger(new PrintWriter(System.err));
- this.serializer.withLogger(logger);
-
- try {
- InputStream inputStream = initializeInput(line);
-
- if (inputSorted) {
- logger.log("Assuming input is already sorted");
- }
-
- final FSA fsa;
- if (inputSorted) {
- fsa = processSortedInput(inputStream);
- } else {
- fsa = processUnsortedInput(inputStream);
- }
- if (crWarning) logger.log("Warning: input contained carriage returns?");
-
- if (statistics) {
- logger.startPart("Statistics");
- FSAInfo info = new FSAInfo(fsa);
- TreeMap<Integer, Integer> fanout = FSAUtils.calculateFanOuts(fsa, fsa.getRootNode());
- logger.endPart();
-
- final IntIntOpenHashMap numbers = new IntIntOpenHashMap();
- fsa.visitInPostOrder(new StateVisitor() {
- public boolean accept(int state) {
- int thisNodeNumber = 0;
- for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
- thisNodeNumber +=
- (fsa.isArcFinal(arc) ? 1 : 0) +
- (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc)));
- }
- numbers.put(state, thisNodeNumber);
- return true;
- }
- });
-
- int singleRLC = 0;
- for (IntIntCursor c : numbers) {
- if (c.value == 1) singleRLC++;
- }
-
- logger.log("Nodes", info.nodeCount);
- logger.log("Arcs", info.arcsCount);
- logger.log("Tail nodes", singleRLC);
-
- logger.log("States with the given # of outgoing arcs:");
- for (Map.Entry<Integer, Integer> e : fanout.entrySet()) {
- logger.log(" #" + e.getKey(), e.getValue());
- }
-
- logger.log("FSA builder properties:");
- for (Map.Entry<FSABuilder.InfoEntry, Object> e : builder.getInfo().entrySet()) {
- logger.log(e.getKey().toString(), e.getValue());
- }
- }
-
- // Save the result.
- logger.startPart("Serializing " + format);
- serializer.serialize(fsa, initializeOutput(line)).close();
- logger.endPart();
- } catch (OutOfMemoryError e) {
- logger.log("Error: Out of memory. Pass -Xmx1024m argument (or more) to java.");
- }
- }
-
- /**
- * Process unsorted input (sort and construct FSA).
- */
- private FSA processUnsortedInput(InputStream inputStream)
- throws IOException {
- final FSA root;
- logger.startPart("Reading input");
- final ArrayList<byte[]> input = readInput(inputStream);
- logger.endPart();
-
- logger.log("Input sequences", input.size());
-
- logger.startPart("Sorting");
- Collections.sort(input, FSABuilder.LEXICAL_ORDERING);
- logger.endPart();
-
- logger.startPart("Building FSA");
- for (byte [] bb : input)
- builder.add(bb, 0, bb.length);
- root = builder.complete();
- logger.endPart();
- return root;
- }
-
- /**
- *
- */
- private FSA processSortedInput(InputStream inputStream)
- throws IOException {
-
- int lines = forAllLines(inputStream, new LineConsumer() {
- private byte [] current;
- private byte [] previous = null;
- private int previousLen;
- private int line;
-
- public byte[] process(byte[] current, int currentLen) {
- line++;
-
- // Verify the order.
- if (previous != null) {
- if (FSABuilder.compare(previous, 0, previousLen, current, 0, currentLen) > 0) {
- logger.log("\n\nERROR: The input is not sorted: \n" +
- dumpLine(previous, previousLen) + "\n" +
- dumpLine(current, currentLen));
- throw new TerminateProgramException("Input is not sorted.");
- }
- }
-
- // Add to the automaton.
- builder.add(current, 0, currentLen);
-
- // Swap buffers.
- this.current = previous != null ? previous : new byte [current.length];
- this.previous = current;
- this.previousLen = currentLen;
-
- return this.current;
- }
- });
-
- logger.startPart("Building FSA");
- FSA fsa = builder.complete();
- logger.endPart();
- logger.log("Input sequences", lines);
-
- return fsa;
- }
-
- /**
- * Dump input line, byte-by-byte.
- */
- protected String dumpLine(byte[] line, int length) {
- StringBuilder builder = new StringBuilder();
- for (int i = 0; i < length; i++) {
- if (i > 0) builder.append(" ");
- builder.append(String.format("%02x", line[i]));
- }
- builder.append(" | ");
- for (int i = 0; i < length; i++) {
- if (Character.isLetterOrDigit(line[i]))
- builder.append((char) line[i]);
- else
- builder.append(".");
- }
- return builder.toString();
- }
-
- /**
- * Parse input options.
- */
- private void parseOptions(CommandLine line) {
- String opt;
-
- opt = SharedOptions.outputFormatOption.getOpt();
- if (line.hasOption(opt)) {
- String formatValue = line.getOptionValue(opt);
- try {
- format = Format.valueOf(formatValue.toUpperCase());
- } catch (IllegalArgumentException e) {
- throw new TerminateProgramException("Not a valid format: "
- + formatValue);
- }
- } else {
- format = Format.FSA5;
- }
- serializer = format.getSerializer();
-
- opt = SharedOptions.fillerCharacterOption.getLongOpt();
- if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) {
- String chr = line.getOptionValue(opt);
- checkSingleByte(chr);
- serializer.withFiller(chr.getBytes()[0]);
- }
-
- opt = SharedOptions.annotationSeparatorCharacterOption.getLongOpt();
- if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) {
- String chr = line.getOptionValue(opt);
- checkSingleByte(chr);
- serializer.withAnnotationSeparator(chr.getBytes()[0]);
- }
-
- opt = SharedOptions.withNumbersOption.getOpt();
- if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.NUMBERS)) {
- serializer.withNumbers();
- }
-
- opt = SharedOptions.progressOption.getLongOpt();
- if (line.hasOption(opt)) {
- printProgress = true;
- }
-
- opt = SharedOptions.inputSortedOption.getLongOpt();
- if (line.hasOption(opt)) {
- inputSorted = true;
- }
-
- opt = SharedOptions.statistics.getLongOpt();
- if (line.hasOption(opt)) {
- statistics = true;
- }
- }
-
- private boolean requiredCapability(String opt, FSAFlags flag) {
- if (!serializer.getFlags().contains(flag)) {
- throw new RuntimeException("This serializer does not support option: " + opt);
- }
- return true;
- }
-
- /**
- * Check if the argument is a single byte after conversion using platform-default
- * encoding.
- */
- public static void checkSingleByte(String chr) {
- if (chr.getBytes().length == 1)
- return;
-
- throw new IllegalArgumentException("Filler and annotation characters must be single" +
- "-byte values, " + chr + " has " + chr.getBytes().length + " bytes.");
- }
-
- /**
- * Read all the input lines, unsorted.
- */
- private ArrayList<byte[]> readInput(InputStream is) throws IOException {
- final ArrayList<byte[]> result = new ArrayList<byte[]>();
- forAllLines(is, new LineConsumer() {
- public byte[] process(byte[] buffer, int pos) {
- result.add(java.util.Arrays.copyOf(buffer, pos));
- return buffer;
- }
- });
- return result;
- }
-
- /**
- * Apply line consumer to all non-empty lines.
- */
- private int forAllLines(InputStream is, LineConsumer lineConsumer) throws IOException {
- int lines = 0;
- byte[] buffer = new byte[0];
- int line = 0, b, pos = 0;
- while ((b = is.read()) != -1) {
- if (b == '\r' && !crWarning) {
- crWarning = true;
- }
-
- if (b == '\n') {
- if (pos > 0) {
- buffer = lineConsumer.process(buffer, pos);
- pos = 0;
- lines++;
- }
-
- if (printProgress && line++ > 0 && (line % 1000000) == 0) {
- logger.log(String.format(Locale.ENGLISH, "%6.2fs, sequences: %d", elapsedTime(), line));
- }
- } else {
- if (pos >= buffer.length) {
- buffer = java.util.Arrays.copyOf(buffer, buffer.length + 10);
- }
- buffer[pos++] = (byte) b;
- }
- }
-
- if (pos > 0) {
- lineConsumer.process(buffer, pos);
- lines++;
- }
-
- return lines;
- }
-
- private double elapsedTime() {
- return (System.currentTimeMillis() - start) / 1000.0d;
- }
-
- @Override
- protected void printUsage() {
- final HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp(this.getClass().getName(), options, true);
- }
-
- @Override
- protected void initializeOptions(Options options) {
- options.addOption(SharedOptions.inputFileOption);
- options.addOption(SharedOptions.outputFileOption);
-
- options.addOption(SharedOptions.outputFormatOption);
-
- options.addOption(SharedOptions.fillerCharacterOption);
- options.addOption(SharedOptions.annotationSeparatorCharacterOption);
-
- options.addOption(SharedOptions.withNumbersOption);
- options.addOption(SharedOptions.progressOption);
-
- options.addOption(SharedOptions.inputSortedOption);
-
- options.addOption(SharedOptions.statistics);
- }
-
- /**
- *
- */
- private static OutputStream initializeOutput(CommandLine line)
- throws IOException, ParseException {
- final OutputStream output;
- final String opt = SharedOptions.outputFileOption.getOpt();
- if (line.hasOption(opt)) {
- // Use output file.
- output = new FileOutputStream((File) line.getParsedOptionValue(opt));
- } else {
- // Use standard output.
- output = System.out;
- }
- return new BufferedOutputStream(output);
- }
-
- /**
- *
- */
- private InputStream initializeInput(CommandLine line)
- throws IOException, ParseException {
- final InputStream input;
- final String opt = SharedOptions.inputFileOption.getOpt();
-
- if (line.hasOption(opt)) {
- // Use input file.
- File inputFile = (File) line.getParsedOptionValue(opt);
- if (!inputSorted && inputFile.length() > 20 * MB) {
- logger.log("WARN: The input file is quite large, avoid\n" +
- " in-memory sorting by piping pre-sorted\n" +
- " input directly to fsa_build. Linux:\n" +
- " export LC_ALL=C && \\\n" +
- " sort input | \\\n" +
- " java -jar morfologik.jar fsa_build --sorted -o dict.fsa");
- }
-
- input = new FileInputStream(inputFile);
- } else {
- // Use standard input.
- input = System.in;
- }
- return new BufferedInputStream(input);
- }
-
- /**
- * Command line entry point.
- */
- public static void main(String[] args) throws Exception {
- final FSABuildTool tool = new FSABuildTool();
- tool.go(args);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/tools/FSADumpTool.java b/src/morfologik/tools/FSADumpTool.java
deleted file mode 100644
index 281d187..0000000
--- a/src/morfologik/tools/FSADumpTool.java
+++ /dev/null
@@ -1,286 +0,0 @@
-package morfologik.tools;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.Locale;
-import java.util.Map;
-
-import morfologik.fsa.*;
-import morfologik.stemming.*;
-import morfologik.util.FileUtils;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.Options;
-
-/**
- * This utility will dump the information and contents of a given {@link FSA}
- * dictionary. It can dump dictionaries in the raw form (as fed to the
- * <code>fsa_build</code> program) or decoding compressed stem forms.
- */
-public final class FSADumpTool extends Tool {
- /**
- * Writer used to print messages and dictionary dump.
- */
- private OutputStream os;
-
- /**
- * Print raw data only, no headers.
- */
- private boolean dataOnly;
-
- /**
- * Decode from prefix/infix/suffix encodings.
- */
- private boolean decode;
-
- /**
- * Dump graphviz DOT file instead of automaton sequences.
- */
- private boolean dot;
-
- /**
- * Command line entry point after parsing arguments.
- */
- protected void go(CommandLine line) throws Exception {
- final File dictionaryFile = (File) line
- .getParsedOptionValue(SharedOptions.fsaDictionaryFileOption
- .getOpt());
-
- dataOnly = line.hasOption(SharedOptions.dataOnly.getOpt());
- decode = line.hasOption(SharedOptions.decode.getOpt());
- dot = line.hasOption(SharedOptions.dot.getLongOpt());
-
- FileUtils.assertExists(dictionaryFile, true, false);
-
- dump(dictionaryFile);
- }
-
- /**
- * Dumps the content of a dictionary to a file.
- */
- private void dump(File dictionaryFile)
- throws UnsupportedEncodingException, IOException {
- final long start = System.currentTimeMillis();
-
- final Dictionary dictionary;
- final FSA fsa;
-
- if (!dictionaryFile.canRead()) {
- printWarning("Dictionary file does not exist: "
- + dictionaryFile.getAbsolutePath());
- return;
- }
-
- this.os = new BufferedOutputStream(System.out, 1024 * 32);
-
- if (hasMetadata(dictionaryFile)) {
- dictionary = Dictionary.read(dictionaryFile);
- fsa = dictionary.fsa;
-
- final String encoding = dictionary.metadata.encoding;
- if (!Charset.isSupported(encoding)) {
- printWarning("Dictionary's charset is not supported "
- + "on this JVM: " + encoding);
- return;
- }
- } else {
- dictionary = null;
- fsa = FSA.read(new FileInputStream(dictionaryFile));
- printWarning("Warning: FSA automaton without metadata file.");
- }
-
- printExtra("FSA properties");
- printExtra("--------------------");
- printExtra("FSA implementation : " + fsa.getClass().getName());
- printExtra("Compiled with flags : " + fsa.getFlags().toString());
-
- if (!dataOnly) {
- final FSAInfo info = new FSAInfo(fsa);
- printExtra("Number of arcs : "
- + info.arcsCount + "/" + info.arcsCountTotal);
- printExtra("Number of nodes : " + info.nodeCount);
- printExtra("Number of final st. : " + info.finalStatesCount);
- printExtra("");
- }
-
- // Separator for dumping.
- char separator = '\t';
-
- if (fsa instanceof FSA5) {
- printExtra("FSA5 properties");
- printExtra("--------------------");
- printFSA5((FSA5) fsa);
- printExtra("");
- }
-
- if (dictionary != null) {
- printExtra("Dictionary metadata");
- printExtra("--------------------");
- printExtra("Encoding : " + dictionary.metadata.encoding);
- printExtra("Separator byte : 0x"
- + Integer.toHexString(dictionary.metadata.separator)
- + " ('" + decodeSeparator(dictionary) + "')");
- printExtra("Uses prefixes : "
- + dictionary.metadata.usesPrefixes);
- printExtra("Uses infixes : "
- + dictionary.metadata.usesInfixes);
- printExtra("");
-
- printExtra("Dictionary metadata (all keys)");
- printExtra("---------------------------------");
-
- for (Map.Entry<String, String> e : dictionary.metadata.metadata
- .entrySet()) {
- printExtra(String
- .format("%-27s : %s", e.getKey(), e.getValue()));
- }
- printExtra("");
- }
-
- int sequences = 0;
- if (decode) {
- if (dictionary == null) {
- printWarning("No dictionary metadata available.");
- return;
- }
-
- printExtra("Decoded FSA data (in the encoding above)");
- printExtra("----------------------------------------");
-
- final DictionaryLookup dl = new DictionaryLookup(dictionary);
- final StringBuilder builder = new StringBuilder();
- final OutputStreamWriter osw = new OutputStreamWriter(os,
- dictionary.metadata.encoding);
-
- CharSequence t;
- for (WordData wd : dl) {
- builder.setLength(0);
- builder.append(wd.getWord());
- builder.append(separator);
-
- t = wd.getStem();
- if (t == null)
- t = "";
- builder.append(t);
- builder.append(separator);
-
- t = wd.getTag();
- if (t == null)
- t = "";
- builder.append(t);
- builder.append('\n');
-
- osw.write(builder.toString());
- sequences++;
- }
- osw.flush();
- } else {
- if (dot) {
- Writer w = new OutputStreamWriter(os);
- FSAUtils.toDot(w, fsa, fsa.getRootNode());
- w.flush();
- } else {
- printExtra("FSA data (raw bytes in the encoding above)");
- printExtra("------------------------------------------");
-
- for (ByteBuffer bb : fsa) {
- os.write(bb.array(), 0, bb.remaining());
- os.write(0x0a);
- sequences++;
- }
- }
- }
-
- printExtra("--------------------");
-
- final long millis = Math.max(1, System.currentTimeMillis() - start);
- printExtra(String
- .format(
- Locale.ENGLISH,
- "Dictionary dumped in %.3f second(s), %d sequences (%d sequences/sec.).",
- millis / 1000.0, sequences,
- (int) (sequences / (millis / 1000.0))));
-
- os.flush();
- }
-
- /**
- * Print {@link FSA5}-specific stuff.
- */
- private void printFSA5(FSA5 fsa) throws IOException {
- printExtra("GTL : " + fsa.gtl);
- printExtra("Node extra data : " + fsa.nodeDataLength);
- printExtra("Annotation separator: " + byteAsChar(fsa.annotation));
- printExtra("Filler character : " + byteAsChar(fsa.filler));
- }
-
- /**
- * Convert a byte to a character, no charset decoding, simple ASCII range mapping.
- */
- private char byteAsChar(byte v) {
- char chr = (char) (v & 0xff);
- if (chr < 127)
- return chr;
- else
- return '?';
- }
-
- /*
- *
- */
- private void printExtra(String msg) throws IOException {
- if (dataOnly)
- return;
- os.write(msg.getBytes());
- os.write(0x0a);
- }
-
- /*
- *
- */
- private void printWarning(String msg) {
- System.err.println(msg);
- }
-
- /*
- *
- */
- private String decodeSeparator(Dictionary dictionary) {
- try {
- return new String(new byte[] { dictionary.metadata.separator },
- dictionary.metadata.encoding);
- } catch (UnsupportedEncodingException e) {
- return "<unsupported encoding: " + dictionary.metadata.encoding
- + ">";
- }
- }
-
- /**
- * Check if there is a metadata file for the given FSA automaton.
- */
- private static boolean hasMetadata(File fsaFile) {
- final File featuresFile = new File(fsaFile.getParent(), Dictionary
- .getExpectedFeaturesName(fsaFile.getName()));
-
- return featuresFile.canRead();
- }
-
- /**
- * Command line options for the tool.
- */
- protected void initializeOptions(Options options) {
- options.addOption(SharedOptions.fsaDictionaryFileOption);
- options.addOption(SharedOptions.dataOnly);
- options.addOption(SharedOptions.decode);
- options.addOption(SharedOptions.dot);
- }
-
- /**
- * Command line entry point.
- */
- public static void main(String[] args) throws Exception {
- final FSADumpTool fsaDump = new FSADumpTool();
- fsaDump.go(args);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/tools/IMessageLogger.java b/src/morfologik/tools/IMessageLogger.java
deleted file mode 100644
index 14f9f00..0000000
--- a/src/morfologik/tools/IMessageLogger.java
+++ /dev/null
@@ -1,25 +0,0 @@
-package morfologik.tools;
-
-public interface IMessageLogger {
-
- /**
- * Log progress to the console.
- */
- public void log(String msg);
-
- /**
- * Log message header and save current time.
- */
- public void startPart(String header);
-
- /**
- *
- */
- public void endPart();
-
- /**
- * Log a two-part message.
- */
- public void log(String header, Object v);
-
-} \ No newline at end of file
diff --git a/src/morfologik/tools/InflectionFramesTool.java b/src/morfologik/tools/InflectionFramesTool.java
deleted file mode 100644
index 612f62c..0000000
--- a/src/morfologik/tools/InflectionFramesTool.java
+++ /dev/null
@@ -1,118 +0,0 @@
-package morfologik.tools;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.charset.*;
-import java.util.*;
-import java.util.Map.Entry;
-
-import morfologik.stemming.*;
-import morfologik.stemming.Dictionary;
-
-import org.junit.Test;
-
-/**
- * Calculate inflection frames from the Polish dictionary.
- */
-public class InflectionFramesTool {
- public static void main(String[] args) throws IOException {
- new InflectionFramesTool().inflectionFrames();
- }
-
- /* */
- @Test
- @SuppressWarnings( { "unused" })
- public void inflectionFrames() throws IOException {
- final Dictionary pl = Dictionary.getForLanguage("pl");
- final DictionaryLookup dict = new DictionaryLookup(pl);
-
- final CharsetDecoder decoder = Charset.forName(pl.metadata.encoding)
- .newDecoder().onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
-
- final HashMap<String, ArrayList<String>> forms =
- new HashMap<String, ArrayList<String>>();
-
- ByteBuffer stemBuffer = ByteBuffer.allocate(0);
- ByteBuffer inflBuffer = ByteBuffer.allocate(0);
- ByteBuffer stemDecoded = ByteBuffer.allocate(0);
-
- int limit = Integer.MAX_VALUE;
-
- final Iterator<WordData> i = new DictionaryIterator(pl, decoder, false);
- while (i.hasNext() && limit-- > 0) {
- final WordData wd = i.next();
-
- final CharSequence inflected = wd.getWord();
- final CharSequence stemEncoded = wd.getStem();
- final CharSequence tag = wd.getTag();
- if (tag == null)
- continue;
-
- inflBuffer.clear();
- inflBuffer = wd.getWordBytes(inflBuffer);
-
- stemBuffer.clear();
- stemBuffer = wd.getStemBytes(stemBuffer);
-
- stemDecoded = DictionaryLookup.decodeStem(stemDecoded, stemBuffer
- .array(), stemBuffer.remaining(), inflBuffer, pl.metadata);
- stemDecoded.flip();
-
- final String stem = decoder.decode(stemDecoded).toString();
- final String form = tag.toString().intern();
-
- ArrayList<String> frames = forms.get(stem);
- if (frames == null) {
- forms.put(stem, frames = new ArrayList<String>());
- }
-
- if (!frames.contains(form)) {
- frames.add(form);
- }
- }
-
- // Sort the forms so that we get a unique key. Then iteratively add them
- // to another hash (by form this time).
- final HashMap<String, ArrayList<String>> frames =
- new HashMap<String, ArrayList<String>>();
-
- StringBuilder key = new StringBuilder();
- for (Map.Entry<String, ArrayList<String>> e : forms.entrySet()) {
- Collections.sort(e.getValue());
-
- key.setLength(0);
- for (String s : e.getValue())
- key.append(s).append(" ");
-
- final String k = key.toString();
- ArrayList<String> words = frames.get(k);
- if (words == null) {
- frames.put(k, words = new ArrayList<String>());
- }
- words.add(e.getKey());
-
- e.setValue(null);
- }
-
- // Print inflection frames.
- ArrayList<Map.Entry<String, ArrayList<String>>> entries =
- new ArrayList<Map.Entry<String, ArrayList<String>>>();
-
- entries.addAll(frames.entrySet());
- Collections.sort(entries,
- new Comparator<Map.Entry<String, ArrayList<String>>>() {
- public int compare(Entry<String, ArrayList<String>> o1,
- Entry<String, ArrayList<String>> o2) {
- return o2.getValue().size() - o1.getValue().size();
- }
- });
-
- for (Map.Entry<String, ArrayList<String>> e : entries) {
- System.out.println(String.format("%6d %s %s",
- e.getValue().size(), e.getKey(), e.getValue()));
- }
-
- System.out.println("Total frames: " + frames.size());
- }
-}
diff --git a/src/morfologik/tools/Launcher.java b/src/morfologik/tools/Launcher.java
deleted file mode 100644
index 667a6e1..0000000
--- a/src/morfologik/tools/Launcher.java
+++ /dev/null
@@ -1,159 +0,0 @@
-package morfologik.tools;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.Method;
-import java.net.URL;
-import java.util.Enumeration;
-import java.util.Iterator;
-import java.util.TreeMap;
-import java.util.jar.Manifest;
-
-import morfologik.util.FileUtils;
-
-/**
- * A launcher for other command-line tools.
- */
-public final class Launcher {
- /**
- * Tool description.
- */
- final static class ToolInfo {
- public final Class<? extends Tool> clazz;
- public final String info;
-
- public ToolInfo(Class<? extends Tool> clazz, String info) {
- this.clazz = clazz;
- this.info = info;
- }
-
- public void invoke(String[] subArgs) throws Exception {
- final Method m = clazz.getMethod("main",
- new Class[] { String[].class });
- m.invoke(null, new Object[] { subArgs });
- }
- }
-
- /**
- * Command line entry point.
- */
- public static void main(String[] args) throws Exception {
- // If so, tools are unavailable and a classpath error has been logged.
- final TreeMap<String, ToolInfo> tools = initTools();
-
- if (tools == null)
- {
- return;
- }
-
- if (args.length == 0) {
- System.out
- .println("Provide tool name and its command-line options. "
- + "Available tools:");
- for (String key : tools.keySet()) {
- final ToolInfo toolInfo = tools.get(key);
- System.out.println(String.format(" %-10s - %s", key,
- toolInfo.info));
- }
- } else {
- final String toolName = args[0];
- if (!tools.containsKey(toolName)) {
- System.out.println("Unknown tool: " + toolName);
- return;
- }
-
- final String[] subArgs = new String[args.length - 1];
- System.arraycopy(args, 1, subArgs, 0, subArgs.length);
-
- final ToolInfo toolInfo = (ToolInfo) tools.get(toolName);
- toolInfo.invoke(subArgs);
- }
- }
-
- /**
- * Initialize and check tools' availability.
- */
- static TreeMap<String, ToolInfo> initTools() {
- TreeMap<String, ToolInfo> tools = new TreeMap<String, ToolInfo>();
-
- tools.put("fsa_build", new ToolInfo(FSABuildTool.class,
- "Create an automaton from plain text files."));
-
- tools.put("fsa_dump", new ToolInfo(FSADumpTool.class,
- "Dump an FSA dictionary."));
-
- tools.put("tab2morph", new ToolInfo(MorphEncodingTool.class,
- "Convert tabbed dictionary to fsa encoding format."));
-
- tools.put("plstem", new ToolInfo(PolishStemmingTool.class,
- "Apply Polish dictionary stemming to the input."));
-
- // Prune unavailable tools.
- for (Iterator<ToolInfo> i = tools.values().iterator(); i.hasNext();) {
- ToolInfo ti = i.next();
- try {
- ti.clazz.newInstance().isAvailable();
- } catch (NoClassDefFoundError e) {
- logJarWarning();
- return null;
- } catch (Throwable e) {
- System.out.println("Tools could not be initialized because" +
- " of an exception during initialization: "
- + e.getClass().getName() + ", " + e.getMessage());
- return null;
- }
- }
-
- return tools;
- }
-
- /**
- * Log a warning about missing JAR dependencies.
- */
- private static void logJarWarning() {
- System.out.println("Tools are unavailable, at least one JAR dependency missing.");
-
- try {
- final Class<Launcher> clazz = Launcher.class;
- final ClassLoader classLoader = clazz.getClassLoader();
-
- final String clazzName = clazz.getName().replace('.', '/') + ".class";
- // Figure out our own class path location.
- final URL launcherLocation = classLoader.getResource(clazzName);
- if (launcherLocation == null)
- return;
-
- String launcherPrefix = launcherLocation.toString()
- .replace(clazzName, "");
-
- // Figure our our location's MANIFEST.MF (class loader may be hitting a few).
- URL manifestResource = null;
- Enumeration<URL> manifests = classLoader.getResources("META-INF/MANIFEST.MF");
- while (manifests.hasMoreElements())
- {
- URL candidate = manifests.nextElement();
- if (candidate.toString().startsWith(launcherPrefix))
- {
- manifestResource = candidate;
- break;
- }
- }
-
- if (manifestResource == null)
- return;
-
- InputStream stream = null;
- try {
- stream = manifestResource.openStream();
- Manifest manifest = new Manifest(stream);
-
- System.out.println("Required JARs: "
- + manifest.getMainAttributes().getValue("Class-Path"));
- } catch (IOException e) {
- FileUtils.close(stream);
- }
- } catch (IOException e) {
- // Ignore.
- }
- }
-}
diff --git a/src/morfologik/tools/MorphEncoder.java b/src/morfologik/tools/MorphEncoder.java
deleted file mode 100644
index 236c1aa..0000000
--- a/src/morfologik/tools/MorphEncoder.java
+++ /dev/null
@@ -1,399 +0,0 @@
-package morfologik.tools;
-
-import java.io.UnsupportedEncodingException;
-
-import morfologik.fsa.FSA5;
-
-/**
- * A class that converts tabular data to fsa morphological format. Three formats
- * are supported:
- * <ul>
- * <li><b>standard</b>, see {@link #standardEncode}</li>
- * <li><b>prefix</b>, see {@link #prefixEncode}</li>
- * <li><b>infix</b>, see {@link #infixEncode}</li>
- * </ul>
- */
-public final class MorphEncoder {
- private final byte annotationSeparator;
-
- private static final int MAX_PREFIX_LEN = 3;
- private static final int MAX_INFIX_LEN = 3;
-
- private static final String UTF8 = "UTF-8";
-
- public MorphEncoder() {
- this(FSA5.DEFAULT_ANNOTATION);
- }
-
- public MorphEncoder(byte annotationSeparator) {
- this.annotationSeparator = annotationSeparator;
- }
-
- public static int commonPrefix(final byte[] s1, final byte[] s2) {
- final int maxLen = Math.min(s1.length, s2.length);
- for (int i = 0; i < maxLen; i++) {
- if (s1[i] != s2[i]) {
- return i;
- }
- }
- return maxLen;
- }
-
- private static byte[] subsequence(final byte[] bytes, final int start) {
- final byte[] newArray = new byte[bytes.length - start];
- System.arraycopy(bytes, start, newArray, 0, bytes.length - start);
- return newArray;
- }
-
- private static int copyTo(byte[] dst, final int pos, final byte[] src) {
- System.arraycopy(src, 0, dst, pos, src.length);
- return src.length;
- }
-
- private static int copyTo(byte[] dst, final int pos, final byte src) {
- byte[] single = new byte[1];
- single[0] = src;
- System.arraycopy(single, 0, dst, pos, 1);
- return 1;
- }
-
- /**
- * This method converts the wordForm, wordLemma and tag to the form:
- *
- * <pre>
- * wordForm + Kending + tags
- * </pre>
- *
- * where '+' is a separator, K is a character that specifies how many
- * characters should be deleted from the end of the inflected form to
- * produce the lexeme by concatenating the stripped string with the ending.
- *
- */
- public byte[] standardEncode(final byte[] wordForm,
- final byte[] wordLemma, final byte[] wordTag) {
- final int l1 = wordForm.length;
- final int prefix = commonPrefix(wordForm, wordLemma);
- final int len = wordLemma.length - prefix;
- int pos = 0;
- // 3 = 2 separators and K character
- int arrayLen = l1 + len + 3;
- if (wordTag != null) { //wordTag may be empty for stemming
- arrayLen += wordTag.length;
- }
- final byte[] bytes = new byte[arrayLen];
- pos += copyTo(bytes, pos, wordForm);
- pos += copyTo(bytes, pos, annotationSeparator);
- if (prefix == 0) {
- pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff));
- pos += copyTo(bytes, pos, wordLemma);
- } else {
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
- pos += copyTo(bytes, pos, annotationSeparator);
- if (wordTag != null) {
- pos += copyTo(bytes, pos, wordTag);
- }
- return bytes;
- }
-
- /**
- * This method converts wordform, wordLemma and the tag to the form:
- * <p>
- *
- * <pre>
- * inflected_form + LKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, L is the number of characters to be deleted
- * from the beginning of the word ("A" means none, "B" means one, "C" - 2,
- * etc.), K is a character that specifies how many characters should be
- * deleted from the end of the inflected form to produce the lexeme by
- * concatenating the stripped string with the ending ("A" means none,
- * "B' - 1, "C" - 2, and so on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- */
- public byte[] prefixEncode(final byte[] wordForm,
- final byte[] wordLemma, final byte[] wordTag) {
- final int l1 = wordForm.length;
- final int prefix = commonPrefix(wordForm, wordLemma);
-
- // 4 = 2 separators + LK characters
- int arrayLen = l1 + wordLemma.length + 4;
- if (wordTag != null) {
- arrayLen += wordTag.length;
- }
- final byte[] bytes = new byte[arrayLen];
- int pos = 0;
- pos += copyTo(bytes, pos, wordForm);
- pos += copyTo(bytes, pos, annotationSeparator);
- if (prefix == 0) {
- int prefixFound = 0;
- int prefix1 = 0;
- final int max = Math.min(wordForm.length, MAX_PREFIX_LEN);
- for (int i = 1; i <= max; i++) {
- prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma);
- if (prefix1 > 2) {
- prefixFound = i;
- break;
- }
- }
- if (prefixFound == 0) {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff));
- pos += copyTo(bytes, pos, wordLemma);
- } else {
- pos += copyTo(bytes, pos, (byte) ((prefixFound + 65) & 0xff));
- pos += copyTo(bytes, pos,
- (byte) ((l1 - prefixFound - prefix1 + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1));
- }
- } else {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
- pos += copyTo(bytes, pos, annotationSeparator);
- if (wordTag != null) {
- pos += copyTo(bytes, pos, wordTag);
- }
- final byte[] finalArray = new byte[pos];
- System.arraycopy(bytes, 0, finalArray, 0, pos);
- return finalArray;
- }
-
- /**
- * This method converts wordform, wordLemma and the tag to the form:
- * <pre>
- * inflected_form + MLKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, M is the position of characters to be deleted
- * towards the beginning of the inflected form ("A" means from the
- * beginning, "B" from the second character, "C" - from the third one, and
- * so on), L is the number of characters to be deleted from the position
- * specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a
- * character that specifies how many characters should be deleted from the
- * end of the inflected form to produce the lexeme by concatenating the
- * stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so
- * on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- */
- public byte[] infixEncode(final byte[] wordForm,
- final byte[] wordLemma, final byte[] wordTag) {
- final int l1 = wordForm.length;
- int prefixFound = 0;
- int prefix1 = 0;
- final int prefix = commonPrefix(wordForm, wordLemma);
- final int max = Math.min(l1, MAX_INFIX_LEN);
-
- // 5 = 2 separators + MLK characters
- int arrayLen = l1 + wordLemma.length + 5;
- if (wordTag != null) {
- arrayLen += wordTag.length;
- }
- final byte[] bytes = new byte[arrayLen];
- int pos = 0;
- pos += copyTo(bytes, pos, wordForm);
- pos += copyTo(bytes, pos, annotationSeparator);
- if (prefix == 0) {
- // we may have a prefix
- for (int i = 1; i <= max; i++) {
- prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma);
- if (prefix1 > 2) {
- prefixFound = i;
- break;
- }
- }
- if (prefixFound == 0) {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff));
- pos += copyTo(bytes, pos, wordLemma);
- } else {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((prefixFound + 65) & 0xff));
- pos += copyTo(bytes, pos,
- (byte) ((l1 - prefixFound - prefix1 + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1));
- }
- } else { // prefix found but we have to check the infix
-
- for (int i = 1; i <= max; i++) {
- prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma);
- if (prefix1 > 2) {
- prefixFound = i;
- break;
- }
- }
- int prefix2 = 0;
- int infixFound = 0;
- final int max2 = Math.min(l1 - prefix, MAX_INFIX_LEN);
- for (int i = 1; i <= max2; i++) {
- prefix2 = commonPrefix(subsequence(wordForm, prefix + i),
- subsequence(wordLemma, prefix));
- if (prefix2 > 2) {
- infixFound = i;
- break;
- }
- }
-
- if (prefixFound > infixFound) {
- if (prefixFound > 0 && (prefix1 > prefix)) {
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos,
- (byte) ((prefixFound + 65) & 0xff));
- pos += copyTo(bytes, pos, (byte) ((l1 - prefixFound
- - prefix1 + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1));
- } else {
- // infixFound == 0 && prefixFound == 0
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos,
- (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
- } else if (infixFound > 0 && prefix2 > 0) {
- // we have an infix, , and if there seems to be a prefix,
- // the infix is longer
- pos += copyTo(bytes, pos, (byte) ((prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, (byte) ((infixFound + 65) & 0xff));
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix - prefix2
- - infixFound + 65) & 0xff));
- pos += copyTo(bytes, pos,
- subsequence(wordLemma, prefix + prefix2));
- } else {
- // we have an infix, and if there seems to be a prefix,
- // the infix is longer
- // but the common prefix of two words is longer
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) 'A');
- pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff));
- pos += copyTo(bytes, pos, subsequence(wordLemma, prefix));
- }
-
- }
- pos += copyTo(bytes, pos, annotationSeparator);
- if (wordTag != null) {
- pos += copyTo(bytes, pos, wordTag);
- }
- final byte[] finalArray = new byte[pos];
- System.arraycopy(bytes, 0, finalArray, 0, pos);
- return finalArray;
- }
-
- /**
- * Converts a byte array to a given encoding.
- *
- * @param str
- * Byte-array to be converted.
- * @return Java String. If decoding is unsuccessful, the string is empty.
- */
- protected static String asString(final byte[] str, final String encoding) {
- try {
- return new String(str, encoding);
- } catch (UnsupportedEncodingException e) {
- return "";
- }
- }
-
- /**
- * A UTF-8 variant of {@link #standardEncode(byte[], byte[], byte[])} This
- * method converts the wordForm, wordLemma and tag to the form:
- *
- * <pre>
- * wordForm + Kending + tags
- * </pre>
- *
- * where '+' is a separator, K is a character that specifies how many
- * characters should be deleted from the end of the inflected form to
- * produce the lexeme by concatenating the stripped string with the ending.
- *
- * @throws UnsupportedEncodingException
- */
- public String standardEncodeUTF8(final String wordForm,
- final String wordLemma, final String wordTag)
- throws UnsupportedEncodingException {
- return asString(standardEncode(wordForm.getBytes(UTF8), wordLemma
- .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8);
- }
-
- /**
- * A UTF-8 variant of {@link #prefixEncode(byte[], byte[], byte[])} This
- * method converts wordform, wordLemma and the tag to the form:
- * <pre>
- * inflected_form + LKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, L is the number of characters to be deleted
- * from the beginning of the word ("A" means none, "B" means one, "C" - 2,
- * etc.), K is a character that specifies how many characters should be
- * deleted from the end of the inflected form to produce the lexeme by
- * concatenating the stripped string with the ending ("A" means none,
- * "B' - 1, "C" - 2, and so on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- * @throws UnsupportedEncodingException
- */
- public String prefixEncodeUTF8(final String wordForm,
- final String wordLemma, final String wordTag)
- throws UnsupportedEncodingException {
- return asString(prefixEncode(wordForm.getBytes(UTF8), wordLemma
- .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8);
- }
-
- /**
- * A UTF-8 variant of {@link #infixEncode(byte[], byte[], byte[])}.
- *
- * This method converts wordform, wordLemma and the tag to the form:
- * <pre>
- * inflected_form + MLKending + tags
- * </pre>
- * <p>
- * where '+' is a separator, M is the position of characters to be deleted
- * towards the beginning of the inflected form ("A" means from the
- * beginning, "B" from the second character, "C" - from the third one, and
- * so on), L is the number of characters to be deleted from the position
- * specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a
- * character that specifies how many characters should be deleted from the
- * end of the inflected form to produce the lexeme by concatenating the
- * stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so
- * on).
- *
- * @param wordForm
- * - inflected word form
- * @param wordLemma
- * - canonical form
- * @param wordTag
- * - tag
- * @return the encoded string
- * @throws UnsupportedEncodingException
- */
- public String infixEncodeUTF8(final String wordForm,
- final String wordLemma, final String wordTag)
- throws UnsupportedEncodingException {
- return asString(infixEncode(wordForm.getBytes(UTF8), wordLemma
- .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8);
- }
-}
diff --git a/src/morfologik/tools/MorphEncodingTool.java b/src/morfologik/tools/MorphEncodingTool.java
deleted file mode 100644
index 223a32b..0000000
--- a/src/morfologik/tools/MorphEncodingTool.java
+++ /dev/null
@@ -1,213 +0,0 @@
-package morfologik.tools;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-
-import morfologik.fsa.FSA5;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.ParseException;
-
-/**
- * This utility converts the dictionary in a text (tabbed) format into
- * the format accepted by the fsa building tools. It is meant to replace
- * the Perl and AWK scripts from the original FSA package.
- */
-class MorphEncodingTool extends Tool {
-
- private boolean prefixes = false;
- private boolean infixes = false;
- private boolean noWarn = false;
-
- private MorphEncoder encoder;
-
- /**
- *
- */
- protected void go(final CommandLine line) throws Exception {
-
- noWarn = line.hasOption(SharedOptions.noWarnIfTwoFields.getOpt());
-
- infixes = line.hasOption(SharedOptions.infixEncoding.getOpt());
-
- if (!infixes) {
- prefixes = line.hasOption(SharedOptions.prefixEncoding.getOpt());
- }
-
- char separator = FSA5.DEFAULT_ANNOTATION;
- if (line.hasOption(SharedOptions.annotationSeparatorCharacterOption.getLongOpt())) {
- String sep = line.getOptionValue(SharedOptions.annotationSeparatorCharacterOption.getLongOpt());
-
- if (sep.length() == 1) {
- separator = sep.charAt(0);
- }
-
- FSABuildTool.checkSingleByte(Character.toString(separator));
- }
- encoder = new MorphEncoder((byte) separator);
-
- // Determine input and output streams.
- final DataInputStream input = initializeInput(line);
- final DataOutputStream output = initializeOutput(line);
-
- try {
- process(input, output);
- output.flush();
-
- } finally {
- input.close();
- output.close();
- }
-
- }
-
- /**
- * Split fields
- * @param line
- * byte input buffer
- * @param pos
- * current offset in the file
- * @return
- * an array of three byte arrays; if there are less
- * than three fields, one of byte arrays is null. If the
- * line contains more than three fields, they are ignored.
- */
- private static byte[][] splitFields(final byte[] line, final int pos) {
- byte[][] outputArray = new byte[3][];
- int i = 0;
- int prevPos = 0;
- int arrayInd = 0;
- while (i < pos) {
- if (line[i] == (byte)'\t') { //tab
- outputArray[arrayInd] = new byte[i - prevPos];
- System.arraycopy(line, prevPos, outputArray[arrayInd], 0, i - prevPos);
- prevPos = i + 1;
- arrayInd++;
- }
- i++;
- }
- return outputArray;
- }
-
- /**
- * Process input stream, writing to output stream.
- *
- */
- protected void process(final DataInputStream input, final DataOutputStream output)
- throws IOException {
- long lnumber = 0;
- try {
- int bufPos = 0;
- byte[] buf = new byte[0xfffff]; // assumed that line is shorter than
- // 64K chars
- int dataByte = -1; // not declared within while loop
- byte[][] words;
- while ((dataByte = input.read()) != -1) {
- if (dataByte == (byte) '\n') {
- lnumber++;
- buf[bufPos++] = 9;
- words = splitFields(buf, bufPos);
- for (int i = 0; i < words.length; i++) {
- if (i < 1 && words[i] == null) {
- throw new IllegalArgumentException(
- "The input file has less than 2 fields in line: "
- + lnumber);
- }
- if (words[i] == null && !noWarn) {
- System.err.println("Line number: " + lnumber + " has less than three fields.");
- }
- }
-
- if (infixes) {
- output.write(encoder.infixEncode(words[0], words[1], words[2]));
- } else if (prefixes) {
- output.write(encoder.prefixEncode(words[0], words[1], words[2]));
- } else {
- output.write(encoder.standardEncode(words[0], words[1], words[2]));
- }
-
- output.writeByte('\n'); // Unix line end only.
- bufPos = 0;
- } else {
- if (dataByte != (byte) '\r') {
- buf[bufPos++] = (byte) dataByte;
- }
- }
- }
- } finally {
- input.close();
- }
- }
-
- /**
- * Command line options for the tool.
- */
- protected void initializeOptions(Options options) {
- options.addOption(SharedOptions.inputFileOption);
- options.addOption(SharedOptions.outputFileOption);
- options.addOption(SharedOptions.standardEncoding);
- options.addOption(SharedOptions.prefixEncoding);
- options.addOption(SharedOptions.infixEncoding);
- options.addOption(SharedOptions.noWarnIfTwoFields);
- options.addOption(SharedOptions.annotationSeparatorCharacterOption);
- }
-
- /**
- *
- */
- private static DataOutputStream initializeOutput(CommandLine line)
- throws IOException, ParseException {
- final DataOutputStream output;
- final String opt = SharedOptions.outputFileOption.getOpt();
- if (line.hasOption(opt)) {
- // Use output file.
- output = new DataOutputStream(
- new BufferedOutputStream(
- new FileOutputStream((File) line
- .getParsedOptionValue(opt))));
- } else {
- // Use standard output.
- output = new DataOutputStream(
- new BufferedOutputStream(
- System.out));
- }
- return output;
- }
-
- /**
- *
- */
- private static DataInputStream initializeInput(CommandLine line)
- throws IOException, ParseException {
- final DataInputStream input;
- final String opt = SharedOptions.inputFileOption.getOpt();
- if (line.hasOption(opt)) {
- // Use input file.
- input = new DataInputStream (
- new BufferedInputStream(
- new FileInputStream((File) line
- .getParsedOptionValue(opt))));
- } else {
- // Use standard input.
- input = new DataInputStream(
- new BufferedInputStream(
- System.in));
- }
- return input;
- }
-
- /**
- * Command line entry point.
- */
- public static void main(String[] args) throws Exception {
- final MorphEncodingTool tool = new MorphEncodingTool();
- tool.go(args);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/tools/PolishStemmingTool.java b/src/morfologik/tools/PolishStemmingTool.java
deleted file mode 100644
index 0abd897..0000000
--- a/src/morfologik/tools/PolishStemmingTool.java
+++ /dev/null
@@ -1,191 +0,0 @@
-package morfologik.tools;
-
-import java.io.*;
-import java.text.MessageFormat;
-import java.util.List;
-import java.util.Locale;
-
-import morfologik.stemming.*;
-
-import org.apache.commons.cli.*;
-
-/**
- * This utility parses input text, tokenizes it on whitespace and stems input
- * words, writing them to the output in column-based format:
- *
- * <pre>
- * word stem form
- * word stem form
- * </pre>
- *
- * Words for which no stems or forms are available have empty values in each
- * respective column. Columns are tab-delimited.
- */
-class PolishStemmingTool extends Tool {
- /**
- *
- */
- protected void go(CommandLine line) throws Exception {
- // Determine input/ output encoding.
- final String inputEncoding = getEncodingOption(line,
- SharedOptions.inputEncodingOption.getOpt());
-
- final String outputEncoding = getEncodingOption(line,
- SharedOptions.outputEncodingOption.getOpt());
-
- System.out.println("Input encoding: " + inputEncoding);
- System.out.println("Output encoding: " + outputEncoding);
-
- // Determine input and output streams.
- final Reader input = initializeInput(line, inputEncoding);
- final Writer output = initializeOutput(line, outputEncoding);
-
- final long start = System.currentTimeMillis();
- try {
- final long count = process(input, output);
-
- output.flush();
-
- final long millis = System.currentTimeMillis() - start;
- final double time = millis / 1000.0;
- final double wordsPerSec = time > 0 ? (count / time)
- : Double.POSITIVE_INFINITY;
- System.out
- .println(new MessageFormat(
- "Processed {0} words in {1,number,#.###} seconds ({2,number,#} words per second).",
- Locale.ENGLISH).format(new Object[] {
- new Long(count), new Double(millis / 1000.0),
- new Double(wordsPerSec) }));
- } finally {
- input.close();
- output.close();
- }
-
- }
-
- /**
- * Process input stream, writing to output stream.
- *
- * @return Returns the number of processed words.
- */
- protected long process(Reader input, Writer output) throws IOException {
- final IStemmer stemmer = new PolishStemmer();
- final StreamTokenizer st = new StreamTokenizer(input);
- st.eolIsSignificant(false);
- st.wordChars('+', '+');
-
- long count = 0;
- int token;
- while ((token = st.nextToken()) != StreamTokenizer.TT_EOF) {
- if (token == StreamTokenizer.TT_WORD) {
- final String word = st.sval;
-
- count++;
- final List<WordData> stems = stemmer.lookup(word);
- if (stems.size() == 0) {
- output.write(word);
- output.write("\t-\t-\n");
- } else {
- for (WordData wd : stems) {
- output.write(word);
- output.write("\t");
- output.write(asString(wd.getStem()));
- output.write("\t");
- output.write(asString(wd.getTag()));
- output.write("\n");
- }
- }
- }
- }
-
- return count;
- }
-
- private String asString(CharSequence stem) {
- if (stem == null)
- return "-";
- return stem.toString();
- }
-
- /**
- * Command line options for the tool.
- */
- protected void initializeOptions(Options options) {
- options.addOption(SharedOptions.inputFileOption);
- options.addOption(SharedOptions.inputEncodingOption);
- options.addOption(SharedOptions.outputFileOption);
- options.addOption(SharedOptions.outputEncodingOption);
- }
-
- /**
- *
- */
- private Writer initializeOutput(CommandLine line, String outputEncoding)
- throws IOException, ParseException {
- final Writer output;
- final String opt = SharedOptions.outputFileOption.getOpt();
- if (line.hasOption(opt)) {
- // Use output file.
- output = new OutputStreamWriter(
- new BufferedOutputStream(new FileOutputStream((File) line
- .getParsedOptionValue(opt))), outputEncoding);
- } else {
- // Use standard output.
- output = new OutputStreamWriter(System.out, outputEncoding);
- }
- return output;
- }
-
- /**
- *
- */
- private Reader initializeInput(CommandLine line, String inputEncoding)
- throws IOException, ParseException {
- final Reader input;
- final String opt = SharedOptions.inputFileOption.getOpt();
-
- if (line.hasOption(opt)) {
- // Use input file.
- input = new InputStreamReader(
- new BufferedInputStream(new FileInputStream((File) line
- .getParsedOptionValue(opt))), inputEncoding);
- } else {
- // Use standard input.
- input = new InputStreamReader(System.in, inputEncoding);
- }
- return input;
- }
-
- /**
- *
- */
- private String getEncodingOption(CommandLine line, String opt) {
- String encoding = System.getProperty("file.encoding", "iso-8859-1");
- if (line.hasOption(opt)) {
- encoding = line.getOptionValue(opt);
- }
- return encoding;
- }
-
- /*
- * Check if the dictionary is available.
- */
- @Override
- protected boolean isAvailable() {
- boolean available = true;
- try {
- new PolishStemmer();
- } catch (Throwable t) {
- available = false;
- }
- return available;
- }
-
- /**
- * Command line entry point.
- */
- public static void main(String[] args) throws Exception {
- final PolishStemmingTool tool = new PolishStemmingTool();
- tool.go(args);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/tools/SharedOptions.java b/src/morfologik/tools/SharedOptions.java
deleted file mode 100644
index e047d7f..0000000
--- a/src/morfologik/tools/SharedOptions.java
+++ /dev/null
@@ -1,153 +0,0 @@
-package morfologik.tools;
-
-import java.io.File;
-import java.util.Arrays;
-
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-
-/**
- * Options shared between tools.
- */
-@SuppressWarnings("static-access")
-final class SharedOptions {
- public final static Option fsaDictionaryFileOption = OptionBuilder
- .hasArg()
- .withArgName("file")
- .withDescription("Path to the FSA dictionary.")
- .withLongOpt("dictionary")
- .withType(File.class)
- .isRequired(true)
- .create("d");
-
- public final static Option decode = OptionBuilder
- .withDescription("Decode prefix/ infix/ suffix forms (if available).")
- .withLongOpt("decode")
- .isRequired(false)
- .create("x");
-
- public final static Option dataOnly = OptionBuilder
- .withDescription("Dump only raw FSA data.")
- .withLongOpt("raw-data")
- .isRequired(false)
- .create("r");
-
- public final static Option dot = OptionBuilder
- .withDescription("Dump the automaton as graphviz DOT file.")
- .withLongOpt("dot")
- .isRequired(false)
- .create();
-
- public final static Option inputEncodingOption = OptionBuilder
- .hasArg()
- .withArgName("codepage")
- .withDescription("Input stream encoding.")
- .withLongOpt("input-encoding")
- .isRequired(false)
- .create("ie");
-
- public final static Option outputEncodingOption = OptionBuilder
- .hasArg()
- .withArgName("codepage")
- .withDescription("Output stream encoding.")
- .withLongOpt("output-encoding")
- .isRequired(false)
- .create("oe");
-
- public final static Option inputFileOption = OptionBuilder
- .hasArg()
- .withArgName("file")
- .withDescription("Input file. If missing, standard input is used.")
- .withLongOpt("input")
- .withType(File.class)
- .isRequired(false)
- .create("i");
-
- public final static Option outputFileOption = OptionBuilder
- .hasArg()
- .withArgName("file")
- .withDescription("Output file. If missing, standard output is used.")
- .withLongOpt("output")
- .withType(File.class)
- .isRequired(false)
- .create("o");
-
- public final static Option outputFormatOption = OptionBuilder
- .hasArg()
- .withArgName("format")
- .withDescription("Name of the binary output format. Allowed values: " + Arrays.toString(FSABuildTool.Format.values()))
- .withLongOpt("format")
- .isRequired(false)
- .create("f");
-
- public final static Option fillerCharacterOption = OptionBuilder
- .hasArg()
- .withArgName("char")
- .withDescription("Custom filler character")
- .isRequired(false)
- .withLongOpt("filler")
- .create();
-
- public final static Option annotationSeparatorCharacterOption = OptionBuilder
- .hasArg()
- .withArgName("char")
- .withDescription("Custom annotation separator character")
- .isRequired(false)
- .withLongOpt("annotation")
- .create();
-
- public final static Option withNumbersOption = OptionBuilder
- .withDescription("Include numbers required for perfect hashing (larger automaton)")
- .isRequired(false)
- .withLongOpt("with-numbers")
- .create("n");
-
- public final static Option progressOption = OptionBuilder
- .withDescription("Print more verbose progress information")
- .isRequired(false)
- .withLongOpt("progress")
- .create();
-
- public final static Option inputSortedOption = OptionBuilder
- .withDescription("Assume the input is already sorted using C-sort (builds FSA directly, no in-memory sorting)")
- .isRequired(false)
- .withLongOpt("sorted")
- .create();
-
- public final static Option standardEncoding = OptionBuilder
- .withDescription("Encode suffix forms in a standard way")
- .withLongOpt("suffix")
- .isRequired(false)
- .create("suf");
-
- public final static Option prefixEncoding = OptionBuilder
- .withDescription("Encode suffix forms in a prefix way")
- .withLongOpt("prefix")
- .isRequired(false)
- .create("pre");
-
- public final static Option infixEncoding = OptionBuilder
- .withDescription("Encode suffix forms in an infix way")
- .withLongOpt("infix")
- .isRequired(false)
- .create("inf");
-
- public final static Option noWarnIfTwoFields = OptionBuilder
- .withDescription("Suppress warning for lines with only two fields (for stemming dictionaries)")
- .withLongOpt("nowarn")
- .isRequired(false)
- .create("nw");
-
- public final static Option statistics = OptionBuilder
- .withDescription("Print extra statistics.")
- .isRequired(false)
- .withLongOpt("stats")
- .create();
-
- /**
- * No instances. Use static fields.
- */
- private SharedOptions() {
- // empty
- }
-}
diff --git a/src/morfologik/tools/Tool.java b/src/morfologik/tools/Tool.java
deleted file mode 100644
index ebc1daf..0000000
--- a/src/morfologik/tools/Tool.java
+++ /dev/null
@@ -1,84 +0,0 @@
-package morfologik.tools;
-
-import org.apache.commons.cli.*;
-
-/**
- * Base class for command-line applications.
- */
-abstract class Tool {
- /** Command line options. */
- protected final Options options = new Options();
-
- /**
- * Initializes application context.
- */
- protected final void go(String[] args) {
- initializeOptions(options);
-
- if (args.length == 0) {
- printUsage();
- return;
- }
-
- final Parser parser = new GnuParser();
- final CommandLine line;
- try {
- line = parser.parse(options, args);
- try {
- go(line);
- } catch (Throwable e) {
- printError("Unhandled program error occurred.", e);
- }
- } catch (MissingArgumentException e) {
- printError("Provide the required argument for option: "
- + e.getMessage());
- } catch (MissingOptionException e) {
- printError("Provide the required option: " + e.getMessage());
- } catch (UnrecognizedOptionException e) {
- printError(e.getMessage());
- } catch (ParseException e) {
- printError("Could not parse command line: " + e.getMessage());
- }
- }
-
- /**
- * Print an error and an associated exception.
- */
- protected void printError(String msg, Throwable t) {
- printError(msg);
- t.printStackTrace(System.err);
- }
-
- /**
- * Print an error without an exception.
- */
- protected void printError(String msg) {
- System.err.println();
- System.err.println(msg);
- }
-
- /**
- * Prints usage (options).
- */
- protected void printUsage() {
- final HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp(this.getClass().getName(), options, true);
- }
-
- /**
- * Override and write your stuff using command line options.
- */
- protected abstract void go(CommandLine line) throws Exception;
-
- /**
- * Override and initialize options.
- */
- protected abstract void initializeOptions(Options options);
-
- /**
- * Is the tool available? <code>true</code> by default.
- */
- protected boolean isAvailable() {
- return true;
- }
-}
diff --git a/src/morfologik/tools/WriterMessageLogger.java b/src/morfologik/tools/WriterMessageLogger.java
deleted file mode 100644
index fceb698..0000000
--- a/src/morfologik/tools/WriterMessageLogger.java
+++ /dev/null
@@ -1,123 +0,0 @@
-package morfologik.tools;
-
-import java.io.PrintWriter;
-import java.util.*;
-
-/**
- * A logger dumping info to <code>System.err</code>.
- */
-public class WriterMessageLogger implements IMessageLogger {
- /**
- * Start of the world timestamp.
- */
- private final static long world = System.currentTimeMillis();
-
- /**
- * A single part: name, start timestamp.
- */
- private static class Part {
- final String name;
- final long start;
-
- Part(String name, long start) {
- this.name = name;
- this.start = start;
- }
- }
-
- /**
- * Is the output currently indented?
- */
- private boolean indent;
-
- /**
- * Active parts.
- */
- private ArrayDeque<Part> parts = new ArrayDeque<Part>();
-
- /**
- * Output writer.
- */
- private final PrintWriter writer;
-
- /**
- *
- */
- public WriterMessageLogger(PrintWriter w) {
- this.writer = w;
- }
-
- /*
- *
- */
- @Override
- public void log(String msg) {
- cancelIndent();
-
- writer.println(msg);
- writer.flush();
- }
-
- /*
- *
- */
- @Override
- public void log(String header, Object v) {
- cancelIndent();
-
- if (v instanceof Integer || v instanceof Long) {
- writer.println(String.format(Locale.ENGLISH, "%-30s %,11d", header, v));
- } else {
- writer.println(String.format(Locale.ENGLISH, "%-30s %11s", header, v.toString()));
- }
- writer.flush();
- }
-
- /*
- *
- */
- @Override
- public void startPart(String header) {
- cancelIndent();
-
- Part p = new Part(header, System.currentTimeMillis());
- parts.addLast(p);
-
- writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "..."));
- writer.flush();
-
- indent = true;
- }
-
- /*
- *
- */
- @Override
- public void endPart() {
- long now = System.currentTimeMillis();
- Part p = parts.removeLast();
-
- if (!indent) {
- writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "..."));
- }
-
- writer.println(
- String.format(Locale.ENGLISH, "%13.2f sec. [%6.2f sec.]",
- (now - p.start) / 1000.0,
- (now - world) / 1000.0));
- writer.flush();
-
- indent = false;
- }
-
- /*
- *
- */
- private void cancelIndent() {
- if (indent) {
- System.err.println();
- }
-
- indent = false;
- }
-}
diff --git a/src/morfologik/util/Arrays.java b/src/morfologik/util/Arrays.java
deleted file mode 100644
index 4d1d840..0000000
--- a/src/morfologik/util/Arrays.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package morfologik.util;
-
-/**
- * Compatibility layer for JVM 1.5.
- */
-public final class Arrays {
- private Arrays() {
- // No instances.
- }
-
- /**
- * Compare two lists of objects for reference-equality.
- */
- public static boolean referenceEquals(Object[] a1, int a1s, Object[] a2, int a2s, int length) {
- for (int i = 0; i < length; i++)
- if (a1[a1s++] != a2[a2s++])
- return false;
-
- return true;
- }
-
- /**
- * Compare two arrays for equality.
- */
- public static boolean equals(byte[] a1, int a1s, byte [] a2, int a2s, int length) {
- for (int i = 0; i < length; i++)
- if (a1[a1s++] != a2[a2s++])
- return false;
-
- return true;
- }
-
- /**
- * Compare two arrays for equality.
- */
- public static boolean equals(boolean[] a1, int a1s, boolean[] a2, int a2s, int length) {
- for (int i = 0; i < length; i++)
- if (a1[a1s++] != a2[a2s++])
- return false;
-
- return true;
- }
-
- /**
- * Compare two arrays for equality.
- */
- public static boolean equals(int[] a1, int a1s, int[] a2, int a2s, int length) {
- for (int i = 0; i < length; i++)
- if (a1[a1s++] != a2[a2s++])
- return false;
-
- return true;
- }
-
- /**
- * Convert an array of strings to bytes.
- */
- public static String toString(byte [] bytes, int start, int length)
- {
- if (bytes.length != length)
- {
- final byte [] sub = new byte [length];
- System.arraycopy(bytes, start, sub, 0, length);
- bytes = sub;
- }
- return java.util.Arrays.toString(bytes);
- }
-}
diff --git a/src/morfologik/util/BufferUtils.java b/src/morfologik/util/BufferUtils.java
deleted file mode 100644
index 6ccfbc6..0000000
--- a/src/morfologik/util/BufferUtils.java
+++ /dev/null
@@ -1,54 +0,0 @@
-package morfologik.util;
-
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-
-/**
- * Utility functions for buffers.
- */
-public final class BufferUtils {
-
- /**
- * No instances.
- */
- private BufferUtils() {
- // empty
- }
-
- /**
- * Ensure the byte buffer's capacity. If a new buffer is allocated, its
- * content is empty (the old buffer's contents is not copied).
- *
- * @param buffer
- * The buffer to check or <code>null</code> if a new buffer
- * should be allocated.
- */
- public static ByteBuffer ensureCapacity(ByteBuffer buffer, int capacity) {
- if (buffer == null || buffer.capacity() < capacity) {
- buffer = ByteBuffer.allocate(capacity);
- }
- return buffer;
- }
-
- /**
- * Ensure the char buffer's capacity. If a new buffer is allocated, its
- * content is empty (the old buffer's contents is not copied).
- *
- * @param buffer
- * The buffer to check or <code>null</code> if a new buffer
- * should be allocated.
- */
- public static CharBuffer ensureCapacity(CharBuffer buffer, int capacity) {
- if (buffer == null || buffer.capacity() < capacity) {
- buffer = CharBuffer.allocate(capacity);
- }
- return buffer;
- }
-
- /**
- * Convert a byte buffer to a string in platform default encoding.
- */
- public static String toString(ByteBuffer sequence) {
- return new String(sequence.array(), sequence.position(), sequence.remaining());
- }
-} \ No newline at end of file
diff --git a/src/morfologik/util/FileUtils.java b/src/morfologik/util/FileUtils.java
deleted file mode 100644
index 5d62212..0000000
--- a/src/morfologik/util/FileUtils.java
+++ /dev/null
@@ -1,137 +0,0 @@
-package morfologik.util;
-
-import java.io.*;
-
-/**
- * Utility functions.
- */
-public final class FileUtils {
-
- /**
- * No instances.
- */
- private FileUtils() {
- // empty
- }
-
- /**
- * Checks if the given file exists.
- */
- public static void assertExists(File fsaFile, boolean requireFile,
- boolean requireDirectory) throws IOException {
- if (!fsaFile.exists()) {
- throw new IOException("File does not exist: "
- + fsaFile.getAbsolutePath());
- }
-
- if (requireFile) {
- if (!fsaFile.isFile() || !fsaFile.canRead()) {
- throw new IOException("File cannot be read: "
- + fsaFile.getAbsolutePath());
- }
- }
-
- if (requireDirectory) {
- if (!fsaFile.isDirectory()) {
- throw new IOException("Not a directory: "
- + fsaFile.getAbsolutePath());
- }
- }
- }
-
- /**
- * Force any non-null closeables.
- */
- public static void close(Closeable... closeables) {
- for (Closeable c : closeables) {
- if (c != null) {
- try {
- c.close();
- } catch (IOException e) {
- // too bad
- }
- }
- }
- }
-
- /**
- * Reads all bytes from an input stream (until EOF).
- */
- public static byte[] readFully(InputStream stream) throws IOException {
- final ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 16);
- final byte[] buffer = new byte[1024 * 8];
- int bytesCount;
- while ((bytesCount = stream.read(buffer)) > 0) {
- baos.write(buffer, 0, bytesCount);
- }
- return baos.toByteArray();
- }
-
- /**
- * Read enough bytes to fill <code>array</code> If there are not enough
- * bytes, throw an exception.
- */
- public static void readFully(InputStream in, byte[] array)
- throws IOException {
- int offset = 0;
- int cnt;
- while ((cnt = in.read(array, offset, array.length - offset)) > 0) {
- offset += cnt;
-
- if (offset == array.length)
- break;
- }
-
- if (cnt < 0)
- throw new EOFException();
- }
-
- /**
- * Read exactly 4 bytes from the input stream.
- */
- public static int readInt(InputStream in) throws IOException {
- int v = 0;
- for (int i = 0; i < 4; i++) {
- v = (v << 8) | (readByte(in) & 0xff);
- }
- return v;
- }
-
- /**
- *
- */
- public static void writeInt(OutputStream os, int v) throws IOException {
- os.write( v >>> 24);
- os.write((v >>> 16) & 0xff);
- os.write((v >>> 8) & 0xff);
- os.write( v & 0xff);
- }
-
- /**
- * Read exactly 2 bytes from the input stream.
- */
- public static short readShort(InputStream in) throws IOException {
- return (short) (readByte(in) << 8 |
- readByte(in) & 0xff);
- }
-
- /**
- * Read exactly one byte from the input stream.
- *
- * @throws EOFException if EOF is reached.
- */
- public static byte readByte(InputStream in) throws IOException {
- int b = in.read();
- if (b == -1)
- throw new EOFException();
- return (byte) b;
- }
-
- /**
- *
- */
- public static void writeShort(OutputStream os, short v) throws IOException {
- os.write((v >>> 8) & 0xff);
- os.write( v & 0xff);
- }
-} \ No newline at end of file
diff --git a/src/morfologik/util/ResourceUtils.java b/src/morfologik/util/ResourceUtils.java
deleted file mode 100644
index 2c7bd23..0000000
--- a/src/morfologik/util/ResourceUtils.java
+++ /dev/null
@@ -1,58 +0,0 @@
-package morfologik.util;
-
-import java.io.*;
-import java.net.*;
-
-/**
- * Resource management utilities.
- */
-public final class ResourceUtils {
- /**
- * No instances.
- */
- private ResourceUtils() {
- }
-
- /**
- * Returns an input stream to the resource.
- *
- * @param resource
- * The path leading to the resource. Can be an URL, a path
- * leading to a class resource or a {@link File}.
- *
- * @return InputStream instance.
- * @throws IOException
- * If the resource could not be found or opened.
- */
- public static InputStream openInputStream(String resource)
- throws IOException {
- try {
- // See if the resource is an URL first.
- final URL url = new URL(resource);
- // success, load the resource.
- return url.openStream();
- } catch (MalformedURLException e) {
- // No luck. Fallback to class loader paths.
- }
-
- // Try current thread's class loader first.
- final ClassLoader ldr = Thread.currentThread().getContextClassLoader();
-
- InputStream is;
- if (ldr != null && (is = ldr.getResourceAsStream(resource)) != null) {
- return is;
- } else if ((is = ResourceUtils.class.getResourceAsStream(resource)) != null) {
- return is;
- } else if ((is = ClassLoader.getSystemResourceAsStream(resource)) != null) {
- return is;
- }
-
- // Try file path
- final File f = new File(resource);
- if (f.exists() && f.isFile() && f.canRead()) {
- return new FileInputStream(f);
- }
-
- throw new IOException("Could not locate resource: " + resource);
- }
-}