summaryrefslogtreecommitdiff
path: root/morfologik-tools/src/main/java/morfologik
diff options
context:
space:
mode:
Diffstat (limited to 'morfologik-tools/src/main/java/morfologik')
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java541
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java287
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java112
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/Launcher.java158
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java255
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java193
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java46
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java361
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java152
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/Tool.java102
-rw-r--r--morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java125
11 files changed, 2332 insertions, 0 deletions
diff --git a/morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java b/morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java
new file mode 100644
index 0000000..687b6cb
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java
@@ -0,0 +1,541 @@
+package morfologik.tools;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+import morfologik.fsa.CFSA2Serializer;
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSA5Serializer;
+import morfologik.fsa.FSABuilder;
+import morfologik.fsa.FSAFlags;
+import morfologik.fsa.FSAInfo;
+import morfologik.fsa.FSASerializer;
+import morfologik.fsa.FSAUtils;
+import morfologik.fsa.IMessageLogger;
+import morfologik.fsa.StateVisitor;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.lang.StringEscapeUtils;
+
+import com.carrotsearch.hppc.IntIntOpenHashMap;
+import com.carrotsearch.hppc.cursors.IntIntCursor;
+
+/**
+ * Convert from plain text input to a serialized FSA in any of the
+ * available {@link Format}s.
+ */
+public final class FSABuildTool extends Tool {
+ /**
+ * One megabyte.
+ */
+ private final static int MB = 1024 * 1024;
+
+ /**
+ * Binary serialization formats this tool is able to emit.
+ */
+ public enum Format {
+ FSA5,
+ CFSA2;
+
+ /**
+ * Creates a new serializer for this format.
+ *
+ * @return a fresh {@link FSA5Serializer} for {@link #FSA5} or a fresh
+ * {@link CFSA2Serializer} for {@link #CFSA2}
+ */
+ public FSASerializer getSerializer() {
+ if (this == FSA5) {
+ return new FSA5Serializer();
+ }
+ if (this == CFSA2) {
+ return new CFSA2Serializer();
+ }
+ // Not reachable with the two constants declared above.
+ throw new RuntimeException();
+ }
+ }
+
+ /**
+ * Be more verbose about progress.
+ */
+ private boolean printProgress;
+
+ /**
+ * Serializer used for emitting the FSA.
+ */
+ private FSASerializer serializer;
+
+ /**
+ * Output format name.
+ */
+ private Format format;
+
+ /**
+ * Warn about CR characters in the input (usually not what you want).
+ */
+ private boolean crWarning = false;
+
+ /**
+ * If <code>true</code>, the input is not buffered and sorted in-memory, but
+ * must be sorted externally (using the "C" convention: unsigned byte values).
+ */
+ private boolean inputSorted;
+
+ /**
+ * Print additional statistics about the output automaton.
+ */
+ private boolean statistics;
+
+ /**
+ * The actual construction of the FSA.
+ */
+ private FSABuilder builder = new FSABuilder();
+
+ /**
+ * Start time.
+ */
+ private long start = System.currentTimeMillis();
+
+ private IMessageLogger logger;
+
+ /**
+ * Callback that receives every non-empty input line read by
+ * {@link #forAllLines(InputStream, LineConsumer)}.
+ */
+ private static interface LineConsumer {
+ /**
+ * Process one line stored in <code>buffer[0..pos)</code>.
+ *
+ * @param buffer the buffer holding the line's bytes (may be longer than the line)
+ * @param pos the number of valid bytes in <code>buffer</code>
+ * @return the buffer the reader should fill with the next line: either the
+ * same buffer or a different one (for swapping)
+ */
+ byte[] process(byte[] buffer, int pos);
+ }
+
+ /**
+ * Unchecked exception used to break out of the anonymous line-processing
+ * delegates on error. It carries only a message; no stack trace is collected.
+ */
+ @SuppressWarnings("serial")
+ private static class TerminateProgramException extends RuntimeException {
+ /**
+ * @param msg the message describing why the program is being terminated
+ */
+ public TerminateProgramException(String msg) {
+ super(msg);
+ }
+
+ /**
+ * Skips the stack walk. Returns <code>this</code> rather than
+ * <code>null</code> because {@link Throwable#fillInStackTrace()} is specified
+ * to return a reference to this throwable and callers may dereference it.
+ */
+ @Override
+ public synchronized Throwable fillInStackTrace() {
+ return this;
+ }
+ }
+
+ /**
+ * Command line entry point after parsing arguments: reads the input, builds
+ * the automaton, optionally logs statistics and serializes the result.
+ *
+ * @param line the parsed command line; any positional arguments cause the
+ * usage message to be printed and the method to return
+ * @throws Exception if reading, building or serializing fails
+ */
+ protected void go(CommandLine line) throws Exception {
+ String[] args = line.getArgs();
+ if (args.length != 0) {
+ printUsage();
+ return;
+ }
+
+ // Parse the input options.
+ parseOptions(line);
+
+ logger = new WriterMessageLogger(new PrintWriter(System.err));
+ this.serializer.withLogger(logger);
+
+ BufferedInputStream inputStream = null;
+ try {
+ inputStream = initializeInput(line);
+
+ if (inputSorted) {
+ logger.log("Assuming input is already sorted");
+ }
+
+ checkUtf8Bom(inputStream);
+
+ final FSA fsa;
+ if (inputSorted) {
+ fsa = processSortedInput(inputStream);
+ } else {
+ fsa = processUnsortedInput(inputStream);
+ }
+
+ if (crWarning) logger.log("Warning: input contained carriage returns?");
+
+ if (statistics) {
+ logger.startPart("Statistics");
+ FSAInfo info = new FSAInfo(fsa);
+ TreeMap<Integer, Integer> fanout = FSAUtils.calculateFanOuts(fsa, fsa.getRootNode());
+ logger.endPart();
+
+ // For every state, sum the final arcs reachable from it; children are
+ // visited first (post-order) so their numbers are already in the map.
+ final IntIntOpenHashMap numbers = new IntIntOpenHashMap();
+ fsa.visitInPostOrder(new StateVisitor() {
+ public boolean accept(int state) {
+ int thisNodeNumber = 0;
+ for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
+ thisNodeNumber +=
+ (fsa.isArcFinal(arc) ? 1 : 0) +
+ (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc)));
+ }
+ numbers.put(state, thisNodeNumber);
+ return true;
+ }
+ });
+
+ // States whose computed number is exactly one are reported as "tail nodes".
+ int singleRLC = 0;
+ for (IntIntCursor c : numbers) {
+ if (c.value == 1) singleRLC++;
+ }
+
+ logger.log("Nodes", info.nodeCount);
+ logger.log("Arcs", info.arcsCount);
+ logger.log("Tail nodes", singleRLC);
+
+ logger.log("States with the given # of outgoing arcs:");
+ for (Map.Entry<Integer, Integer> e : fanout.entrySet()) {
+ logger.log(" #" + e.getKey(), e.getValue());
+ }
+
+ logger.log("FSA builder properties:");
+ for (Map.Entry<FSABuilder.InfoEntry, Object> e : builder.getInfo().entrySet()) {
+ logger.log(e.getKey().toString(), e.getValue());
+ }
+ }
+
+ // Save the result. The output is closed in a finally block so that the
+ // stream is flushed and released even if serialization fails midway.
+ logger.startPart("Serializing " + format);
+ final OutputStream output = initializeOutput(line);
+ try {
+ serializer.serialize(fsa, output);
+ } finally {
+ output.close();
+ }
+ logger.endPart();
+ } catch (OutOfMemoryError e) {
+ logger.log("Error: Out of memory. Pass -Xmx1024m argument (or more) to java.");
+ } finally {
+ // NOTE(review): initializeInput always returns a new BufferedInputStream
+ // wrapper, so the identity comparison with System.in never matches.
+ if (inputStream != System.in && inputStream != null) {
+ inputStream.close();
+ }
+ }
+ }
+
+ /**
+ * Peeks at the first bytes of the stream and prints a warning to standard
+ * error if they form a UTF-8 byte order mark. The stream is reset to its
+ * original position afterwards.
+ *
+ * @param is the input stream; must support mark/reset
+ * @throws IOException if reading or resetting the stream fails
+ */
+ private void checkUtf8Bom(InputStream is) throws IOException {
+ if (is.markSupported()) {
+ is.mark(3);
+
+ // Read at most three bytes, stopping at the first mismatch.
+ boolean bomDetected = false;
+ if (is.read() == 0xef) {
+ if (is.read() == 0xbb) {
+ bomDetected = (is.read() == 0xbf);
+ }
+ }
+
+ if (bomDetected) {
+ System.err.println("Warning: input starts with UTF-8 BOM bytes which is" +
+ " most likely not what you want. Use header-less UTF-8 file (unless you are" +
+ " encoding plain bytes in which case this message doesn't apply).");
+ }
+
+ is.reset();
+ return;
+ }
+
+ // Hard assertion: callers are expected to pass a mark-capable stream.
+ throw new AssertionError("Mark should be supported on input stream.");
+ }
+
+ /**
+ * Reads all input lines into memory, sorts them using
+ * {@link FSABuilder#LEXICAL_ORDERING} and builds the automaton from the
+ * sorted sequences. Each phase is reported to the logger.
+ *
+ * @param inputStream the stream to read input lines from
+ * @return the constructed automaton
+ * @throws IOException if reading the input fails
+ */
+ private FSA processUnsortedInput(InputStream inputStream)
+ throws IOException {
+ logger.startPart("Reading input");
+ final ArrayList<byte[]> sequences = readInput(inputStream);
+ logger.endPart();
+
+ logger.log("Input sequences", sequences.size());
+
+ logger.startPart("Sorting");
+ Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING);
+ logger.endPart();
+
+ logger.startPart("Building FSA");
+ for (int i = 0; i < sequences.size(); i++) {
+ final byte[] sequence = sequences.get(i);
+ builder.add(sequence, 0, sequence.length);
+ }
+ final FSA automaton = builder.complete();
+ logger.endPart();
+
+ return automaton;
+ }
+
+ /**
+ * Builds the automaton from input that is expected to be sorted already.
+ * Each non-empty line is compared against the previous one and added to the
+ * builder directly, so the input is never held in memory as a whole.
+ *
+ * @param inputStream the stream to read pre-sorted input lines from
+ * @return the constructed automaton
+ * @throws IOException if reading the input fails
+ * @throws TerminateProgramException if a line compares lower than its predecessor
+ */
+ private FSA processSortedInput(InputStream inputStream)
+ throws IOException {
+
+ int lines = forAllLines(inputStream, new LineConsumer() {
+ // Buffer handed back to the reader for the next line.
+ private byte [] current;
+ // The most recently processed line and its length, kept for the order check.
+ private byte [] previous = null;
+ private int previousLen;
+
+ // Note: the 'current' parameter shadows the field of the same name.
+ public byte[] process(byte[] current, int currentLen) {
+ // Verify the order.
+ if (previous != null) {
+ if (FSABuilder.compare(previous, 0, previousLen, current, 0, currentLen) > 0) {
+ logger.log("\n\nERROR: The input is not sorted: \n" +
+ dumpLine(previous, previousLen) + "\n" +
+ dumpLine(current, currentLen));
+ throw new TerminateProgramException("Input is not sorted.");
+ }
+ }
+
+ // Add to the automaton.
+ builder.add(current, 0, currentLen);
+
+ // Swap buffers: the old 'previous' buffer (or a fresh one of the same
+ // size on the first call) is returned for reuse, while the line just
+ // processed is retained as the new 'previous'.
+ this.current = previous != null ? previous : new byte [current.length];
+ this.previous = current;
+ this.previousLen = currentLen;
+
+ return this.current;
+ }
+ });
+
+ logger.startPart("Building FSA");
+ FSA fsa = builder.complete();
+ logger.endPart();
+ logger.log("Input sequences", lines);
+
+ return fsa;
+ }
+
+ /**
+ * Formats an input line for diagnostics: space-separated two-digit hex
+ * values, then <code>" | "</code>, then the bytes as characters with every
+ * byte that is not a letter or digit shown as a dot.
+ *
+ * @param line the buffer holding the line
+ * @param length the number of bytes of <code>line</code> to dump
+ * @return the formatted dump
+ */
+ protected String dumpLine(byte[] line, int length) {
+ final StringBuilder hex = new StringBuilder();
+ final StringBuilder text = new StringBuilder();
+
+ for (int i = 0; i < length; i++) {
+ final byte b = line[i];
+
+ if (i != 0) {
+ hex.append(" ");
+ }
+ hex.append(String.format("%02x", b));
+
+ if (Character.isLetterOrDigit(b)) {
+ text.append((char) b);
+ } else {
+ text.append(".");
+ }
+ }
+
+ return hex.append(" | ").append(text).toString();
+ }
+
+ /**
+ * Reads the tool's options from the parsed command line and configures the
+ * output format, the serializer and the boolean switches accordingly.
+ *
+ * @param line the parsed command line
+ * @throws TerminateProgramException if the requested output format is unknown
+ * @throws RuntimeException if an option is not supported by the selected serializer
+ * @throws IllegalArgumentException if the filler or annotation separator does
+ * not encode to a single byte in the platform-default charset
+ */
+ private void parseOptions(CommandLine line) {
+ String opt;
+
+ opt = SharedOptions.outputFormatOption.getOpt();
+ if (line.hasOption(opt)) {
+ String formatValue = line.getOptionValue(opt);
+ try {
+ // Use a fixed locale so the enum lookup does not depend on the JVM's
+ // default locale case-mapping rules.
+ format = Format.valueOf(formatValue.toUpperCase(Locale.ENGLISH));
+ } catch (IllegalArgumentException e) {
+ throw new TerminateProgramException("Not a valid format: "
+ + formatValue);
+ }
+ } else {
+ format = Format.FSA5;
+ }
+ serializer = format.getSerializer();
+
+ Charset defaultCharset = Charset.defaultCharset();
+ opt = SharedOptions.fillerCharacterOption.getLongOpt();
+ if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) {
+ String chr = StringEscapeUtils.unescapeJava(line.getOptionValue(opt));
+ // Use the byte that was actually validated instead of re-encoding the string.
+ serializer.withFiller(checkSingleByte(chr, defaultCharset));
+ }
+
+ opt = SharedOptions.annotationSeparatorCharacterOption.getLongOpt();
+ if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) {
+ String chr = StringEscapeUtils.unescapeJava(line.getOptionValue(opt));
+ serializer.withAnnotationSeparator(checkSingleByte(chr, defaultCharset));
+ }
+
+ opt = SharedOptions.withNumbersOption.getOpt();
+ if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.NUMBERS)) {
+ serializer.withNumbers();
+ }
+
+ opt = SharedOptions.progressOption.getLongOpt();
+ if (line.hasOption(opt)) {
+ printProgress = true;
+ }
+
+ opt = SharedOptions.inputSortedOption.getLongOpt();
+ if (line.hasOption(opt)) {
+ inputSorted = true;
+ }
+
+ opt = SharedOptions.statistics.getLongOpt();
+ if (line.hasOption(opt)) {
+ statistics = true;
+ }
+ }
+
+ /**
+ * Verifies that the current serializer supports the given flag.
+ *
+ * @param opt the option name, used in the error message
+ * @param flag the capability the option depends on
+ * @return always <code>true</code> when the capability is present
+ * @throws RuntimeException if the serializer's flags do not contain <code>flag</code>
+ */
+ private boolean requiredCapability(String opt, FSAFlags flag) {
+ final boolean supported = serializer.getFlags().contains(flag);
+ if (supported) {
+ return true;
+ }
+ throw new RuntimeException("This serializer does not support option: " + opt);
+ }
+
+ /**
+ * Check that the argument encodes to exactly one byte in the given charset
+ * and return that byte.
+ *
+ * @param chr the string to encode
+ * @param charset the charset used for the conversion
+ * @return the single byte <code>chr</code> encodes to
+ * @throws IllegalArgumentException if the encoded form is not exactly one byte long
+ */
+ public static byte checkSingleByte(String chr, Charset charset) {
+ final byte[] bytes = chr.getBytes(charset);
+ if (bytes.length == 1) {
+ return bytes[0];
+ }
+
+ // Report the length in the charset that was checked, not in the platform default.
+ throw new IllegalArgumentException("Filler and annotation characters must be single" +
+ "-byte values, " + chr + " has " + bytes.length + " bytes.");
+ }
+
+ /**
+ * Collects a private copy of every non-empty input line, in input order.
+ *
+ * @param is the stream to read from
+ * @return the list of lines, each as its own byte array
+ * @throws IOException if reading fails
+ */
+ private ArrayList<byte[]> readInput(InputStream is) throws IOException {
+ final ArrayList<byte[]> sequences = new ArrayList<byte[]>();
+ final LineConsumer collector = new LineConsumer() {
+ public byte[] process(byte[] buffer, int pos) {
+ // Copy the valid prefix; the reader keeps reusing the same buffer.
+ final byte[] copy = java.util.Arrays.copyOfRange(buffer, 0, pos);
+ sequences.add(copy);
+ return buffer;
+ }
+ };
+ forAllLines(is, collector);
+ return sequences;
+ }
+
+ /**
+ * Apply line consumer to all non-empty lines. Lines are delimited by
+ * <code>'\n'</code>; a trailing line without a terminating newline is
+ * processed as well.
+ *
+ * @param is the stream to read bytes from
+ * @param lineConsumer the callback receiving each non-empty line
+ * @return the number of non-empty lines passed to the consumer
+ * @throws IOException if reading fails
+ */
+ private int forAllLines(InputStream is, LineConsumer lineConsumer) throws IOException {
+ int lines = 0;
+ byte[] buffer = new byte[0];
+ int line = 0, b, pos = 0;
+ while ((b = is.read()) != -1) {
+ // Remember that a carriage return was seen; the byte itself is still
+ // stored in the line buffer below.
+ if (b == '\r' && !crWarning) {
+ crWarning = true;
+ }
+
+ if (b == '\n') {
+ if (pos > 0) {
+ // The consumer may hand back a different buffer for the next line.
+ buffer = lineConsumer.process(buffer, pos);
+ pos = 0;
+ lines++;
+ }
+
+ // 'line' counts newline characters (empty lines included) and is only
+ // advanced when progress printing is enabled.
+ if (printProgress && line++ > 0 && (line % 1000000) == 0) {
+ logger.log(String.format(Locale.ENGLISH, "%6.2fs, sequences: %d", elapsedTime(), line));
+ }
+ } else {
+ // Grow the buffer in steps of 10 bytes when it is full.
+ if (pos >= buffer.length) {
+ buffer = java.util.Arrays.copyOf(buffer, buffer.length + 10);
+ }
+ buffer[pos++] = (byte) b;
+ }
+ }
+
+ // Last line without a terminating newline.
+ if (pos > 0) {
+ lineConsumer.process(buffer, pos);
+ lines++;
+ }
+
+ return lines;
+ }
+
+ /**
+ * @return the number of seconds elapsed since {@link #start}
+ */
+ private double elapsedTime() {
+ final long elapsedMillis = System.currentTimeMillis() - start;
+ return elapsedMillis / 1000.0d;
+ }
+
+ /**
+ * Prints the help text for this tool's options, with the class name as the
+ * command name and an auto-generated usage line.
+ */
+ @Override
+ protected void printUsage() {
+ final String commandName = this.getClass().getName();
+ new HelpFormatter().printHelp(commandName, options, true);
+ }
+
+ /**
+ * Registers every command line option this tool understands.
+ *
+ * @param options the option set to add to
+ */
+ @Override
+ protected void initializeOptions(Options options) {
+ options
+ .addOption(SharedOptions.inputFileOption)
+ .addOption(SharedOptions.outputFileOption)
+ .addOption(SharedOptions.outputFormatOption)
+ .addOption(SharedOptions.fillerCharacterOption)
+ .addOption(SharedOptions.annotationSeparatorCharacterOption)
+ .addOption(SharedOptions.withNumbersOption)
+ .addOption(SharedOptions.progressOption)
+ .addOption(SharedOptions.inputSortedOption)
+ .addOption(SharedOptions.statistics);
+ }
+
+ /**
+ * Opens the output for the serialized automaton: the file named by the
+ * output file option if present, standard output otherwise. The result is
+ * always buffered.
+ *
+ * @param line the parsed command line
+ * @return a buffered output stream
+ * @throws IOException if the output file cannot be opened
+ * @throws ParseException if the option value cannot be converted
+ */
+ @SuppressWarnings("resource")
+ private static OutputStream initializeOutput(CommandLine line)
+ throws IOException, ParseException {
+ final String opt = SharedOptions.outputFileOption.getOpt();
+ if (!line.hasOption(opt)) {
+ // Use standard output.
+ return new BufferedOutputStream(System.out);
+ }
+
+ // Use output file.
+ final File outputFile = (File) line.getParsedOptionValue(opt);
+ return new BufferedOutputStream(new FileOutputStream(outputFile));
+ }
+
+ /**
+ * Opens the input: the file named by the input file option if present,
+ * standard input otherwise. When the file is larger than 20 MB and the input
+ * is not declared sorted, a hint about external sorting is logged.
+ *
+ * @param line the parsed command line
+ * @return a buffered input stream
+ * @throws IOException if the input file cannot be opened
+ * @throws ParseException if the option value cannot be converted
+ */
+ @SuppressWarnings("resource")
+ private BufferedInputStream initializeInput(CommandLine line)
+ throws IOException, ParseException {
+ final String opt = SharedOptions.inputFileOption.getOpt();
+ if (!line.hasOption(opt)) {
+ // Use standard input.
+ return new BufferedInputStream(System.in);
+ }
+
+ // Use input file.
+ final File inputFile = (File) line.getParsedOptionValue(opt);
+ if (!inputSorted && inputFile.length() > 20 * MB) {
+ logger.log("WARN: The input file is quite large, avoid\n" +
+ " in-memory sorting by piping pre-sorted\n" +
+ " input directly to fsa_build. Linux:\n" +
+ " export LC_ALL=C && \\\n" +
+ " sort input | \\\n" +
+ " java -jar morfologik.jar fsa_build --sorted -o dict.fsa");
+ }
+ return new BufferedInputStream(new FileInputStream(inputFile));
+ }
+
+ /**
+ * Command line entry point: creates the tool and hands it the raw arguments.
+ */
+ public static void main(String... args) throws Exception {
+ new FSABuildTool().go(args);
+ }
+} \ No newline at end of file
diff --git a/morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java b/morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java
new file mode 100644
index 0000000..510d8e5
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java
@@ -0,0 +1,287 @@
+package morfologik.tools;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.LinkedHashMap;
+import java.util.Locale;
+import java.util.Map;
+
+import morfologik.fsa.FSA;
+import morfologik.fsa.FSA5;
+import morfologik.fsa.FSAInfo;
+import morfologik.fsa.FSAUtils;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryAttribute;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.WordData;
+import morfologik.util.FileUtils;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
+/**
+ * This utility will dump the information and contents of a given {@link FSA}
+ * dictionary. It can dump dictionaries in the raw form (as fed to the
+ * <code>fsa_build</code> program) or decoding compressed stem forms.
+ */
+public final class FSADumpTool extends Tool {
+ /**
+ * Direct binary stream used for dictionary dumps.
+ */
+ private OutputStream os;
+
+ /**
+ * A writer for messages and any text-based output.
+ */
+ private Writer w;
+
+ /**
+ * Print raw data only, no headers.
+ */
+ private boolean dataOnly;
+
+ /**
+ * Decode from prefix/infix/suffix encodings.
+ */
+ private boolean decode;
+
+ /**
+ * Dump graphviz DOT file instead of automaton sequences.
+ */
+ private boolean dot;
+
+ /**
+ * Command line entry point after parsing arguments: reads the switches,
+ * checks the dictionary file and dumps it.
+ *
+ * @param line the parsed command line
+ */
+ protected void go(CommandLine line) throws Exception {
+ final String fileOpt = SharedOptions.fsaDictionaryFileOption.getOpt();
+ final File dictionaryFile = (File) line.getParsedOptionValue(fileOpt);
+
+ this.dataOnly = line.hasOption(SharedOptions.dataOnly.getOpt());
+ this.decode = line.hasOption(SharedOptions.decode.getOpt());
+ this.dot = line.hasOption(SharedOptions.dot.getLongOpt());
+
+ FileUtils.assertExists(dictionaryFile, true, false);
+
+ dump(dictionaryFile);
+ }
+
+ /**
+ * Dumps the properties and content of a dictionary to standard output.
+ * Informational text goes through the UTF-8 writer {@link #w}; raw sequence
+ * bytes go directly to {@link #os}. Warnings are printed to standard error.
+ *
+ * @param dictionaryFile the FSA file to dump; a metadata file next to it is
+ * used when present
+ * @throws UnsupportedEncodingException if a writer cannot be created for the
+ * required encoding
+ * @throws IOException if reading the dictionary or writing the output fails
+ */
+ private void dump(File dictionaryFile)
+ throws UnsupportedEncodingException, IOException {
+ final long start = System.currentTimeMillis();
+
+ final Dictionary dictionary;
+ final FSA fsa;
+
+ if (!dictionaryFile.canRead()) {
+ printWarning("Dictionary file does not exist: "
+ + dictionaryFile.getAbsolutePath());
+ return;
+ }
+
+ this.os = new BufferedOutputStream(System.out, 1024 * 32);
+ this.w = new OutputStreamWriter(os, "UTF-8");
+
+ if (hasMetadata(dictionaryFile)) {
+ dictionary = Dictionary.read(dictionaryFile);
+ fsa = dictionary.fsa;
+
+ final String encoding = dictionary.metadata.getEncoding();
+ if (!Charset.isSupported(encoding)) {
+ printWarning("Dictionary's charset is not supported "
+ + "on this JVM: " + encoding);
+ return;
+ }
+ } else {
+ dictionary = null;
+ // Close the file stream once the automaton has been read so the file
+ // handle is not leaked.
+ final FileInputStream fsaStream = new FileInputStream(dictionaryFile);
+ try {
+ fsa = FSA.read(fsaStream);
+ } finally {
+ fsaStream.close();
+ }
+ printWarning("Warning: FSA automaton without metadata file.");
+ }
+
+ printExtra("FSA properties");
+ printExtra("--------------");
+ printExtra("FSA implementation : " + fsa.getClass().getName());
+ printExtra("Compiled with flags : " + fsa.getFlags().toString());
+
+ if (!dataOnly) {
+ final FSAInfo info = new FSAInfo(fsa);
+ printExtra("Number of arcs : " + info.arcsCount + "/" + info.arcsCountTotal);
+ printExtra("Number of nodes : " + info.nodeCount);
+ printExtra("Number of final states : " + info.finalStatesCount);
+ printExtra("");
+ }
+
+ // Separator for dumping.
+ char separator = '\t';
+
+ if (fsa instanceof FSA5) {
+ printExtra("FSA5 properties");
+ printExtra("---------------");
+ printFSA5((FSA5) fsa);
+ printExtra("");
+ }
+
+ if (dictionary != null) {
+ printExtra("Dictionary metadata");
+ printExtra("-------------------");
+
+ // Copy the attributes and overwrite encoding/separator with display values.
+ Map<DictionaryAttribute,String> values =
+ new LinkedHashMap<DictionaryAttribute,String>(dictionary.metadata.getAttributes());
+ values.put(DictionaryAttribute.ENCODING, dictionary.metadata.getEncoding());
+ values.put(DictionaryAttribute.SEPARATOR, "0x"
+ + Integer.toHexString(dictionary.metadata.getSeparator())
+ + " ('" + dictionary.metadata.getSeparatorAsChar() + "')");
+
+ for (Map.Entry<DictionaryAttribute,String> e : values.entrySet()) {
+ printExtra(String.format(Locale.ENGLISH,
+ "%-40s: %s",
+ e.getKey().propertyName,
+ e.getValue()));
+ }
+ printExtra("");
+ }
+
+ int sequences = 0;
+ if (decode) {
+ if (dictionary == null) {
+ printWarning("No dictionary metadata available.");
+ return;
+ }
+
+ printExtra("Decoded FSA data (in the encoding above)");
+ printExtra("----------------------------------------");
+
+ final DictionaryLookup dl = new DictionaryLookup(dictionary);
+ final StringBuilder builder = new StringBuilder();
+ // Decoded entries are written in the dictionary's own encoding, not UTF-8.
+ final OutputStreamWriter osw = new OutputStreamWriter(os, dictionary.metadata.getEncoding());
+
+ CharSequence t;
+ for (WordData wd : dl) {
+ builder.setLength(0);
+ builder.append(wd.getWord());
+ builder.append(separator);
+
+ t = wd.getStem();
+ if (t == null)
+ t = "";
+ builder.append(t);
+ builder.append(separator);
+
+ t = wd.getTag();
+ if (t == null)
+ t = "";
+ builder.append(t);
+ builder.append('\n');
+
+ osw.write(builder.toString());
+ sequences++;
+ }
+ osw.flush();
+ } else {
+ if (dot) {
+ FSAUtils.toDot(w, fsa, fsa.getRootNode());
+ w.flush();
+ } else {
+ printExtra("FSA data (raw bytes in the encoding above)");
+ printExtra("------------------------------------------");
+
+ for (ByteBuffer bb : fsa) {
+ // NOTE(review): assumes each buffer starts at array offset 0 with
+ // position 0 -- confirm against the FSA iterator.
+ os.write(bb.array(), 0, bb.remaining());
+ os.write(0x0a);
+ sequences++;
+ }
+ }
+ }
+
+ printExtra("--------------------");
+
+ // Clamp to at least one millisecond to avoid dividing by zero below.
+ final long millis = Math.max(1, System.currentTimeMillis() - start);
+ printExtra(String
+ .format(
+ Locale.ENGLISH,
+ "Dictionary dumped in %.3f second(s), %d sequences (%d sequences/sec.).",
+ millis / 1000.0, sequences,
+ (int) (sequences / (millis / 1000.0))));
+
+ os.flush();
+ }
+
+ /**
+ * Prints the {@link FSA5}-specific header fields, one per line.
+ *
+ * @param fsa the automaton whose fields are printed
+ * @throws IOException if writing fails
+ */
+ private void printFSA5(FSA5 fsa) throws IOException {
+ final String[] rows = {
+ "GTL : " + fsa.gtl,
+ "Node extra data : " + fsa.nodeDataLength,
+ "Annotation separator : " + byteAsChar(fsa.annotation),
+ "Filler character : " + byteAsChar(fsa.filler)
+ };
+ for (String row : rows) {
+ printExtra(row);
+ }
+ }
+
+ /**
+ * Maps a byte to a character without charset decoding: unsigned values
+ * below 127 map to the character with that code, everything else to
+ * <code>'?'</code>.
+ */
+ private char byteAsChar(byte v) {
+ final int unsigned = v & 0xff;
+ return unsigned < 127 ? (char) unsigned : '?';
+ }
+
+ /**
+ * Writes an informational line to {@link #w} and flushes it, unless the
+ * tool runs in data-only mode.
+ */
+ private void printExtra(String msg) throws IOException {
+ if (!dataOnly) {
+ w.write(msg);
+ w.write('\n');
+ w.flush();
+ }
+ }
+
+ /**
+ * Prints a warning message to standard error, regardless of the data-only
+ * setting.
+ *
+ * @param msg the message to print
+ */
+ private void printWarning(String msg) {
+ System.err.println(msg);
+ }
+
+ /**
+ * Checks whether a readable metadata (features) file exists next to the
+ * given automaton file.
+ *
+ * @param fsaFile the automaton file
+ * @return <code>true</code> if the expected metadata file can be read
+ */
+ private static boolean hasMetadata(File fsaFile) {
+ final String featuresName = Dictionary.getExpectedFeaturesName(fsaFile.getName());
+ return new File(fsaFile.getParent(), featuresName).canRead();
+ }
+
+ /**
+ * Registers the command line options understood by this tool.
+ *
+ * @param options the option set to add to
+ */
+ protected void initializeOptions(Options options) {
+ options
+ .addOption(SharedOptions.fsaDictionaryFileOption)
+ .addOption(SharedOptions.dataOnly)
+ .addOption(SharedOptions.decode)
+ .addOption(SharedOptions.dot);
+ }
+
+ /**
+ * Command line entry point: creates the tool and hands it the raw arguments.
+ */
+ public static void main(String... args) throws Exception {
+ new FSADumpTool().go(args);
+ }
+} \ No newline at end of file
diff --git a/morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java b/morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java
new file mode 100644
index 0000000..e913b7f
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java
@@ -0,0 +1,112 @@
+package morfologik.tools;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.*;
+import java.util.*;
+import java.util.Map.Entry;
+
+import morfologik.stemming.*;
+import morfologik.stemming.Dictionary;
+
+/**
+ * Calculate inflection frames from the Polish dictionary.
+ */
+public class InflectionFramesTool {
+ /**
+ * Command line entry point; arguments are ignored.
+ */
+ public static void main(String[] args) throws IOException {
+ final InflectionFramesTool tool = new InflectionFramesTool();
+ tool.inflectionFrames();
+ }
+
+ /**
+ * Groups the stems of the Polish dictionary by the set of tags they occur
+ * with (their "inflection frame") and prints the frames to standard output,
+ * most populated first, followed by the total number of frames.
+ *
+ * @throws IOException declared by this method; presumably raised when the
+ * dictionary cannot be loaded or decoded -- confirm against the
+ * stemming API
+ */
+ @SuppressWarnings( { "unused" })
+ public void inflectionFrames() throws IOException {
+ final Dictionary pl = Dictionary.getForLanguage("pl");
+ final DictionaryLookup dict = new DictionaryLookup(pl);
+ final CharsetDecoder decoder = pl.metadata.getDecoder();
+
+ // Stem -> list of distinct tags seen for that stem.
+ final HashMap<String, ArrayList<String>> forms =
+ new HashMap<String, ArrayList<String>>();
+
+ // Scratch buffers reused across iterations; the calls below may return
+ // replacement buffers, hence the reassignments.
+ ByteBuffer stemBuffer = ByteBuffer.allocate(0);
+ ByteBuffer inflBuffer = ByteBuffer.allocate(0);
+ ByteBuffer stemDecoded = ByteBuffer.allocate(0);
+
+ // Upper bound on the number of entries processed (effectively unlimited).
+ int limit = Integer.MAX_VALUE;
+
+ final Iterator<WordData> i = new DictionaryIterator(pl, decoder, false);
+ while (i.hasNext() && limit-- > 0) {
+ final WordData wd = i.next();
+
+ final CharSequence inflected = wd.getWord();
+ final CharSequence stemEncoded = wd.getStem();
+ final CharSequence tag = wd.getTag();
+ // Entries without a tag do not contribute to any frame.
+ if (tag == null)
+ continue;
+
+ inflBuffer.clear();
+ inflBuffer = wd.getWordBytes(inflBuffer);
+
+ stemBuffer.clear();
+ stemBuffer = wd.getStemBytes(stemBuffer);
+
+ stemDecoded = DictionaryLookup.decodeBaseForm(stemDecoded, stemBuffer
+ .array(), stemBuffer.remaining(), inflBuffer, pl.metadata);
+ stemDecoded.flip();
+
+ final String stem = decoder.decode(stemDecoded).toString();
+ final String form = tag.toString().intern();
+
+ ArrayList<String> frames = forms.get(stem);
+ if (frames == null) {
+ forms.put(stem, frames = new ArrayList<String>());
+ }
+
+ if (!frames.contains(form)) {
+ frames.add(form);
+ }
+ }
+
+ // Sort the forms so that we get a unique key. Then iteratively add them
+ // to another hash (by form this time).
+ final HashMap<String, ArrayList<String>> frames =
+ new HashMap<String, ArrayList<String>>();
+
+ StringBuilder key = new StringBuilder();
+ for (Map.Entry<String, ArrayList<String>> e : forms.entrySet()) {
+ Collections.sort(e.getValue());
+
+ // The key is the space-separated (and space-terminated) sorted tag list.
+ key.setLength(0);
+ for (String s : e.getValue())
+ key.append(s).append(" ");
+
+ final String k = key.toString();
+ ArrayList<String> words = frames.get(k);
+ if (words == null) {
+ frames.put(k, words = new ArrayList<String>());
+ }
+ words.add(e.getKey());
+
+ // Drop the reference to the tag list, which is no longer needed.
+ e.setValue(null);
+ }
+
+ // Print inflection frames.
+ ArrayList<Map.Entry<String, ArrayList<String>>> entries =
+ new ArrayList<Map.Entry<String, ArrayList<String>>>();
+
+ entries.addAll(frames.entrySet());
+ // Descending by the number of stems sharing the frame.
+ Collections.sort(entries,
+ new Comparator<Map.Entry<String, ArrayList<String>>>() {
+ public int compare(Entry<String, ArrayList<String>> o1,
+ Entry<String, ArrayList<String>> o2) {
+ return o2.getValue().size() - o1.getValue().size();
+ }
+ });
+
+ for (Map.Entry<String, ArrayList<String>> e : entries) {
+ System.out.println(String.format("%6d %s %s",
+ e.getValue().size(), e.getKey(), e.getValue()));
+ }
+
+ System.out.println("Total frames: " + frames.size());
+ }
+}
diff --git a/morfologik-tools/src/main/java/morfologik/tools/Launcher.java b/morfologik-tools/src/main/java/morfologik/tools/Launcher.java
new file mode 100644
index 0000000..320c1dc
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/Launcher.java
@@ -0,0 +1,158 @@
+package morfologik.tools;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Method;
+import java.net.URL;
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.jar.Manifest;
+
+import morfologik.util.FileUtils;
+
+/**
+ * A launcher for other command-line tools.
+ */
+public final class Launcher {
+ /**
+ * Describes a launchable tool: its class and a one-line description.
+ */
+ final static class ToolInfo {
+ public final Class<? extends Tool> clazz;
+ public final String info;
+
+ public ToolInfo(Class<? extends Tool> clazz, String info) {
+ this.clazz = clazz;
+ this.info = info;
+ }
+
+ /**
+ * Reflectively calls the tool's static <code>main(String[])</code> method.
+ *
+ * @param subArgs the arguments to pass to the tool
+ * @throws Exception if the method cannot be found or the invocation fails
+ */
+ public void invoke(String[] subArgs) throws Exception {
+ final Class<?>[] signature = { String[].class };
+ final Object[] arguments = { subArgs };
+ final Method mainMethod = clazz.getMethod("main", signature);
+ mainMethod.invoke(null, arguments);
+ }
+ }
+
+ /**
+ * Command line entry point. Without arguments the available tools are
+ * listed; otherwise the first argument selects the tool and the remaining
+ * arguments are passed on to it.
+ */
+ public static void main(String[] args) throws Exception {
+ // A null result means the tools are unavailable; initTools() has already
+ // printed the reason.
+ final TreeMap<String, ToolInfo> tools = initTools();
+ if (tools == null) {
+ return;
+ }
+
+ if (args.length > 0) {
+ final String toolName = args[0];
+ final ToolInfo selected = tools.get(toolName);
+ if (selected == null) {
+ System.out.println("Unknown tool: " + toolName);
+ return;
+ }
+
+ // Everything after the tool name belongs to the tool itself.
+ final String[] subArgs = java.util.Arrays.copyOfRange(args, 1, args.length);
+ selected.invoke(subArgs);
+ return;
+ }
+
+ System.out.println("Provide tool name and its command-line options. "
+ + "Available tools:");
+ for (java.util.Map.Entry<String, ToolInfo> entry : tools.entrySet()) {
+ System.out.println(String.format(" %-10s - %s", entry.getKey(),
+ entry.getValue().info));
+ }
+ }
+
+ /**
+ * Registers the known tools and checks that each of them can be
+ * instantiated.
+ *
+ * @return the tools keyed by command name, or <code>null</code> if any tool
+ * failed to initialize (a message has been printed in that case)
+ */
+ static TreeMap<String, ToolInfo> initTools() {
+ final TreeMap<String, ToolInfo> registry = new TreeMap<String, ToolInfo>();
+
+ registry.put("fsa_build", new ToolInfo(FSABuildTool.class,
+ "Create an automaton from plain text files."));
+ registry.put("fsa_dump", new ToolInfo(FSADumpTool.class,
+ "Dump an FSA dictionary."));
+ registry.put("tab2morph", new ToolInfo(MorphEncodingTool.class,
+ "Convert tabbed dictionary to fsa encoding format."));
+ registry.put("plstem", new ToolInfo(PolishStemmingTool.class,
+ "Apply Polish dictionary stemming to the input."));
+
+ // Probe every tool; the first failure makes all tools unavailable.
+ for (ToolInfo toolInfo : registry.values()) {
+ try {
+ toolInfo.clazz.newInstance().isAvailable();
+ } catch (NoClassDefFoundError e) {
+ logJarWarning();
+ return null;
+ } catch (Throwable e) {
+ System.out.println("Tools could not be initialized because" +
+ " of an exception during initialization: "
+ + e.getClass().getName() + ", " + e.getMessage());
+ return null;
+ }
+ }
+
+ return registry;
+ }
+
+ /**
+ * Log a warning about missing JAR dependencies. If the manifest belonging
+ * to the launcher's own class path location can be found, its
+ * <code>Class-Path</code> attribute is printed as the list of required JARs.
+ * I/O problems while looking for the manifest are ignored.
+ */
+ private static void logJarWarning() {
+ System.out.println("Tools are unavailable, at least one JAR dependency missing.");
+
+ try {
+ final Class<Launcher> clazz = Launcher.class;
+ final ClassLoader classLoader = clazz.getClassLoader();
+
+ final String clazzName = clazz.getName().replace('.', '/') + ".class";
+ // Figure out our own class path location.
+ final URL launcherLocation = classLoader.getResource(clazzName);
+ if (launcherLocation == null)
+ return;
+
+ String launcherPrefix = launcherLocation.toString()
+ .replace(clazzName, "");
+
+ // Figure out our location's MANIFEST.MF (class loader may be hitting a few).
+ URL manifestResource = null;
+ Enumeration<URL> manifests = classLoader.getResources("META-INF/MANIFEST.MF");
+ while (manifests.hasMoreElements())
+ {
+ URL candidate = manifests.nextElement();
+ if (candidate.toString().startsWith(launcherPrefix))
+ {
+ manifestResource = candidate;
+ break;
+ }
+ }
+
+ if (manifestResource == null)
+ return;
+
+ InputStream stream = null;
+ try {
+ stream = manifestResource.openStream();
+ Manifest manifest = new Manifest(stream);
+
+ System.out.println("Required JARs: "
+ + manifest.getMainAttributes().getValue("Class-Path"));
+ } finally {
+ // Close on success as well as on failure so the stream is not leaked.
+ FileUtils.close(stream);
+ }
+ } catch (IOException e) {
+ // Ignore: the extra diagnostics are best-effort only.
+ }
+ }
+}
diff --git a/morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java b/morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java
new file mode 100644
index 0000000..dfade2d
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java
@@ -0,0 +1,255 @@
+package morfologik.tools;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Locale;
+
+import morfologik.fsa.FSA5;
+import morfologik.stemming.EncoderType;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.lang.StringEscapeUtils;
+
+
+/**
+ * This utility converts the dictionary in a text (tabbed) format into
+ * the format accepted by the fsa building tools. It is meant to replace
+ * the Perl and AWK scripts from the original FSA package.
+ */
+class MorphEncodingTool extends Tool {
+ private static Charset US_ASCII = Charset.forName("US-ASCII");
+ private boolean noWarn = false;
+ private SequenceAssembler encoder;
+ private byte separatorByte;
+ private char separator;
+
+    /**
+     * Reads the tool's options, opens the input and output streams and runs
+     * {@link #process(DataInputStream, DataOutputStream)}.
+     *
+     * <p>The encoder defaults to {@link EncoderType#SUFFIX} and the annotation
+     * separator to {@link FSA5#DEFAULT_ANNOTATION}; both can be overridden from
+     * the command line.</p>
+     *
+     * @param line Parsed command line.
+     * @throws IllegalArgumentException If the encoder name is not one of
+     *         {@link EncoderType#values()} or the separator is not a single
+     *         character within the byte range.
+     */
+    protected void go(final CommandLine line) throws Exception {
+        noWarn = line.hasOption(SharedOptions.noWarnIfTwoFields.getOpt());
+
+        EncoderType encType = EncoderType.SUFFIX;
+        if (line.hasOption(SharedOptions.encoder.getOpt())) {
+            String encValue = line.getOptionValue(SharedOptions.encoder.getOpt());
+            try {
+                // Locale-independent upper-casing: the default locale may map
+                // 'i' to a dotted capital (Turkish), which would reject "infix".
+                encType = EncoderType.valueOf(encValue.toUpperCase(Locale.ROOT));
+            } catch (IllegalArgumentException e) {
+                throw new IllegalArgumentException("Invalid encoder: " + encValue + ", "
+                        + "allowed values: " + Arrays.toString(EncoderType.values()), e);
+            }
+        }
+
+        // Both representations of the separator need a default: process() uses
+        // separatorByte to reject entries that contain the annotation byte, and
+        // it would otherwise stay 0 when no custom separator is given.
+        separator = FSA5.DEFAULT_ANNOTATION;
+        separatorByte = (byte) separator;
+        if (line.hasOption(SharedOptions.annotationSeparatorCharacterOption.getLongOpt())) {
+            String sep = line.getOptionValue(SharedOptions.annotationSeparatorCharacterOption.getLongOpt());
+
+            // Decode escape sequences.
+            sep = StringEscapeUtils.unescapeJava(sep);
+            if (sep.length() != 1) {
+                throw new IllegalArgumentException("Field separator must be a single character: " + sep);
+            }
+            if (sep.charAt(0) > 0xff) {
+                throw new IllegalArgumentException("Field separator not within byte range: " + (int) sep.charAt(0));
+            }
+            separator = sep.charAt(0);
+            separatorByte = FSABuildTool.checkSingleByte(Character.toString(separator), Charset.defaultCharset());
+        }
+
+        // NOTE(review): the assembler receives (byte) separator while the sanity
+        // check uses separatorByte; these can differ for non-ASCII separators
+        // depending on the default charset - confirm which one is intended.
+        encoder = new SequenceAssembler(SequenceEncoders.forType(encType), (byte) separator);
+
+        // Determine input and output streams. The nesting guarantees that each
+        // stream is closed even if opening or closing the other one fails.
+        final DataInputStream input = initializeInput(line);
+        try {
+            final DataOutputStream output = initializeOutput(line);
+            try {
+                process(input, output);
+                output.flush();
+            } finally {
+                output.close();
+            }
+        } finally {
+            input.close();
+        }
+    }
+
+ /**
+ * Process input stream, writing to output stream.
+ *
+ * <p>The input is consumed byte by byte: a carriage return is dropped, a tab
+ * ends the current column and a line feed (or the end of the stream) ends the
+ * current line. Every non-empty line must hold 2 or 3 columns (word form, lemma
+ * and an optional tag); it is encoded with the configured encoder and written
+ * to the output followed by a single line feed byte. Empty lines are reported
+ * on standard error and skipped.</p>
+ *
+ * <p>The input stream is closed before this method returns.</p>
+ *
+ * @param input Stream with tab-separated entries.
+ * @param output Stream receiving the encoded entries.
+ * @throws IOException If reading or writing fails.
+ * @throws IllegalArgumentException If a line has fewer than 2 or more than 3
+ * columns, or if the word form or the lemma contains the annotation
+ * separator byte.
+ */
+ protected void process(final DataInputStream input, final DataOutputStream output)
+ throws IOException {
+ long lnumber = 0; // 1-based number of the line being completed, used in messages only.
+ try {
+ int bufPos = 0; // Number of valid bytes of the current column in buf.
+ byte[] buf = new byte[0]; // Grows on demand, reused across columns and lines.
+ ArrayList<byte[]> columns = new ArrayList<byte[]>(); // Completed columns of the current line.
+ int dataByte;
+ do {
+ dataByte = input.read();
+ switch (dataByte) {
+ case '\r':
+ // Ignore CR
+ continue;
+
+ case '\t':
+ columns.add(Arrays.copyOf(buf, bufPos)); // Close the current column.
+ bufPos = 0;
+ break;
+
+ case -1:
+ // Process EOF as if we encountered \n. fall-through.
+
+ case '\n':
+ lnumber++;
+ if (bufPos == 0 && columns.isEmpty()) { // Nothing collected: an empty line or a clean EOF.
+ if (dataByte != -1) {
+ System.err.println(String.format(Locale.ROOT,
+ "Ignoring empty line %d.", lnumber));
+ }
+ break;
+ }
+
+ columns.add(Arrays.copyOf(buf, bufPos)); // Close the last column of the line.
+
+ if (columns.size() < 2 || columns.size() > 3) {
+ throw new IllegalArgumentException(
+ String.format(Locale.ROOT, "Every \\n-delimited 'line' must contain 2 or 3 columns, line %d has %d. US-ASCII version of this line: %s",
+ lnumber,
+ columns.size(),
+ toAscii(columns)));
+ }
+
+ if (columns.size() == 2 && !noWarn) { // Two-column lines are legal but reported unless suppressed.
+ System.err.println(String.format(Locale.ROOT,
+ "Line %d has %d columns. US-ASCII version of this line: %s",
+ lnumber,
+ columns.size(),
+ toAscii(columns)));
+ }
+
+ byte [] wordForm = columns.get(0);
+ byte [] wordLemma = columns.get(1);
+ if (contains(wordForm, separatorByte) ||
+ contains(wordLemma, separatorByte)) {
+ throw new IllegalArgumentException(
+ String.format(Locale.ROOT,
+ "Either word or lemma in line %d contain the annotation byte '%s': %s",
+ lnumber,
+ separator,
+ toAscii(columns)));
+ }
+
+ output.write(encoder.encode(
+ wordForm,
+ wordLemma,
+ columns.size() > 2 ? columns.get(2) : null)); // The third column is optional.
+
+ output.writeByte('\n');
+
+ bufPos = 0; // Reset state for the next line.
+ columns.clear();
+ break;
+
+ default:
+ if (bufPos >= buf.length) {
+ buf = Arrays.copyOf(buf, buf.length + 1024); // Grow the column buffer in 1 KB steps.
+ }
+ buf[bufPos++] = (byte) dataByte;
+ }
+ } while (dataByte != -1);
+ } finally {
+ input.close();
+ }
+ }
+
+    /**
+     * Checks whether a byte occurs anywhere in the given sequence.
+     *
+     * @param seq Sequence to scan.
+     * @param b Byte to look for.
+     * @return <code>true</code> if <code>seq</code> holds at least one
+     *         occurrence of <code>b</code>.
+     */
+    private boolean contains(byte [] seq, byte b) {
+        for (byte candidate : seq) {
+            if (candidate == b) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+ private String toAscii(ArrayList<byte []> columns)
+ {
+ StringBuilder b = new StringBuilder();
+ for (int i = 0; i < columns.size(); i++) {
+ if (i > 0) b.append("\t");
+ b.append(new String(columns.get(i), US_ASCII));
+ }
+ return b.toString();
+ }
+
+ /**
+ * Command line options for the tool.
+ *
+ * <p>Registers the input file, output file, encoder, two-field warning
+ * suppression and annotation separator options.</p>
+ *
+ * @param options Option set to add this tool's options to.
+ */
+ protected void initializeOptions(Options options) {
+ options.addOption(SharedOptions.inputFileOption);
+ options.addOption(SharedOptions.outputFileOption);
+ options.addOption(SharedOptions.encoder);
+ options.addOption(SharedOptions.noWarnIfTwoFields);
+ options.addOption(SharedOptions.annotationSeparatorCharacterOption);
+ }
+
+    /**
+     * Opens the output stream: the file given with the output option, or
+     * standard output when the option is absent. The stream is buffered in
+     * both cases.
+     */
+    private static DataOutputStream initializeOutput(CommandLine line)
+        throws IOException, ParseException {
+        final String opt = SharedOptions.outputFileOption.getOpt();
+        if (!line.hasOption(opt)) {
+            // Use standard output.
+            return new DataOutputStream(new BufferedOutputStream(System.out));
+        }
+
+        // Use output file.
+        final File target = (File) line.getParsedOptionValue(opt);
+        return new DataOutputStream(
+                new BufferedOutputStream(
+                        new FileOutputStream(target)));
+    }
+
+    /**
+     * Opens the input stream: the file given with the input option, or
+     * standard input when the option is absent. The stream is buffered in
+     * both cases.
+     */
+    private static DataInputStream initializeInput(CommandLine line)
+        throws IOException, ParseException {
+        final String opt = SharedOptions.inputFileOption.getOpt();
+        if (!line.hasOption(opt)) {
+            // Use standard input.
+            return new DataInputStream(new BufferedInputStream(System.in));
+        }
+
+        // Use input file.
+        final File source = (File) line.getParsedOptionValue(opt);
+        return new DataInputStream(
+                new BufferedInputStream(
+                        new FileInputStream(source)));
+    }
+
+    /**
+     * Command line entry point: creates the tool and hands it the raw arguments.
+     */
+    public static void main(String... args) throws Exception {
+        new MorphEncodingTool().go(args);
+    }
+} \ No newline at end of file
diff --git a/morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java b/morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java
new file mode 100644
index 0000000..22c84c3
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java
@@ -0,0 +1,193 @@
+package morfologik.tools;
+
+import java.io.*;
+import java.text.MessageFormat;
+import java.util.List;
+import java.util.Locale;
+
+import morfologik.stemming.*;
+
+import org.apache.commons.cli.*;
+
+/**
+ * This utility parses input text, tokenizes it on whitespace and stems input
+ * words, writing them to the output in column-based format:
+ *
+ * <pre>
+ * word stem form
+ * word stem form
+ * </pre>
+ *
+ * Words for which no stems or forms are available have empty values in each
+ * respective column. Columns are tab-delimited.
+ */
+class PolishStemmingTool extends Tool {
+    /**
+     * Resolves the input and output encodings, opens both streams, runs
+     * {@link #process(Reader, Writer)} and prints a short throughput summary
+     * to standard output.
+     *
+     * @param line Parsed command line.
+     */
+    protected void go(CommandLine line) throws Exception {
+        // Determine input/ output encoding.
+        final String inputEncoding = getEncodingOption(line,
+                SharedOptions.inputEncodingOption.getOpt());
+
+        final String outputEncoding = getEncodingOption(line,
+                SharedOptions.outputEncodingOption.getOpt());
+
+        System.err.println("Input encoding: " + inputEncoding);
+        System.err.println("Output encoding: " + outputEncoding);
+
+        // Determine input and output streams. The nesting guarantees that each
+        // stream is closed even if opening or closing the other one fails.
+        final Reader input = initializeInput(line, inputEncoding);
+        try {
+            final Writer output = initializeOutput(line, outputEncoding);
+            try {
+                final long start = System.currentTimeMillis();
+                final long count = process(input, output);
+
+                output.flush();
+
+                final long millis = System.currentTimeMillis() - start;
+                final double time = millis / 1000.0;
+                // Guard against division by zero for sub-millisecond runs.
+                final double wordsPerSec = time > 0 ? (count / time)
+                        : Double.POSITIVE_INFINITY;
+                System.out
+                        .println(new MessageFormat(
+                                "Processed {0} words in {1,number,#.###} seconds ({2,number,#} words per second).",
+                                Locale.ENGLISH).format(new Object[] {
+                                Long.valueOf(count), Double.valueOf(time),
+                                Double.valueOf(wordsPerSec) }));
+            } finally {
+                output.close();
+            }
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Reads tokens from the input with a {@link StreamTokenizer} (the plus sign
+     * is treated as a word character) and, for every word token, writes one
+     * tab-delimited line per stemmer result: word, stem, tag. Words without any
+     * result get a single line with dashes in the stem and tag columns.
+     *
+     * @return Returns the number of processed words.
+     */
+    protected long process(Reader input, Writer output) throws IOException {
+        final IStemmer stemmer = new PolishStemmer();
+
+        final StreamTokenizer tokenizer = new StreamTokenizer(input);
+        tokenizer.eolIsSignificant(false);
+        tokenizer.wordChars('+', '+');
+
+        long words = 0;
+        for (int type = tokenizer.nextToken();
+             type != StreamTokenizer.TT_EOF;
+             type = tokenizer.nextToken()) {
+            // Only word tokens are stemmed and counted.
+            if (type != StreamTokenizer.TT_WORD) {
+                continue;
+            }
+
+            final String word = tokenizer.sval;
+            words++;
+
+            final List<WordData> analyses = stemmer.lookup(word);
+            if (analyses.isEmpty()) {
+                output.write(word);
+                output.write("\t-\t-\n");
+                continue;
+            }
+
+            for (WordData analysis : analyses) {
+                output.write(word);
+                output.write("\t");
+                output.write(asString(analysis.getStem()));
+                output.write("\t");
+                output.write(asString(analysis.getTag()));
+                output.write("\n");
+            }
+        }
+
+        return words;
+    }
+
+    /**
+     * Converts a possibly missing value to its output form: a dash for
+     * <code>null</code>, the value's string form otherwise.
+     */
+    private String asString(CharSequence stem) {
+        return stem == null ? "-" : stem.toString();
+    }
+
+ /**
+ * Command line options for the tool.
+ *
+ * <p>Registers the input/output file options and the input/output encoding
+ * options.</p>
+ *
+ * @param options Option set to add this tool's options to.
+ */
+ protected void initializeOptions(Options options) {
+ options.addOption(SharedOptions.inputFileOption);
+ options.addOption(SharedOptions.inputEncodingOption);
+ options.addOption(SharedOptions.outputFileOption);
+ options.addOption(SharedOptions.outputEncodingOption);
+ }
+
+    /**
+     * Opens the output writer using the given encoding: the file given with the
+     * output option (buffered), or standard output when the option is absent.
+     */
+    private Writer initializeOutput(CommandLine line, String outputEncoding)
+        throws IOException, ParseException {
+        final String opt = SharedOptions.outputFileOption.getOpt();
+        if (!line.hasOption(opt)) {
+            // Use standard output.
+            System.err.println("Using standard output for output.");
+            return new OutputStreamWriter(System.out, outputEncoding);
+        }
+
+        // Use output file.
+        final File target = (File) line.getParsedOptionValue(opt);
+        return new OutputStreamWriter(
+                new BufferedOutputStream(new FileOutputStream(target)),
+                outputEncoding);
+    }
+
+    /**
+     * Opens the input reader using the given encoding: the file given with the
+     * input option (buffered), or standard input when the option is absent.
+     */
+    private Reader initializeInput(CommandLine line, String inputEncoding)
+        throws IOException, ParseException {
+        final String opt = SharedOptions.inputFileOption.getOpt();
+        if (!line.hasOption(opt)) {
+            // Use standard input.
+            System.err.println("Using standard input for input.");
+            return new InputStreamReader(System.in, inputEncoding);
+        }
+
+        // Use input file.
+        final File source = (File) line.getParsedOptionValue(opt);
+        return new InputStreamReader(
+                new BufferedInputStream(new FileInputStream(source)),
+                inputEncoding);
+    }
+
+    /**
+     * Returns the encoding given with the option <code>opt</code>, falling back
+     * to the <code>file.encoding</code> system property (or
+     * <code>iso-8859-1</code> if that is unset) when the option is absent.
+     */
+    private String getEncodingOption(CommandLine line, String opt) {
+        if (line.hasOption(opt)) {
+            return line.getOptionValue(opt);
+        }
+        return System.getProperty("file.encoding", "iso-8859-1");
+    }
+
+    /*
+     * Check if the dictionary is available: the tool is usable only if a
+     * stemmer instance can be created without any error.
+     */
+    @Override
+    protected boolean isAvailable() {
+        try {
+            new PolishStemmer();
+            return true;
+        } catch (Throwable t) {
+            return false;
+        }
+    }
+
+    /**
+     * Command line entry point: creates the tool and hands it the raw arguments.
+     */
+    public static void main(String[] args) throws Exception {
+        new PolishStemmingTool().go(args);
+    }
+} \ No newline at end of file
diff --git a/morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java b/morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java
new file mode 100644
index 0000000..e5fd388
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java
@@ -0,0 +1,46 @@
+package morfologik.tools;
+
+import morfologik.fsa.FSA5;
+import morfologik.tools.SequenceEncoders.IEncoder;
+
+import com.carrotsearch.hppc.ByteArrayList;
+
+final class SequenceAssembler { // Assembles one output entry: word form, separator, encoded lemma, separator, optional tag.
+ private final byte annotationSeparator; // Byte placed between the fields of an assembled entry.
+
+ private final ByteArrayList src = new ByteArrayList(); // Scratch copy of the word form handed to the encoder.
+ private final ByteArrayList dst = new ByteArrayList(); // Scratch copy of the lemma handed to the encoder.
+ private final ByteArrayList tmp = new ByteArrayList(); // Output buffer, reset on every encode() call.
+
+ private final IEncoder encoder;
+
+ public SequenceAssembler(SequenceEncoders.IEncoder encoder) { // Uses FSA5.DEFAULT_ANNOTATION as the separator.
+ this(encoder, FSA5.DEFAULT_ANNOTATION);
+ }
+
+ public SequenceAssembler(SequenceEncoders.IEncoder encoder, byte annotationSeparator) {
+ this.annotationSeparator = annotationSeparator;
+ this.encoder = encoder;
+ }
+
+ byte [] encode(byte [] wordForm, byte [] wordLemma, byte [] wordTag) // wordTag may be null; the trailing separator is still written.
+ {
+ src.clear();
+ dst.clear();
+ tmp.clear();
+
+ tmp.add(wordForm);
+ tmp.add(annotationSeparator);
+
+ src.add(wordForm);
+ dst.add(wordLemma);
+ encoder.encode(src, dst, tmp); // Appends the lemma, encoded relative to the word form, to tmp.
+
+ tmp.add(annotationSeparator);
+ if (wordTag != null) {
+ tmp.add(wordTag);
+ }
+
+ return tmp.toArray();
+ }
+}
diff --git a/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java
new file mode 100644
index 0000000..37cd0cc
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java
@@ -0,0 +1,361 @@
+package morfologik.tools;
+
+import morfologik.stemming.EncoderType;
+
+import com.carrotsearch.hppc.ByteArrayList;
+
+/**
+ * Container class for sequence encoders.
+ */
+public final class SequenceEncoders {
+ private SequenceEncoders() {}
+
+ /**
+ * Maximum encodable single-byte code.
+ */
+ private static final int REMOVE_EVERYTHING = 255;
+
+ public static interface IEncoder { // Contract shared by all sequence encoders in this container class.
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer); // Implementations here append the encoding of derived (relative to src) to encodedBuffer and return that buffer.
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer); // Implementations here append the sequence rebuilt from src and encoded to derivedBuffer and return that buffer.
+ public EncoderType type(); // The EncoderType constant this encoder corresponds to.
+ }
+
+ /**
+ * Encodes <code>dst</code> relative to <code>src</code> by trimming
+ * whatever non-equal suffix <code>src</code> has. The output code is (bytes):
+ * <pre>
+ * {K}{suffix}
+ * </pre>
+ * where (<code>K</code> - 'A') bytes should be trimmed from the end of <code>src</code>
+ * and then the <code>suffix</code> should be appended to the resulting byte sequence.
+ *
+ * <p>Examples:</p>
+ * <pre>
+ * src: foo
+ * dst: foobar
+ * encoded: Abar
+ *
+ * src: foo
+ * dst: bar
+ * encoded: Dbar
+ * </pre>
+ *
+ * <p><strong>Note:</strong> The code length is a single byte. If equal to
+ * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence
+ * should be discarded.</p>
+ */
+ public static class TrimSuffixEncoder implements IEncoder {
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) {
+ int sharedPrefix = sharedPrefixLength(src, dst);
+ int truncateBytes = src.size() - sharedPrefix; // Number of bytes to cut from the end of src.
+ if (truncateBytes >= REMOVE_EVERYTHING) { // Not encodable: signal "discard src" and emit dst in full.
+ truncateBytes = REMOVE_EVERYTHING;
+ sharedPrefix = 0;
+ }
+
+ final byte suffixTrimCode = (byte) (truncateBytes + 'A'); // Counts are stored offset by 'A'.
+ encoded.add(suffixTrimCode);
+ encoded.add(dst.buffer, sharedPrefix, dst.size() - sharedPrefix); // The part of dst after the shared prefix.
+
+ return encoded;
+ }
+
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) {
+ int suffixTrimCode = encoded.get(0);
+ int truncateBytes = (suffixTrimCode - 'A') & 0xFF; // Masking maps the signed byte back to the 0..255 count.
+ if (truncateBytes == REMOVE_EVERYTHING) {
+ truncateBytes = src.size();
+ }
+
+ dst.add(src.buffer, 0, src.size() - truncateBytes); // Kept prefix of src.
+ dst.add(encoded.buffer, 1, encoded.size() - 1); // Suffix stored after the code byte.
+
+ return dst;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.SUFFIX;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+ /**
+ * Encodes <code>dst</code> relative to <code>src</code> by trimming
+ * whatever non-equal suffix and prefix <code>src</code> and <code>dst</code> have.
+ * The output code is (bytes):
+ * <pre>
+ * {P}{K}{suffix}
+ * </pre>
+ * where (<code>P</code> - 'A') bytes should be trimmed from the start of <code>src</code>,
+ * (<code>K</code> - 'A') bytes should be trimmed from the end of <code>src</code>
+ * and then the <code>suffix</code> should be appended to the resulting byte sequence.
+ *
+ * <p>Examples:</p>
+ * <pre>
+ * src: abc
+ * dst: abcd
+ * encoded: AAd
+ *
+ * src: abc
+ * dst: xyz
+ * encoded: ADxyz
+ * </pre>
+ *
+ * <p><strong>Note:</strong> Each code's length is a single byte. If any is equal to
+ * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence
+ * should be discarded.</p>
+ */
+ public static class TrimPrefixAndSuffixEncoder implements IEncoder {
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) {
+ // Search for the maximum matching subsequence that can be encoded.
+ int maxSubsequenceLength = 0;
+ int maxSubsequenceIndex = 0;
+ for (int i = 0; i < src.size(); i++) {
+ // prefix at i => shared subsequence (infix)
+ int sharedPrefix = sharedPrefixLength(src, i, dst, 0); // Length of the match between src starting at i and the start of dst.
+ // Only update maxSubsequenceLength if we will be able to encode it.
+ if (sharedPrefix > maxSubsequenceLength
+ && i < REMOVE_EVERYTHING
+ && (src.size() - (i + sharedPrefix)) < REMOVE_EVERYTHING) {
+ maxSubsequenceLength = sharedPrefix;
+ maxSubsequenceIndex = i;
+ }
+ }
+
+ // Determine how much to remove (and where) from src to get a prefix of dst.
+ int truncatePrefixBytes = maxSubsequenceIndex;
+ int truncateSuffixBytes = (src.size() - (maxSubsequenceIndex + maxSubsequenceLength));
+ if (truncatePrefixBytes >= REMOVE_EVERYTHING ||
+ truncateSuffixBytes >= REMOVE_EVERYTHING) { // Not encodable: signal "discard src" and emit dst in full.
+ maxSubsequenceIndex = maxSubsequenceLength = 0;
+ truncatePrefixBytes = truncateSuffixBytes = REMOVE_EVERYTHING;
+ }
+
+ encoded.add((byte) ((truncatePrefixBytes + 'A') & 0xFF)); // Counts are stored offset by 'A'.
+ encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF));
+ encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); // The part of dst after the matched subsequence.
+
+ return encoded;
+ }
+
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) {
+ int truncatePrefixBytes = (encoded.get(0) - 'A') & 0xFF; // Masking maps the signed byte back to the 0..255 count.
+ int truncateSuffixBytes = (encoded.get(1) - 'A') & 0xFF;
+
+ if (truncatePrefixBytes == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) { // Discard src entirely: skip all of it as "prefix".
+ truncatePrefixBytes = src.size();
+ truncateSuffixBytes = 0;
+ }
+
+ dst.add(src.buffer, truncatePrefixBytes, src.size() - (truncateSuffixBytes + truncatePrefixBytes)); // Middle of src left after trimming both ends.
+ dst.add(encoded.buffer, 2, encoded.size() - 2); // Suffix stored after the two code bytes.
+
+ return dst;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.PREFIX;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+ /**
+ * Encodes <code>dst</code> relative to <code>src</code> by trimming
+ * whatever non-equal suffix and infix <code>src</code> and <code>dst</code> have.
+ * The output code is (bytes):
+ * <pre>
+ * {X}{L}{K}{suffix}
+ * </pre>
+ * where <code>src's</code> infix at position (<code>X</code> - 'A') and of length
+ * (<code>L</code> - 'A') should be removed, then (<code>K</code> - 'A') bytes
+ * should be trimmed from the end
+ * and then the <code>suffix</code> should be appended to the resulting byte sequence.
+ *
+ * <p>Examples:</p>
+ * <pre>
+ * src: ayz
+ * dst: abc
+ * encoded: AACbc
+ *
+ * src: aillent
+ * dst: aller
+ * encoded: BBCr
+ * </pre>
+ *
+ * <p><strong>Note:</strong> Each code's length is a single byte. If any is equal to
+ * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire <code>src</code> sequence
+ * should be discarded.</p>
+ */
+ public static class TrimInfixAndSuffixEncoder implements IEncoder {
+ ByteArrayList scratch = new ByteArrayList(); // Reused by encode() to hold src with a candidate infix removed. NOTE(review): shared mutable state, so one instance looks unsafe for concurrent encode() calls - confirm.
+
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) {
+ // Search for the infix that can we can encode and remove from src
+ // to get a maximum-length prefix of dst. This could be done more efficiently
+ // by running a smarter longest-common-subsequence algorithm and some pruning (?).
+ //
+ // For now, naive loop should do.
+
+ // There can be only two positions for the infix to delete:
+ // 1) we remove leading bytes, even if they are partially matching (but a longer match
+ // exists somewhere later on).
+ // 2) we leave max. matching prefix and remove non-matching bytes that follow.
+ int maxInfixIndex = 0;
+ int maxSubsequenceLength = sharedPrefixLength(src, dst); // Baseline: the match obtained without removing any infix.
+ int maxInfixLength = 0;
+ for (int i : new int [] {0, maxSubsequenceLength}) { // i is the candidate infix start, j its length.
+ for (int j = 1; j <= src.size() - i; j++) {
+ // Compute temporary src with the infix removed.
+ // Concatenate in scratch space for simplicity.
+ scratch.clear();
+ scratch.add(src.buffer, 0, i);
+ scratch.add(src.buffer, i + j, src.size() - (i + j));
+
+ int sharedPrefix = sharedPrefixLength(scratch, dst);
+
+ // Only update maxSubsequenceLength if we will be able to encode it.
+ if (sharedPrefix > 0 &&
+ sharedPrefix > maxSubsequenceLength &&
+ i < REMOVE_EVERYTHING &&
+ j < REMOVE_EVERYTHING) {
+ maxSubsequenceLength = sharedPrefix;
+ maxInfixIndex = i;
+ maxInfixLength = j;
+ }
+ }
+ }
+
+ int truncateSuffixBytes = src.size() - (maxInfixLength + maxSubsequenceLength); // What remains of src after the infix and the matched part.
+
+ // Special case: if we're removing the suffix in the infix code, move it
+ // to the suffix code instead.
+ if (truncateSuffixBytes == 0 &&
+ maxInfixIndex + maxInfixLength == src.size()) {
+ truncateSuffixBytes = maxInfixLength;
+ maxInfixIndex = maxInfixLength = 0;
+ }
+
+
+ if (maxInfixIndex >= REMOVE_EVERYTHING ||
+ maxInfixLength >= REMOVE_EVERYTHING ||
+ truncateSuffixBytes >= REMOVE_EVERYTHING) { // Not encodable: signal "discard src" and emit dst in full.
+ maxInfixIndex = maxSubsequenceLength = 0;
+ maxInfixLength = truncateSuffixBytes = REMOVE_EVERYTHING;
+ }
+
+ encoded.add((byte) ((maxInfixIndex + 'A') & 0xFF)); // All three codes are stored offset by 'A'.
+ encoded.add((byte) ((maxInfixLength + 'A') & 0xFF));
+ encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF));
+ encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); // The part of dst after the matched prefix.
+
+ return encoded;
+ }
+
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) {
+ int infixIndex = (encoded.get(0) - 'A') & 0xFF; // Masking maps the signed byte back to the 0..255 value.
+ int infixLength = (encoded.get(1) - 'A') & 0xFF;
+ int truncateSuffixBytes = (encoded.get(2) - 'A') & 0xFF;
+
+ if (infixLength == REMOVE_EVERYTHING ||
+ truncateSuffixBytes == REMOVE_EVERYTHING) { // Discard src entirely: treat all of it as the infix.
+ infixIndex = 0;
+ infixLength = src.size();
+ truncateSuffixBytes = 0;
+ }
+
+ dst.add(src.buffer, 0, infixIndex); // Part of src before the infix.
+ dst.add(src.buffer, infixIndex + infixLength, src.size() - (infixIndex + infixLength + truncateSuffixBytes)); // Part of src after the infix, minus the trimmed suffix.
+ dst.add(encoded.buffer, 3, encoded.size() - 3); // Suffix stored after the three code bytes.
+
+ return dst;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.INFIX;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+ /**
+ * An encoder that performs no compression: the derived sequence is stored
+ * verbatim and <code>src</code> is not consulted in either direction.
+ */
+ public static class CopyEncoder implements IEncoder {
+ @Override
+ public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer)
+ {
+ encodedBuffer.add(derived.buffer, 0, derived.size()); // Append derived as-is.
+ return encodedBuffer;
+ }
+
+ @Override
+ public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer)
+ {
+ derivedBuffer.add(encoded.buffer, 0, encoded.size()); // Append encoded as-is.
+ return derivedBuffer;
+ }
+
+ @Override
+ public EncoderType type() {
+ return EncoderType.NONE;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName();
+ }
+ }
+
+    /**
+     * Compute the length of the shared prefix between two byte sequences,
+     * comparing both from their first element.
+     */
+    private static int sharedPrefixLength(ByteArrayList a, ByteArrayList b) {
+        final int limit = Math.min(a.size(), b.size());
+        for (int pos = 0; pos < limit; pos++) {
+            if (a.get(pos) != b.get(pos)) {
+                return pos;
+            }
+        }
+        return limit;
+    }
+
+    /**
+     * Compute the length of the shared run between two byte sequences, where
+     * the comparison starts at <code>aStart</code> in <code>a</code> and at
+     * <code>bStart</code> in <code>b</code>.
+     */
+    private static int sharedPrefixLength(ByteArrayList a, int aStart, ByteArrayList b, int bStart) {
+        int matched = 0;
+        while (aStart + matched < a.size()
+                && bStart + matched < b.size()
+                && a.get(aStart + matched) == b.get(bStart + matched)) {
+            matched++;
+        }
+        return matched;
+    }
+
+ public static IEncoder forType(EncoderType encType) // Factory: returns a new encoder instance for the given type.
+ {
+ switch (encType) {
+ case INFIX: return new TrimInfixAndSuffixEncoder();
+ case PREFIX: return new TrimPrefixAndSuffixEncoder();
+ case SUFFIX: return new TrimSuffixEncoder();
+ case NONE: return new CopyEncoder();
+ }
+ throw new RuntimeException("Unknown encoder: " + encType); // Reached only for a constant not listed above.
+ }
+}
diff --git a/morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java b/morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java
new file mode 100644
index 0000000..11b42aa
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java
@@ -0,0 +1,152 @@
+package morfologik.tools;
+
+import java.io.File;
+import java.util.Arrays;
+
+import morfologik.stemming.EncoderType;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+
+/**
+ * Options shared between tools.
+ *
+ * <p>Every option is a public static constant that individual tools register
+ * in their option sets. Options created with the no-argument
+ * <code>create()</code> are given a long name only; the others also get the
+ * short name passed to <code>create(String)</code>.</p>
+ *
+ * <p>The builder chains below call <code>OptionBuilder</code> methods on the
+ * value returned by the previous call, which is presumably why the
+ * <code>static-access</code> warning is suppressed for the whole class.</p>
+ */
+@SuppressWarnings("static-access")
+final class SharedOptions {
+ public final static Option fsaDictionaryFileOption = OptionBuilder
+ .hasArg()
+ .withArgName("file")
+ .withDescription("Path to the FSA dictionary.")
+ .withLongOpt("dictionary")
+ .withType(File.class)
+ .isRequired(true)
+ .create("d");
+
+ public final static Option decode = OptionBuilder
+ .withDescription("Decode prefix/ infix/ suffix forms (if available).")
+ .withLongOpt("decode")
+ .isRequired(false)
+ .create("x");
+
+ public final static Option dataOnly = OptionBuilder
+ .withDescription("Dump only raw FSA data.")
+ .withLongOpt("raw-data")
+ .isRequired(false)
+ .create("r");
+
+ public final static Option dot = OptionBuilder
+ .withDescription("Dump the automaton as graphviz DOT file.")
+ .withLongOpt("dot")
+ .isRequired(false)
+ .create();
+
+ public final static Option inputEncodingOption = OptionBuilder
+ .hasArg()
+ .withArgName("codepage")
+ .withDescription("Input stream encoding.")
+ .withLongOpt("input-encoding")
+ .isRequired(false)
+ .create("ie");
+
+ public final static Option outputEncodingOption = OptionBuilder
+ .hasArg()
+ .withArgName("codepage")
+ .withDescription("Output stream encoding.")
+ .withLongOpt("output-encoding")
+ .isRequired(false)
+ .create("oe");
+
+ public final static Option inputFileOption = OptionBuilder
+ .hasArg()
+ .withArgName("file")
+ .withDescription("Input file. If missing, standard input is used.")
+ .withLongOpt("input")
+ .withType(File.class)
+ .isRequired(false)
+ .create("i");
+
+ public final static Option outputFileOption = OptionBuilder
+ .hasArg()
+ .withArgName("file")
+ .withDescription("Output file. If missing, standard output is used.")
+ .withLongOpt("output")
+ .withType(File.class)
+ .isRequired(false)
+ .create("o");
+
+ public final static Option outputFormatOption = OptionBuilder
+ .hasArg()
+ .withArgName("format")
+ .withDescription("Name of the binary output format. Allowed values: " + Arrays.toString(FSABuildTool.Format.values()))
+ .withLongOpt("format")
+ .isRequired(false)
+ .create("f");
+
+ public final static Option fillerCharacterOption = OptionBuilder
+ .hasArg()
+ .withArgName("char")
+ .withDescription("Custom filler character")
+ .isRequired(false)
+ .withLongOpt("filler")
+ .create();
+
+ public final static Option annotationSeparatorCharacterOption = OptionBuilder
+ .hasArg()
+ .withArgName("char")
+ .withDescription("Custom annotation separator character")
+ .isRequired(false)
+ .withLongOpt("annotation")
+ .create();
+
+ public final static Option withNumbersOption = OptionBuilder
+ .withDescription("Include numbers required for perfect hashing (larger automaton)")
+ .isRequired(false)
+ .withLongOpt("with-numbers")
+ .create("n");
+
+ public final static Option progressOption = OptionBuilder
+ .withDescription("Print more verbose progress information")
+ .isRequired(false)
+ .withLongOpt("progress")
+ .create();
+
+ public final static Option inputSortedOption = OptionBuilder
+ .withDescription("Assume the input is already sorted using C-sort (builds FSA directly, no in-memory sorting)")
+ .isRequired(false)
+ .withLongOpt("sorted")
+ .create();
+
+ public final static Option encoder = OptionBuilder
+ .withDescription("Encoder used for compressing inflected forms. Any of: "
+ + Arrays.toString(EncoderType.values()))
+ .withLongOpt("encoder")
+ .hasArg(true)
+ .withArgName("name")
+ .isRequired(false)
+ .create("e");
+
+ public final static Option noWarnIfTwoFields = OptionBuilder
+ .withDescription("Suppress warning for lines with only two fields (for stemming dictionaries)")
+ .withLongOpt("nowarn")
+ .isRequired(false)
+ .create("nw");
+
+ public final static Option statistics = OptionBuilder
+ .withDescription("Print extra statistics.")
+ .isRequired(false)
+ .withLongOpt("stats")
+ .create();
+
+ public final static Option help = OptionBuilder
+ .withDescription("Help on available options.")
+ .withLongOpt("help")
+ .isRequired(false)
+ .create();
+
+ /**
+ * No instances. Use static fields.
+ */
+ private SharedOptions() {
+ // empty
+ }
+}
diff --git a/morfologik-tools/src/main/java/morfologik/tools/Tool.java b/morfologik-tools/src/main/java/morfologik/tools/Tool.java
new file mode 100644
index 0000000..27dac3f
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/Tool.java
@@ -0,0 +1,102 @@
+package morfologik.tools;
+
+import org.apache.commons.cli.*;
+
+/**
+ * Base class for command-line applications.
+ *
+ * <p>Subclasses register their options in {@link #initializeOptions(Options)}
+ * and implement the actual work in {@link #go(CommandLine)}; the shared
+ * {@link #go(String[])} entry point takes care of parsing, help and error
+ * reporting.</p>
+ */
+abstract class Tool {
+    /** Command line options. */
+    protected final Options options = new Options();
+
+    /**
+     * Initializes application context.
+     *
+     * <p>Registers the help option and the subclass options, parses the
+     * arguments and invokes {@link #go(CommandLine)}. Parse problems and any
+     * error thrown by the tool are reported on standard error; nothing is
+     * propagated to the caller.</p>
+     *
+     * @param args Raw command line arguments.
+     */
+    protected final void go(String[] args) {
+        options.addOption(SharedOptions.help);
+        initializeOptions(options);
+
+        // Commons-cli is pretty dumb in terms of option parsing because it
+        // validates immediately and there is no way to determine
+        // if an option exists without bailing out with an exception. This
+        // is a hardcoded workaround for --help
+        for (String arg : args) {
+            if ("--help".equals(arg)) {
+                printUsage();
+                return;
+            }
+        }
+
+        final Parser parser = new GnuParser();
+        final CommandLine line;
+        try {
+            line = parser.parse(options, args);
+            if (line.hasOption(SharedOptions.help.getLongOpt())) {
+                printUsage();
+                return;
+            }
+            if (line.getArgList().size() > 0) {
+                printError("Unrecognized left over command line arguments: "
+                        + line.getArgList());
+                return;
+            }
+
+            try {
+                go(line);
+            } catch (Throwable e) {
+                // Top-level boundary: report everything the tool throws.
+                printError("Unhandled program error occurred.", e);
+            }
+        } catch (MissingArgumentException e) {
+            printError("Provide the required argument for option: "
+                    + e.getMessage());
+        } catch (MissingOptionException e) {
+            printError("Provide the required option: " + e.getMessage());
+        } catch (UnrecognizedOptionException e) {
+            printError(e.getMessage());
+        } catch (ParseException e) {
+            printError("Could not parse command line: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Print an error and an associated exception.
+     *
+     * @param msg Message to print.
+     * @param t Exception whose stack trace is printed to standard error.
+     */
+    protected void printError(String msg, Throwable t) {
+        printError(msg);
+        t.printStackTrace(System.err);
+    }
+
+    /**
+     * Print an error without an exception.
+     *
+     * @param msg Message to print, followed by a hint about <code>--help</code>.
+     */
+    protected void printError(String msg) {
+        System.err.println();
+        System.err.println(msg);
+        System.err.println("Invoke with '--help' for help.");
+    }
+
+    /**
+     * Prints usage (options).
+     */
+    protected void printUsage() {
+        final HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp(this.getClass().getName(), options, true);
+    }
+
+    /**
+     * Override and write your stuff using command line options.
+     *
+     * @param line Successfully parsed command line.
+     */
+    protected abstract void go(CommandLine line) throws Exception;
+
+    /**
+     * Override and initialize options.
+     *
+     * @param options Option set to add tool-specific options to.
+     */
+    protected abstract void initializeOptions(Options options);
+
+    /**
+     * Is the tool available? <code>true</code> by default.
+     */
+    protected boolean isAvailable() {
+        return true;
+    }
+}
diff --git a/morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java b/morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java
new file mode 100644
index 0000000..5caee57
--- /dev/null
+++ b/morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java
@@ -0,0 +1,125 @@
+package morfologik.tools;
+
+import java.io.PrintWriter;
+import java.util.*;
+
+import morfologik.fsa.IMessageLogger;
+
+/**
+ * An {@link IMessageLogger} that prints messages and timed "parts" to the {@link PrintWriter} given at construction, flushing after each write.
+ */
+public class WriterMessageLogger implements IMessageLogger {
+    /**
+     * Timestamp taken when this class is initialized; {@link #endPart()} reports the total elapsed time relative to it.
+     */
+    private static final long world = System.currentTimeMillis();
+
+    /**
+     * A single part: name, start timestamp (milliseconds).
+     */
+    private static class Part {
+        final String name;
+        final long start;
+
+        Part(String name, long start) {
+            this.name = name;
+            this.start = start;
+        }
+    }
+
+    /**
+     * <code>true</code> while a part header has been printed and its line is still open (no line break written yet).
+     */
+    private boolean indent;
+
+    /**
+     * Active parts, the most recently started one last.
+     */
+    private final ArrayDeque<Part> parts = new ArrayDeque<Part>();
+
+    /**
+     * Output writer; everything this logger prints goes here.
+     */
+    private final PrintWriter writer;
+
+    /**
+     * Creates a logger that writes to <code>w</code>.
+     */
+    public WriterMessageLogger(PrintWriter w) {
+        this.writer = w;
+    }
+
+    /*
+     * Prints msg on a line of its own, closing a pending part header line first.
+     */
+    @Override
+    public void log(String msg) {
+        cancelIndent();
+
+        writer.println(msg);
+        writer.flush();
+    }
+
+    /*
+     * Prints a left-aligned header and a right-aligned value; Integer and Long values are printed with grouping separators.
+     */
+    @Override
+    public void log(String header, Object v) {
+        cancelIndent();
+
+        if (v instanceof Integer || v instanceof Long) {
+            writer.println(String.format(Locale.ENGLISH, "%-30s %,11d", header, v));
+        } else { // String.valueOf: a null value is printed as "null" instead of throwing
+            writer.println(String.format(Locale.ENGLISH, "%-30s %11s", header, String.valueOf(v)));
+        }
+        writer.flush();
+    }
+
+    /*
+     * Records a new part starting now and prints its header, leaving the line open for the timing written by endPart().
+     */
+    @Override
+    public void startPart(String header) {
+        cancelIndent();
+
+        Part p = new Part(header, System.currentTimeMillis());
+        parts.addLast(p);
+
+        writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "..."));
+        writer.flush();
+
+        indent = true;
+    }
+
+    /*
+     * Closes the most recently started part: prints its duration and the time elapsed since class initialization.
+     */
+    @Override
+    public void endPart() {
+        long now = System.currentTimeMillis();
+        Part p = parts.removeLast();
+
+        if (!indent) { // the header line was already terminated by other output, so repeat the part name
+            writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "..."));
+        }
+
+        writer.println(
+            String.format(Locale.ENGLISH, "%13.2f sec. [%6.2f sec.]",
+                (now - p.start) / 1000.0,
+                (now - world) / 1000.0));
+        writer.flush();
+
+        indent = false;
+    }
+
+    /*
+     * Terminates the line left open by startPart(), if any, so that the next output starts on a fresh line.
+     */
+    private void cancelIndent() {
+        if (indent) {
+            writer.println(); // same destination as all other output; every caller flushes right after its own write
+        }
+
+        indent = false;
+    }
+}