From 991420c6b1ebc1f041818ffeaee65e5dcaa1581d Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Wed, 26 Dec 2018 20:22:36 +0100 Subject: Import Upstream version 1.9.0+dfsg --- .classpath | 11 - .gitignore | 33 +- .project | 17 - ACKNOWLEDGMENT | 6 - CHANGES | 450 ++++++-- CONTRIBUTOR | 62 + LICENSE | 29 - README | 57 +- TODO | 21 - build.xml | 219 ---- doc/api/allclasses-frame.html | 103 -- doc/api/allclasses-noframe.html | 103 -- doc/api/constant-values.html | 446 ------- doc/api/deprecated-list.html | 144 --- doc/api/help-doc.html | 217 ---- doc/api/index-all.html | 1211 -------------------- doc/api/index.html | 39 - doc/api/morfologik/fsa/CFSA.html | 871 -------------- doc/api/morfologik/fsa/CFSA2.html | 779 ------------- doc/api/morfologik/fsa/CFSA2Serializer.html | 414 ------- doc/api/morfologik/fsa/ConstantArcSizeFSA.html | 654 ----------- doc/api/morfologik/fsa/FSA.html | 855 -------------- doc/api/morfologik/fsa/FSA5.html | 887 -------------- doc/api/morfologik/fsa/FSA5Serializer.html | 468 -------- doc/api/morfologik/fsa/FSABuilder.InfoEntry.html | 431 ------- doc/api/morfologik/fsa/FSABuilder.html | 450 -------- doc/api/morfologik/fsa/FSAFinalStatesIterator.html | 339 ------ doc/api/morfologik/fsa/FSAFlags.html | 484 -------- doc/api/morfologik/fsa/FSAInfo.html | 399 ------- doc/api/morfologik/fsa/FSASerializer.html | 335 ------ doc/api/morfologik/fsa/FSATraversal.html | 394 ------- doc/api/morfologik/fsa/FSAUtils.IntIntHolder.html | 295 ----- doc/api/morfologik/fsa/FSAUtils.html | 379 ------ doc/api/morfologik/fsa/MatchResult.html | 397 ------- doc/api/morfologik/fsa/StateVisitor.html | 211 ---- doc/api/morfologik/fsa/package-frame.html | 84 -- doc/api/morfologik/fsa/package-summary.html | 251 ---- doc/api/morfologik/fsa/package-tree.html | 172 --- doc/api/morfologik/stemming/Dictionary.html | 489 -------- .../morfologik/stemming/DictionaryIterator.html | 310 ----- doc/api/morfologik/stemming/DictionaryLookup.html | 376 ------ 
.../morfologik/stemming/DictionaryMetadata.html | 437 ------- doc/api/morfologik/stemming/IStemmer.html | 220 ---- doc/api/morfologik/stemming/PolishStemmer.html | 302 ----- doc/api/morfologik/stemming/WordData.html | 447 -------- doc/api/morfologik/stemming/package-frame.html | 53 - doc/api/morfologik/stemming/package-summary.html | 190 --- doc/api/morfologik/stemming/package-tree.html | 160 --- doc/api/morfologik/tools/FSABuildTool.Format.html | 346 ------ doc/api/morfologik/tools/FSABuildTool.html | 522 --------- doc/api/morfologik/tools/FSADumpTool.html | 457 -------- doc/api/morfologik/tools/IMessageLogger.html | 272 ----- doc/api/morfologik/tools/InflectionFramesTool.html | 279 ----- doc/api/morfologik/tools/Launcher.html | 259 ----- doc/api/morfologik/tools/MorphEncoder.html | 584 ---------- doc/api/morfologik/tools/WriterMessageLogger.html | 337 ------ doc/api/morfologik/tools/package-frame.html | 64 -- doc/api/morfologik/tools/package-summary.html | 205 ---- doc/api/morfologik/tools/package-tree.html | 167 --- doc/api/morfologik/util/Arrays.html | 343 ------ doc/api/morfologik/util/BufferUtils.html | 273 ----- doc/api/morfologik/util/FileUtils.html | 424 ------- doc/api/morfologik/util/ResourceUtils.html | 228 ---- doc/api/morfologik/util/package-frame.html | 38 - doc/api/morfologik/util/package-summary.html | 167 --- doc/api/morfologik/util/package-tree.html | 151 --- doc/api/overview-frame.html | 48 - doc/api/overview-summary.html | 161 --- doc/api/overview-tree.html | 176 --- doc/api/package-list | 4 - doc/api/resources/inherit.gif | Bin 57 -> 0 bytes doc/api/stylesheet.css | 29 - lib/commons-cli-1.2.LICENSE | 202 ---- lib/commons-cli-1.2.jar | Bin 41123 -> 0 bytes lib/hppc-0.3.2.jar | Bin 981375 -> 0 bytes lib/hppc.LICENSE | 202 ---- lib/junit-4.7.jar | Bin 232354 -> 0 bytes lib/junit-benchmarks-0.1.0.jar | Bin 54998 -> 0 bytes lib/junit-benchmarks.LICENSE | 202 ---- lib/junit.LICENSE | 88 -- lib/thirdparty.LICENSE | 5 - licenses/commons-cli.LICENSE | 
202 ++++ licenses/commons-lang.LICENSE | 202 ++++ licenses/hppc.LICENSE | 202 ++++ licenses/morfologik-polish.LICENSE | 28 + morfologik-distribution/pom.xml | 112 ++ morfologik-distribution/src/main/assembly/bin.xml | 77 ++ morfologik-fsa/pom.xml | 38 + .../src/main/java/morfologik/fsa/CFSA.java | 364 ++++++ .../src/main/java/morfologik/fsa/CFSA2.java | 404 +++++++ .../main/java/morfologik/fsa/CFSA2Serializer.java | 543 +++++++++ .../java/morfologik/fsa/ConstantArcSizeFSA.java | 134 +++ .../src/main/java/morfologik/fsa/FSA.java | 286 +++++ .../src/main/java/morfologik/fsa/FSA5.java | 323 ++++++ .../main/java/morfologik/fsa/FSA5Serializer.java | 332 ++++++ .../src/main/java/morfologik/fsa/FSABuilder.java | 486 ++++++++ .../morfologik/fsa/FSAFinalStatesIterator.java | 154 +++ .../src/main/java/morfologik/fsa/FSAFlags.java | 64 ++ .../src/main/java/morfologik/fsa/FSAHeader.java | 52 + .../src/main/java/morfologik/fsa/FSAInfo.java | 156 +++ .../main/java/morfologik/fsa/FSASerializer.java | 43 + .../src/main/java/morfologik/fsa/FSATraversal.java | 169 +++ .../src/main/java/morfologik/fsa/FSAUtils.java | 202 ++++ .../main/java/morfologik/fsa/IMessageLogger.java | 25 + .../src/main/java/morfologik/fsa/MatchResult.java | 86 ++ .../java/morfologik/fsa/NullMessageLogger.java | 22 + .../src/main/java/morfologik/fsa/StateVisitor.java | 11 + .../src/main/java/morfologik/util/Arrays.java | 68 ++ .../src/main/java/morfologik/util/BufferUtils.java | 54 + .../src/main/java/morfologik/util/FileUtils.java | 137 +++ .../main/java/morfologik/util/ResourceUtils.java | 58 + .../java/morfologik/fsa/CFSA2SerializerTest.java | 27 + .../java/morfologik/fsa/FSA5SerializerTest.java | 10 + .../src/test/java/morfologik/fsa/FSA5Test.java | 105 ++ .../test/java/morfologik/fsa/FSABuilderTest.java | 112 ++ .../src/test/java/morfologik/fsa/FSATestUtils.java | 179 +++ .../test/java/morfologik/fsa/FSATraversalTest.java | 160 +++ .../java/morfologik/fsa/SerializerTestBase.java | 256 +++++ 
.../src/test/java/morfologik/util/MinMax.java | 21 + .../test/resources/morfologik/fsa/abc-numbers.fsa | Bin 0 -> 29 bytes .../src/test/resources/morfologik/fsa/abc.fsa | Bin 0 -> 24 bytes .../src/test/resources/morfologik/fsa/abc.in | 6 + .../src/test/resources/morfologik/fsa/en_tst.dict | Bin 0 -> 1070678 bytes .../src/test/resources/morfologik/fsa/minimal.fsa | Bin 0 -> 24 bytes .../src/test/resources/morfologik/fsa/minimal.in | 3 + .../src/test/resources/morfologik/fsa/minimal2.fsa | Bin 0 -> 194 bytes .../src/test/resources/morfologik/fsa/minimal2.in | 24 + morfologik-polish/pom.xml | 58 + .../java/morfologik/stemming/PolishStemmer.java | 54 + .../resources/morfologik/dictionaries/pl.README-en | 67 ++ .../resources/morfologik/dictionaries/pl.README-pl | 141 +++ .../main/resources/morfologik/dictionaries/pl.dict | Bin 0 -> 2804243 bytes .../main/resources/morfologik/dictionaries/pl.info | 36 + .../java/morfologik/stemming/PerformanceTest.java | 73 ++ .../stemming/PolishMorfologikStemmerTest.java | 141 +++ morfologik-speller/pom.xml | 58 + .../src/main/java/morfologik/speller/HMatrix.java | 100 ++ .../src/main/java/morfologik/speller/Speller.java | 920 +++++++++++++++ .../test/java/morfologik/speller/HMatrixTest.java | 21 + .../test/java/morfologik/speller/SpellerTest.java | 272 +++++ .../morfologik/speller/dict-with-freq.dict | Bin 0 -> 162 bytes .../morfologik/speller/dict-with-freq.info | 15 + .../morfologik/speller/dict-with-freq.txt | 21 + .../test/resources/morfologik/speller/slownik.dict | Bin 0 -> 130 bytes .../test/resources/morfologik/speller/slownik.info | 14 + .../resources/morfologik/speller/test-infix.dict | Bin 0 -> 1859 bytes .../resources/morfologik/speller/test-infix.info | 10 + .../morfologik/speller/test-utf-spell.dict | Bin 0 -> 168 bytes .../morfologik/speller/test-utf-spell.info | 15 + .../morfologik/speller/test_freq_iso.dict | Bin 0 -> 129 bytes .../morfologik/speller/test_freq_iso.info | 16 + morfologik-stemming/pom.xml | 71 ++ 
.../java/morfologik/stemming/ArrayViewList.java | 111 ++ .../main/java/morfologik/stemming/Dictionary.java | 233 ++++ .../morfologik/stemming/DictionaryAttribute.java | 333 ++++++ .../morfologik/stemming/DictionaryIterator.java | 143 +++ .../java/morfologik/stemming/DictionaryLookup.java | 403 +++++++ .../morfologik/stemming/DictionaryMetadata.java | 298 +++++ .../stemming/DictionaryMetadataBuilder.java | 139 +++ .../main/java/morfologik/stemming/EncoderType.java | 11 + .../main/java/morfologik/stemming/IStemmer.java | 20 + .../main/java/morfologik/stemming/WordData.java | 254 ++++ .../morfologik/stemming/DictionaryLookupTest.java | 247 ++++ .../stemming/DictionaryMetadataBuilderTest.java | 49 + .../java/morfologik/stemming/DictionaryTest.java | 27 + .../stemming/StringDecoderBenchmarkTest.java | 62 + .../morfologik/stemming/test-diacritics-utf8.dict | Bin 0 -> 136 bytes .../morfologik/stemming/test-diacritics-utf8.info | 9 + .../resources/morfologik/stemming/test-infix.dict | Bin 0 -> 1859 bytes .../resources/morfologik/stemming/test-infix.info | 8 + .../resources/morfologik/stemming/test-prefix.dict | Bin 0 -> 1776 bytes .../resources/morfologik/stemming/test-prefix.info | 11 + .../morfologik/stemming/test-separators.dict | Bin 0 -> 155 bytes .../morfologik/stemming/test-separators.info | 10 + .../morfologik/stemming/test-separators.txt | 8 + .../resources/morfologik/stemming/test-synth.dict | Bin 0 -> 1354192 bytes .../resources/morfologik/stemming/test-synth.info | 6 + morfologik-tools/pom.xml | 159 +++ .../main/java/morfologik/tools/FSABuildTool.java | 541 +++++++++ .../main/java/morfologik/tools/FSADumpTool.java | 287 +++++ .../morfologik/tools/InflectionFramesTool.java | 112 ++ .../src/main/java/morfologik/tools/Launcher.java | 158 +++ .../java/morfologik/tools/MorphEncodingTool.java | 255 +++++ .../java/morfologik/tools/PolishStemmingTool.java | 193 ++++ .../java/morfologik/tools/SequenceAssembler.java | 46 + .../java/morfologik/tools/SequenceEncoders.java 
| 361 ++++++ .../main/java/morfologik/tools/SharedOptions.java | 152 +++ .../src/main/java/morfologik/tools/Tool.java | 102 ++ .../java/morfologik/tools/WriterMessageLogger.java | 125 ++ morfologik-tools/src/proguard/rules.pro | 16 + .../java/morfologik/tools/FSABuildToolTest.java | 53 + .../test/java/morfologik/tools/LauncherTest.java | 26 + .../morfologik/tools/MorphEncodingToolTest.java | 243 ++++ .../tools/SequenceEncodersRandomizedTest.java | 106 ++ .../tools/SequenceEncodersStaticTest.java | 96 ++ .../test/java/morfologik/tools/Text2FSA5Test.java | 37 + morfologik.LICENSE | 29 + pom.xml | 339 ++++++ src-test/morfologik/fsa/CFSA2SerializerTest.java | 27 - src-test/morfologik/fsa/FSA5SerializerTest.java | 10 - src-test/morfologik/fsa/FSA5Test.java | 117 -- src-test/morfologik/fsa/FSABuilderTest.java | 112 -- src-test/morfologik/fsa/FSATestUtils.java | 179 --- src-test/morfologik/fsa/FSATraversalTest.java | 160 --- src-test/morfologik/fsa/SerializerTestBase.java | 256 ----- src-test/morfologik/fsa/abc-numbers.fsa | Bin 29 -> 0 bytes src-test/morfologik/fsa/abc.fsa | Bin 24 -> 0 bytes src-test/morfologik/fsa/abc.in | 6 - src-test/morfologik/fsa/en_tst.dict | Bin 1070678 -> 0 bytes src-test/morfologik/fsa/minimal.fsa | Bin 24 -> 0 bytes src-test/morfologik/fsa/minimal.in | 3 - src-test/morfologik/fsa/minimal2.fsa | Bin 194 -> 0 bytes src-test/morfologik/fsa/minimal2.in | 24 - .../morfologik/stemming/DictionaryLookupTest.java | 250 ---- src-test/morfologik/stemming/PerformanceTest.java | 73 -- .../morfologik/stemming/PolishStemmerTest.java | 54 - .../stemming/StringDecoderBenchmarkTest.java | 62 - .../morfologik/stemming/test-diacritics-utf8.dict | Bin 136 -> 0 bytes .../morfologik/stemming/test-diacritics-utf8.info | 10 - src-test/morfologik/stemming/test-infix.dict | Bin 1859 -> 0 bytes src-test/morfologik/stemming/test-infix.info | 9 - src-test/morfologik/stemming/test-prefix.dict | Bin 1776 -> 0 bytes src-test/morfologik/stemming/test-prefix.info | 9 - 
src-test/morfologik/stemming/test-separators.dict | Bin 155 -> 0 bytes src-test/morfologik/stemming/test-separators.info | 9 - src-test/morfologik/stemming/test-separators.txt | 8 - src-test/morfologik/stemming/test-synth.dict | Bin 1354192 -> 0 bytes src-test/morfologik/stemming/test-synth.info | 6 - src-test/morfologik/tools/LauncherTest.java | 26 - src-test/morfologik/tools/MorphEncoderTest.java | 125 -- .../morfologik/tools/MorphEncodingToolTest.java | 110 -- src-test/morfologik/tools/Text2FSA5Test.java | 37 - src-test/morfologik/util/MinMax.java | 21 - src/morfologik/dictionaries/pl.LICENSE | 8 - src/morfologik/dictionaries/pl.dict | Bin 1806661 -> 0 bytes src/morfologik/dictionaries/pl.info | 13 - src/morfologik/fsa/CFSA.java | 364 ------ src/morfologik/fsa/CFSA2.java | 404 ------- src/morfologik/fsa/CFSA2Serializer.java | 536 --------- src/morfologik/fsa/ConstantArcSizeFSA.java | 134 --- src/morfologik/fsa/FSA.java | 270 ----- src/morfologik/fsa/FSA5.java | 323 ------ src/morfologik/fsa/FSA5Serializer.java | 334 ------ src/morfologik/fsa/FSABuilder.java | 486 -------- src/morfologik/fsa/FSAFinalStatesIterator.java | 154 --- src/morfologik/fsa/FSAFlags.java | 64 -- src/morfologik/fsa/FSAHeader.java | 52 - src/morfologik/fsa/FSAInfo.java | 157 --- src/morfologik/fsa/FSASerializer.java | 45 - src/morfologik/fsa/FSATraversal.java | 169 --- src/morfologik/fsa/FSAUtils.java | 202 ---- src/morfologik/fsa/MatchResult.java | 86 -- src/morfologik/fsa/NullMessageLogger.java | 24 - src/morfologik/fsa/StateVisitor.java | 11 - src/morfologik/stemming/ArrayViewList.java | 111 -- src/morfologik/stemming/Dictionary.java | 169 --- src/morfologik/stemming/DictionaryIterator.java | 144 --- src/morfologik/stemming/DictionaryLookup.java | 355 ------ src/morfologik/stemming/DictionaryMetadata.java | 122 -- src/morfologik/stemming/IStemmer.java | 20 - src/morfologik/stemming/PolishStemmer.java | 43 - src/morfologik/stemming/WordData.java | 247 ---- 
src/morfologik/tools/FSABuildTool.java | 486 -------- src/morfologik/tools/FSADumpTool.java | 286 ----- src/morfologik/tools/IMessageLogger.java | 25 - src/morfologik/tools/InflectionFramesTool.java | 118 -- src/morfologik/tools/Launcher.java | 159 --- src/morfologik/tools/MorphEncoder.java | 399 ------- src/morfologik/tools/MorphEncodingTool.java | 213 ---- src/morfologik/tools/PolishStemmingTool.java | 191 --- src/morfologik/tools/SharedOptions.java | 153 --- src/morfologik/tools/Tool.java | 84 -- src/morfologik/tools/WriterMessageLogger.java | 123 -- src/morfologik/util/Arrays.java | 68 -- src/morfologik/util/BufferUtils.java | 54 - src/morfologik/util/FileUtils.java | 137 --- src/morfologik/util/ResourceUtils.java | 58 - 277 files changed, 14328 insertions(+), 30768 deletions(-) delete mode 100644 .classpath delete mode 100644 .project delete mode 100644 ACKNOWLEDGMENT create mode 100644 CONTRIBUTOR delete mode 100644 LICENSE mode change 100644 => 100755 README delete mode 100644 build.xml delete mode 100644 doc/api/allclasses-frame.html delete mode 100644 doc/api/allclasses-noframe.html delete mode 100644 doc/api/constant-values.html delete mode 100644 doc/api/deprecated-list.html delete mode 100644 doc/api/help-doc.html delete mode 100644 doc/api/index-all.html delete mode 100644 doc/api/index.html delete mode 100644 doc/api/morfologik/fsa/CFSA.html delete mode 100644 doc/api/morfologik/fsa/CFSA2.html delete mode 100644 doc/api/morfologik/fsa/CFSA2Serializer.html delete mode 100644 doc/api/morfologik/fsa/ConstantArcSizeFSA.html delete mode 100644 doc/api/morfologik/fsa/FSA.html delete mode 100644 doc/api/morfologik/fsa/FSA5.html delete mode 100644 doc/api/morfologik/fsa/FSA5Serializer.html delete mode 100644 doc/api/morfologik/fsa/FSABuilder.InfoEntry.html delete mode 100644 doc/api/morfologik/fsa/FSABuilder.html delete mode 100644 doc/api/morfologik/fsa/FSAFinalStatesIterator.html delete mode 100644 doc/api/morfologik/fsa/FSAFlags.html delete mode 100644 
doc/api/morfologik/fsa/FSAInfo.html delete mode 100644 doc/api/morfologik/fsa/FSASerializer.html delete mode 100644 doc/api/morfologik/fsa/FSATraversal.html delete mode 100644 doc/api/morfologik/fsa/FSAUtils.IntIntHolder.html delete mode 100644 doc/api/morfologik/fsa/FSAUtils.html delete mode 100644 doc/api/morfologik/fsa/MatchResult.html delete mode 100644 doc/api/morfologik/fsa/StateVisitor.html delete mode 100644 doc/api/morfologik/fsa/package-frame.html delete mode 100644 doc/api/morfologik/fsa/package-summary.html delete mode 100644 doc/api/morfologik/fsa/package-tree.html delete mode 100644 doc/api/morfologik/stemming/Dictionary.html delete mode 100644 doc/api/morfologik/stemming/DictionaryIterator.html delete mode 100644 doc/api/morfologik/stemming/DictionaryLookup.html delete mode 100644 doc/api/morfologik/stemming/DictionaryMetadata.html delete mode 100644 doc/api/morfologik/stemming/IStemmer.html delete mode 100644 doc/api/morfologik/stemming/PolishStemmer.html delete mode 100644 doc/api/morfologik/stemming/WordData.html delete mode 100644 doc/api/morfologik/stemming/package-frame.html delete mode 100644 doc/api/morfologik/stemming/package-summary.html delete mode 100644 doc/api/morfologik/stemming/package-tree.html delete mode 100644 doc/api/morfologik/tools/FSABuildTool.Format.html delete mode 100644 doc/api/morfologik/tools/FSABuildTool.html delete mode 100644 doc/api/morfologik/tools/FSADumpTool.html delete mode 100644 doc/api/morfologik/tools/IMessageLogger.html delete mode 100644 doc/api/morfologik/tools/InflectionFramesTool.html delete mode 100644 doc/api/morfologik/tools/Launcher.html delete mode 100644 doc/api/morfologik/tools/MorphEncoder.html delete mode 100644 doc/api/morfologik/tools/WriterMessageLogger.html delete mode 100644 doc/api/morfologik/tools/package-frame.html delete mode 100644 doc/api/morfologik/tools/package-summary.html delete mode 100644 doc/api/morfologik/tools/package-tree.html delete mode 100644 
doc/api/morfologik/util/Arrays.html delete mode 100644 doc/api/morfologik/util/BufferUtils.html delete mode 100644 doc/api/morfologik/util/FileUtils.html delete mode 100644 doc/api/morfologik/util/ResourceUtils.html delete mode 100644 doc/api/morfologik/util/package-frame.html delete mode 100644 doc/api/morfologik/util/package-summary.html delete mode 100644 doc/api/morfologik/util/package-tree.html delete mode 100644 doc/api/overview-frame.html delete mode 100644 doc/api/overview-summary.html delete mode 100644 doc/api/overview-tree.html delete mode 100644 doc/api/package-list delete mode 100644 doc/api/resources/inherit.gif delete mode 100644 doc/api/stylesheet.css delete mode 100644 lib/commons-cli-1.2.LICENSE delete mode 100644 lib/commons-cli-1.2.jar delete mode 100644 lib/hppc-0.3.2.jar delete mode 100644 lib/hppc.LICENSE delete mode 100644 lib/junit-4.7.jar delete mode 100644 lib/junit-benchmarks-0.1.0.jar delete mode 100644 lib/junit-benchmarks.LICENSE delete mode 100644 lib/junit.LICENSE delete mode 100644 lib/thirdparty.LICENSE create mode 100644 licenses/commons-cli.LICENSE create mode 100644 licenses/commons-lang.LICENSE create mode 100644 licenses/hppc.LICENSE create mode 100644 licenses/morfologik-polish.LICENSE create mode 100644 morfologik-distribution/pom.xml create mode 100644 morfologik-distribution/src/main/assembly/bin.xml create mode 100644 morfologik-fsa/pom.xml create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSA.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java create mode 100644 
morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSAFlags.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSAHeader.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/NullMessageLogger.java create mode 100644 morfologik-fsa/src/main/java/morfologik/fsa/StateVisitor.java create mode 100644 morfologik-fsa/src/main/java/morfologik/util/Arrays.java create mode 100644 morfologik-fsa/src/main/java/morfologik/util/BufferUtils.java create mode 100644 morfologik-fsa/src/main/java/morfologik/util/FileUtils.java create mode 100644 morfologik-fsa/src/main/java/morfologik/util/ResourceUtils.java create mode 100644 morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java create mode 100644 morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java create mode 100644 morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java create mode 100644 morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java create mode 100644 morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java create mode 100644 morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java create mode 100644 morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java create mode 100644 morfologik-fsa/src/test/java/morfologik/util/MinMax.java create mode 100644 
morfologik-fsa/src/test/resources/morfologik/fsa/abc-numbers.fsa create mode 100644 morfologik-fsa/src/test/resources/morfologik/fsa/abc.fsa create mode 100644 morfologik-fsa/src/test/resources/morfologik/fsa/abc.in create mode 100644 morfologik-fsa/src/test/resources/morfologik/fsa/en_tst.dict create mode 100644 morfologik-fsa/src/test/resources/morfologik/fsa/minimal.fsa create mode 100644 morfologik-fsa/src/test/resources/morfologik/fsa/minimal.in create mode 100644 morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.fsa create mode 100644 morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.in create mode 100644 morfologik-polish/pom.xml create mode 100644 morfologik-polish/src/main/java/morfologik/stemming/PolishStemmer.java create mode 100644 morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-en create mode 100644 morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-pl create mode 100644 morfologik-polish/src/main/resources/morfologik/dictionaries/pl.dict create mode 100644 morfologik-polish/src/main/resources/morfologik/dictionaries/pl.info create mode 100644 morfologik-polish/src/test/java/morfologik/stemming/PerformanceTest.java create mode 100644 morfologik-polish/src/test/java/morfologik/stemming/PolishMorfologikStemmerTest.java create mode 100644 morfologik-speller/pom.xml create mode 100644 morfologik-speller/src/main/java/morfologik/speller/HMatrix.java create mode 100644 morfologik-speller/src/main/java/morfologik/speller/Speller.java create mode 100644 morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java create mode 100644 morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt create mode 
100644 morfologik-speller/src/test/resources/morfologik/speller/slownik.dict create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/slownik.info create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/test-infix.info create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict create mode 100644 morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info create mode 100644 morfologik-stemming/pom.xml create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java create mode 100644 morfologik-stemming/src/main/java/morfologik/stemming/WordData.java create mode 100644 morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java create mode 100644 morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java create mode 100644 
morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java create mode 100644 morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict create mode 100644 morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info create mode 100644 morfologik-tools/pom.xml create mode 100644 morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/Launcher.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java 
create mode 100644 morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/Tool.java create mode 100644 morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java create mode 100644 morfologik-tools/src/proguard/rules.pro create mode 100644 morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java create mode 100644 morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java create mode 100644 morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java create mode 100644 morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java create mode 100644 morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java create mode 100644 morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java create mode 100644 morfologik.LICENSE create mode 100644 pom.xml delete mode 100644 src-test/morfologik/fsa/CFSA2SerializerTest.java delete mode 100644 src-test/morfologik/fsa/FSA5SerializerTest.java delete mode 100644 src-test/morfologik/fsa/FSA5Test.java delete mode 100644 src-test/morfologik/fsa/FSABuilderTest.java delete mode 100644 src-test/morfologik/fsa/FSATestUtils.java delete mode 100644 src-test/morfologik/fsa/FSATraversalTest.java delete mode 100644 src-test/morfologik/fsa/SerializerTestBase.java delete mode 100644 src-test/morfologik/fsa/abc-numbers.fsa delete mode 100644 src-test/morfologik/fsa/abc.fsa delete mode 100644 src-test/morfologik/fsa/abc.in delete mode 100644 src-test/morfologik/fsa/en_tst.dict delete mode 100644 src-test/morfologik/fsa/minimal.fsa delete mode 100644 src-test/morfologik/fsa/minimal.in delete mode 100644 src-test/morfologik/fsa/minimal2.fsa delete mode 100644 src-test/morfologik/fsa/minimal2.in delete mode 100644 src-test/morfologik/stemming/DictionaryLookupTest.java delete mode 100644 src-test/morfologik/stemming/PerformanceTest.java delete mode 100644 
src-test/morfologik/stemming/PolishStemmerTest.java delete mode 100644 src-test/morfologik/stemming/StringDecoderBenchmarkTest.java delete mode 100644 src-test/morfologik/stemming/test-diacritics-utf8.dict delete mode 100644 src-test/morfologik/stemming/test-diacritics-utf8.info delete mode 100644 src-test/morfologik/stemming/test-infix.dict delete mode 100644 src-test/morfologik/stemming/test-infix.info delete mode 100644 src-test/morfologik/stemming/test-prefix.dict delete mode 100644 src-test/morfologik/stemming/test-prefix.info delete mode 100644 src-test/morfologik/stemming/test-separators.dict delete mode 100644 src-test/morfologik/stemming/test-separators.info delete mode 100644 src-test/morfologik/stemming/test-separators.txt delete mode 100644 src-test/morfologik/stemming/test-synth.dict delete mode 100644 src-test/morfologik/stemming/test-synth.info delete mode 100644 src-test/morfologik/tools/LauncherTest.java delete mode 100644 src-test/morfologik/tools/MorphEncoderTest.java delete mode 100644 src-test/morfologik/tools/MorphEncodingToolTest.java delete mode 100644 src-test/morfologik/tools/Text2FSA5Test.java delete mode 100644 src-test/morfologik/util/MinMax.java delete mode 100644 src/morfologik/dictionaries/pl.LICENSE delete mode 100644 src/morfologik/dictionaries/pl.dict delete mode 100644 src/morfologik/dictionaries/pl.info delete mode 100644 src/morfologik/fsa/CFSA.java delete mode 100644 src/morfologik/fsa/CFSA2.java delete mode 100644 src/morfologik/fsa/CFSA2Serializer.java delete mode 100644 src/morfologik/fsa/ConstantArcSizeFSA.java delete mode 100644 src/morfologik/fsa/FSA.java delete mode 100644 src/morfologik/fsa/FSA5.java delete mode 100644 src/morfologik/fsa/FSA5Serializer.java delete mode 100644 src/morfologik/fsa/FSABuilder.java delete mode 100644 src/morfologik/fsa/FSAFinalStatesIterator.java delete mode 100644 src/morfologik/fsa/FSAFlags.java delete mode 100644 src/morfologik/fsa/FSAHeader.java delete mode 100644 
src/morfologik/fsa/FSAInfo.java delete mode 100644 src/morfologik/fsa/FSASerializer.java delete mode 100644 src/morfologik/fsa/FSATraversal.java delete mode 100644 src/morfologik/fsa/FSAUtils.java delete mode 100644 src/morfologik/fsa/MatchResult.java delete mode 100644 src/morfologik/fsa/NullMessageLogger.java delete mode 100644 src/morfologik/fsa/StateVisitor.java delete mode 100644 src/morfologik/stemming/ArrayViewList.java delete mode 100644 src/morfologik/stemming/Dictionary.java delete mode 100644 src/morfologik/stemming/DictionaryIterator.java delete mode 100644 src/morfologik/stemming/DictionaryLookup.java delete mode 100644 src/morfologik/stemming/DictionaryMetadata.java delete mode 100644 src/morfologik/stemming/IStemmer.java delete mode 100644 src/morfologik/stemming/PolishStemmer.java delete mode 100644 src/morfologik/stemming/WordData.java delete mode 100644 src/morfologik/tools/FSABuildTool.java delete mode 100644 src/morfologik/tools/FSADumpTool.java delete mode 100644 src/morfologik/tools/IMessageLogger.java delete mode 100644 src/morfologik/tools/InflectionFramesTool.java delete mode 100644 src/morfologik/tools/Launcher.java delete mode 100644 src/morfologik/tools/MorphEncoder.java delete mode 100644 src/morfologik/tools/MorphEncodingTool.java delete mode 100644 src/morfologik/tools/PolishStemmingTool.java delete mode 100644 src/morfologik/tools/SharedOptions.java delete mode 100644 src/morfologik/tools/Tool.java delete mode 100644 src/morfologik/tools/WriterMessageLogger.java delete mode 100644 src/morfologik/util/Arrays.java delete mode 100644 src/morfologik/util/BufferUtils.java delete mode 100644 src/morfologik/util/FileUtils.java delete mode 100644 src/morfologik/util/ResourceUtils.java diff --git a/.classpath b/.classpath deleted file mode 100644 index df7d164..0000000 --- a/.classpath +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - - diff --git a/.gitignore b/.gitignore index fa137ce..746231c 100644 --- a/.gitignore +++ b/.gitignore @@ 
-1,3 +1,34 @@ +*.versionsBackup tmp/ dist/ -*.patch \ No newline at end of file +target/ +*.patch +.eclipse/ +.project +.classpath +.settings +*.name +*.iml +.idea/compiler.xml +.idea/encodings.xml +.idea/copyright/profiles_settings.xml +.idea/libraries/Maven__com_carrotsearch_hppc_0_5_3.xml +.idea/libraries/Maven__com_carrotsearch_junit_benchmarks_0_7_2.xml +.idea/libraries/proguard.xml +.idea/misc.xml +.idea/modules.xml +.idea/qaplug_profiles.xml +.idea/scopes/scope_settings.xml +.idea/vcs.xml +.idea/workspace.xml +.idea/libraries/Maven__com_carrotsearch_randomizedtesting_randomizedtesting_runner_2_0_13.xml +.idea/libraries/Maven__com_google_guava_guava_15_0.xml +.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml +.idea/libraries/Maven__commons_lang_commons_lang_2_6.xml +.idea/libraries/Maven__junit_junit_4_11.xml +.idea/libraries/Maven__org_easytesting_fest_assert_core_2_0M10.xml +.idea/libraries/Maven__org_easytesting_fest_util_1_2_5.xml +.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml +.idea/uiDesigner.xml +.idea/inspectionProfiles/Project_Default.xml +.idea/inspectionProfiles/profiles_settings.xml diff --git a/.project b/.project deleted file mode 100644 index fe93ffc..0000000 --- a/.project +++ /dev/null @@ -1,17 +0,0 @@ - - - morfologik-stemming - - - - - - org.eclipse.jdt.core.javabuilder - - - - - - org.eclipse.jdt.core.javanature - - diff --git a/ACKNOWLEDGMENT b/ACKNOWLEDGMENT deleted file mode 100644 index c68b089..0000000 --- a/ACKNOWLEDGMENT +++ /dev/null @@ -1,6 +0,0 @@ -Polish stemming data comes from Polish ispell/myspell dictionary hosted at -http://sjp.pl/. It was corrected and enriched with part-of-speech tags in -Morfologik project. The resulting data set is licensed on the terms -of LGPL and/or Creative Commons ShareAlike (pick the suitable license). 
- -See http://morfologik.blogspot.com \ No newline at end of file diff --git a/CHANGES b/CHANGES index 65243fe..be4426a 100644 --- a/CHANGES +++ b/CHANGES @@ -1,179 +1,395 @@ -1.5.0 - Major size saving improvements in CFSA2. Built in Polish dictionary size decreased from - 2,811,345 to 1,806,661 (CFSA2 format). +Morfologik-stemming Change Log - FSABuilder returns a ready-to-be-used FSA (ConstantArcSizeFSA). Construction overhead - for this automaton is a round zero (it is immediately serialized in-memory). +For an up-to-date CHANGES file see +https://github.com/morfologik/morfologik-stemming/blob/master/CHANGES - Polish dictionary updated to Morfologik 1.7. [19.11.2010] +======================= morfologik-stemming 1.9.0 ======================= - Added an option to serialize automaton to CFSA2 or FSA5 directly from fsa_build. +Changes in backwards compatibility policy - CFSA is now deprecated for serialization (the code still reads CFSA automata, but will - no be able to serialize them). Use CFSA2. +New Features - Added immediate state interning. - Speedup in automaton construction by about 30%, memory use - decreased significantly (did not perform exact measurements, but incremental - construction from presorted data should consume way less memory). +* Added capability to normalize input and output strings for dictionaries. + This is useful for dictionaries that do not support ligatures, for example. + To specify input conversion, use the property 'fsa.dict.input-conversion' + in the .info file. The output conversion (for example, to use ligatures) + is specified by 'fsa.dict.output-conversion'. Note that lengthy + conversion tables may negatively affect performance. - Added an option to build FSA from already sorted data (--sorted). Avoids in-memory sorting. - Pipe the input through shell sort if building FSA from large data. +Bug Fixes - Changed the default ordering from Java signed-byte to C-like unsigned byte value. 
- This lets one use GNU sort to sort the input using 'export LC_ALL=C; sort input'. +Optimizations - Added traversal routines to calculate perfect hashing based on FSA with NUMBERS. + * The suggestion search for the speller is now performed directly by traversing + the dictionary automaton, which makes it much more time-efficient (thanks + to Jaume Ortolà). - Changed the order of serialized arcs in the binary serializer for FSA5 to lexicographic - (consistent with the input). Depth-first traversal recreates the input, in other words. + * Suggestions are generated faster by avoiding unnecessary case conversions. - Removed character-based automata. +======================= morfologik-stemming 1.8.3 ======================= - Incompatible API changes to FSA builders (moved to morfologik.fsa). +Bug Fixes - Incompatible API changes to FSATraversalHelper. Cleaned up match types, added - unit tests. +* Fixed a bug for spelling dictionaries in non-UTF encodings with + separators: strings with non-encodable characters might have been + accepted as spelled correctly even if they were missing in the + dictionary. - ################################################################ - - Incompatible API changes have been made in release 1.5.0. See above. - - An external dependency HPPC (high performance primitive collections) is now required - for compiling FSAs (it is optional for traversals). - ################################################################ +======================= morfologik-stemming 1.8.2 ======================= -1.4.1 - Upgrade of the built-in Morfologik dictionary for Polish (in CFSA format). +New Features - Added options to define custom FILLER and ANNOT_SEPARATOR bytes in the fsa_build - tool. +* Added the option of using frequencies of words for sorting spelling + replacements. It can be used in both spelling and tagging dictionaries. + 'fsa.dict.frequency-included=true' must be added to the .info file. 
+ For building the dictionary, add at the end of each entry a separator and + a character between A and Z (A: less frequently used words; + Z: more frequently used words). (Jaume Ortolà) - Corrected an inconsistency with the C fsa package -- FILLER and ANNOT_SEPARATOR - characters are now identical with the C version. +======================= morfologik-stemming 1.8.1 ======================= + +Changes in backwards compatibility policy + +* MorphEncodingTool will *fail* if it detects data/lines that contain the + separator annotation byte. This is because such lines get encoded into + something that the decoder cannot process. You can use \u0000 as the + annotation byte to avoid clashes with any existing data. + +======================= morfologik-stemming 1.8.0 ======================= + +Changes in backwards compatibility policy + +* Command-line option changes to MorphEncodingTool - it now accepts an explicit + name of the sequence encoder, not infix/suffix/prefix booleans. + +* Updating dependencies to their newest versions. + +New Features + +* Dictionary .info files can specify the sequence decoder explicitly: + suffix, prefix, infix, none are supported. For backwards compatibility, + fsa.dict.uses-prefixes, fsa.dict.uses-infixes and fsa.dict.uses-suffixes + are still supported, but will be removed in the next major version. + +* Command-line option changes to MorphEncodingTool - it now accepts an explicit + name of the sequence encoder, not infix/suffix/prefix booleans. + +* Rewritten implementation of tab-separated data files (tab2morph tool). + The output should yield smaller files, especially for prefix encoding + and infix encoding. This does *not* necessarily mean smaller automata + but we're working on getting these as well. 
+ + Example output before and after refactoring: + + Prefix coder: + postmodernizm|modernizm|xyz => [before] postmodernizm+ANmodernizm+xyz + => [after ] postmodernizm+EA+xyz + + Infix coder: + laquelle|lequel|D f s => [before] laquelle+AAHequel+D f s + => [after ] laquelle+AGAquel+D f s + +* Changed the default format of the Polish dictionary from infix + encoded to prefix encoded (smaller output size). + +Optimizations + +* A number of internal implementation cleanups and refactorings. + +======================= morfologik-stemming 1.7.2 ======================= + +* A quick fix for incorrect decoding of certain suffixes (long suffixes). + +* Increased max. recursion level in Speller to 6 from 4. (Jaume Ortolà) + +======================= morfologik-stemming 1.7.1 ======================= + +* Fixed a couple of bugs in morfologik-speller (Jaume Ortolà). + +======================= morfologik-stemming 1.7.0 ======================= + +* Changed DictionaryMetadata API (access methods for encoder/decoder). + +* Initial version of morfologik-speller component. + +* Minor changes to the FSADumpTool: the header block is always UTF-8 + encoded, the default platform encoding does not matter. This is done to + always support certain attributes that may be unicode (and would be + incorrectly dumped otherwise). + +* Metadata *.info files can now be encoded in UTF-8 to support text + attributes that otherwise would require text2ascii conversion. + +======================= morfologik-stemming 1.6.0 ======================= + +* Update morfologik-polish data to Morfologik 2.0 PoliMorf (08.03.2013). + Deprecated DICTIONARY constants (unified dictionary only). - Cleanups to the tools' launcher -- will complain about missing JARs, if any. +* Important! The format of encoding tags has changed and is now + multiple-tags-per-lemma. The value returned from WordData#getTag + may be a number of tags concatenated with a "+" character. 
Previously + the same lamma/stem would be returned multiple times, each time with + a different tag. -1.4.0 - Added FSA5 construction in Java (on byte sequences). Added preliminary support for - character sequences. Added a command line tool for FSA5 - construction from unsorted data (sorting is done in-memory). +* Moving code from SourceForge to github. - Added a tool to encode tab-delimited dictionaries to the format accepted by - fsa_build and FSA5 construction tool. +======================= morfologik-stemming 1.5.5 ======================= - Added a new version of Morfologik dictionary for Polish (in CFSA format). +* Made hppc an optional component of morfologik-fsa. It is required + for constructing FSA automata only and causes problems with javac. + http://stackoverflow.com/questions/3800462/can-i-prevent-javac-accessing-the-class-path-from-the-manifests-of-our-third-par -1.3.0 - Added runtime checking for tools availability so that unavailable - tools don't show up in the list. +======================= morfologik-stemming 1.5.4 ======================= - Recompressed the built-in Polish dictionary to CFSA. +* Replaced byte-based speller with CharBasedSpeller. - Cleaned up FSA/Dictionary separation. FSAs don't store encoding any more (because - it does not make sense for them to do so). The FSA is a purely abstract class - pushing functionality to sub-classes. Input stream reading cleaned up. +* Warn about UTF-8 files with BOM. + +* Fixed a typo in package name (speller). - Added initial code for CFSA (compressed FSA). Reduces automata size about 10%. +======================= morfologik-stemming 1.5.3 ======================= - Changes in the public API. Implementation classes renamed - (FSAVer5Impl into FSA5). Major tweaks and tunes to the API. +* Initial release of spelling correction submodule. - Added support for version 5 automata built with NUMBERS flag (an extra field - stored for each node). 
+* Updated morfologik-polish data to morfologik 1.9 [12.06.2012] - ################################################################ - Incompatible API changes have been made in release 1.3.0. - ################################################################ +* Updated morfologik-polish licensing info to BSD (yay). -1.2.2 License switch to plain BSD (removed the patent clause which did not make much sense - anyway). +======================= morfologik-stemming 1.5.2 ======================= - The build ZIP now includes licenses for individual JARs (prevents confusion). +* An alternative Polish dictionary added (BSD licensed): SGJP (Morfeusz). + PolishStemmer can now take an enum switching between the dictionary to + be used or combine both. -1.2.1 Fixed tool launching routines. +* Project split into modules. A single jar version (no external + dependencies) added by transforming via proguard. -1.2 Package hierarchy reorganized. +* Enabled use of escaped special characters in the tab2morph tool. - Removed stempel (heuristic stemmer for Polish). +* Added guards against the input term having separator character + somewhere (this will now return an empty list of matches). Added + getSeparatorChar to DictionaryLookup so that one can check for this + condition manually, if needed. - Code updated to Java 1.5. +======================= morfologik-stemming 1.5.1 ======================= - The API has changed in many places (enums instead of constants, - generics, iterables, removed explicit Arc and Node classes and replaced - by int pointers). +* Build system switch to Maven (tested with Maven2). - FSA traversal in version 1.2 is implemented on top of primitive - data structures (int pointers) to keep memory usage minimal. The speed - boost gained from this is enormous and justifies less readable code. We - strongly advise to use the provided iterators and helper functions - for matching state sequences in the FSA. 
+======================= morfologik-stemming 1.5.0 ======================= - Tools updated. Dumping existing FSAs is much, much faster now. +* Major size saving improvements in CFSA2. Built in Polish dictionary + size decreased from 2,811,345 to 1,806,661 (CFSA2 format). - ################################################################ - Incompatible API changes have been made in release 1.2. - Java 1.5 or later is required from this version on. - ################################################################ +* FSABuilder returns a ready-to-be-used FSA (ConstantArcSizeFSA). + Construction overhead for this automaton is a round zero (it is + immediately serialized in-memory). -1.1.4 * Fixed a bug that caused UTF-8 dictionaries to be garbled. Now it should be relatively - safe to use UTF-8 dictionaries (note: separators cannot be multibyte UTF-8 characters, - yet this is probably a very rare case). +* Polish dictionary updated to Morfologik 1.7. [19.11.2010] -1.1.3 * Fixed a bug causing NPE when the library is called with null context class loader - (happens when JVM is invoked from an JNI-attached thread). Thanks to - Patrick Luby for report and detailed analysis. +* Added an option to serialize automaton to CFSA2 or FSA5 directly from + fsa_build. - Updated the built-in dictionary to the newest version available. +* CFSA is now deprecated for serialization (the code still reads CFSA + automata, but will no be able to serialize them). Use CFSA2. -1.1.2 * Fixed a bug causing JAR file locking (by implementing a workaround). +* Added immediate state interning. Speedup in automaton construction by + about 30%, memory use decreased significantly (did not perform exact + measurements, but incremental construction from presorted data should + consume way less memory). - Fixed the build script (manifest file was broken). +* Added an option to build FSA from already sorted data (--sorted). + Avoids in-memory sorting. 
Pipe the input through shell sort if + building FSA from large data. -1.1.1 Distribution script fixes. The final JAR does not contain test classes and resources. Size - trimmed almost twice compared to release 1.1. +* Changed the default ordering from Java signed-byte to C-like unsigned + byte value. This lets one use GNU sort to sort the input using + 'export LC_ALL=C; sort input'. - Updated the dump tool to accept dictionary metadata files. +* Added traversal routines to calculate perfect hashing based on + FSA with NUMBERS. -1.1 Introduced an auxiliary "meta" information files about compressed dictionaries. - Such information include delimiter symbol, encoding and infix/prefix/postfix - decoding info. - - The API has changed (repackaging). Some deprecated methods have been removed. - This is a major redesign/ upgrade, you will have to adjust your source code. +* Changed the order of serialized arcs in the binary serializer for FSA5 + to lexicographic (consistent with the input). Depth-first traversal + recreates the input, in other words. + +* Removed character-based automata. + +* Incompatible API changes to FSA builders (moved to morfologik.fsa). + +* Incompatible API changes to FSATraversalHelper. Cleaned up match + types, added unit tests. + +* An external dependency HPPC (high performance primitive collections) + is now required + +======================= morfologik-stemming 1.4.1 ======================= + +* Upgrade of the built-in Morfologik dictionary for Polish (in CFSA + format). + +* Added options to define custom FILLER and ANNOT_SEPARATOR bytes in the + fsa_build tool. + +* Corrected an inconsistency with the C fsa package -- FILLER and + ANNOT_SEPARATOR characters are now identical with the C version. - Cleaned up APIs and interfaces. +* Cleanups to the tools' launcher -- will complain about missing JARs, + if any. + +======================= morfologik-stemming 1.4.0 ======================= + +* Added FSA5 construction in Java (on byte sequences). 
Added preliminary + support for character sequences. Added a command line tool for FSA5 + construction from unsorted data (sorting is done in-memory). + +* Added a tool to encode tab-delimited dictionaries to the format + accepted by fsa_build and FSA5 construction tool. + +* Added a new version of Morfologik dictionary for Polish (in CFSA format). + +======================= morfologik-stemming 1.3.0 ======================= + +* Added runtime checking for tools availability so that unavailable tools + don't show up in the list. - Added infrastructure for command-line tool launching. +* Recompressed the built-in Polish dictionary to CFSA. - Cleaned up tests. +* Cleaned up FSA/Dictionary separation. FSAs don't store encoding any more + (because it does not make sense for them to do so). The FSA is a purely + abstract class pushing functionality to sub-classes. Input stream + reading cleaned up. - Changed project name to morfologik-stemmers and ownership to (c) Morfologik. +* Added initial code for CFSA (compressed FSA). Reduces automata size + about 10%. -1.0.7 Removed one bug in fsa 'compression' decoding. +* Changes in the public API. Implementation classes renamed (FSAVer5Impl + into FSA5). Major tweaks and tunes to the API. -1.0.6 Customized version of stempel replaced with a standard distribution. +* Added support for version 5 automata built with NUMBERS flag (an extra + field stored for each node). - Removed deprecated methods and classes. +======================= morfologik-stemming 1.2.2 ======================= + +* License switch to plain BSD (removed the patent clause which did not + make much sense anyway). + +* The build ZIP now includes licenses for individual JARs (prevents + confusion). + +======================= morfologik-stemming 1.2.1 ======================= + +* Fixed tool launching routines. + +======================= morfologik-stemming 1.2.0 ======================= + +* Package hierarchy reorganized. 
+ +* Removed stempel (heuristic stemmer for Polish). + +* Code updated to Java 1.5. + +* The API has changed in many places (enums instead of constants, + generics, iterables, removed explicit Arc and Node classes and replaced + by int pointers). + +* FSA traversal in version 1.2 is implemented on top of primitive data + structures (int pointers) to keep memory usage minimal. The speed + boost gained from this is enormous and justifies less readable code. We + strongly advise to use the provided iterators and helper functions + for matching state sequences in the FSA. + +* Tools updated. Dumping existing FSAs is much, much faster now. + +======================= morfologik-stemming 1.1.4 ======================= + +* Fixed a bug that caused UTF-8 dictionaries to be garbled. Now it + should be relatively safe to use UTF-8 dictionaries (note: separators + cannot be multibyte UTF-8 characters, yet this is probably a very + rare case). + +======================= morfologik-stemming 1.1.3 ======================= + +* Fixed a bug causing NPE when the library is called with null context + class loader (happens when JVM is invoked from an JNI-attached + thread). Thanks to Patrick Luby for report and detailed analysis. + +* Updated the built-in dictionary to the newest version available. + +======================= morfologik-stemming 1.1.2 ======================= + +* Fixed a bug causing JAR file locking (by implementing a workaround). + +* Fixed the build script (manifest file was broken). + +======================= morfologik-stemming 1.1.1 ======================= + +* Distribution script fixes. The final JAR does not contain test classes + and resources. Size trimmed almost twice compared to release 1.1. + +* Updated the dump tool to accept dictionary metadata files. + +======================= morfologik-stemming 1.1 ========================= + +* Introduced an auxiliary "meta" information files about compressed + dictionaries. 
Such information include delimiter symbol, encoding + and infix/prefix/postfix decoding info. + +* The API has changed (repackaging). Some deprecated methods have been + removed. This is a major redesign/ upgrade, you will have to adjust + your source code. + +* Cleaned up APIs and interfaces. + +* Added infrastructure for command-line tool launching. + +* Cleaned up tests. + +* Changed project name to morfologik-stemmers and ownership to + (c) Morfologik. + +======================= morfologik-stemming 1.0.7 ======================= + +* Removed one bug in fsa 'compression' decoding. + +======================= morfologik-stemming 1.0.6 ======================= + +* Customized version of stempel replaced with a standard distribution. + +* Removed deprecated methods and classes. - Added infix and prefix encoding support for fsa dictionaries. +* Added infix and prefix encoding support for fsa dictionaries. + +======================= morfologik-stemming 1.0.5 ======================= -1.0.5 Added filler and separator char dumps to FSADump. +* Added filler and separator char dumps to FSADump. - * A major bug in automaton traversal corrected. Upgrade when possible. +* A major bug in automaton traversal corrected. Upgrade when possible. - Certain API changes were introduced; older methods are now deprecated - and will be removed in the future. +* Certain API changes were introduced; older methods are now deprecated + and will be removed in the future. + +======================= morfologik-stemming 1.0.4 ======================= + +* Licenses for full and no-dict versions. + +======================= morfologik-stemming 1.0.3 ======================= + +* Project code moved to SourceForge (subproject of Morfologik). + LICENSE CHANGED FROM PUBLIC DOMAIN TO BSD (doesn't change much, but + clarifies legal issues). + +======================= morfologik-stemming 1.0.2 ======================= -1.0.4 Licenses for full and no-dict versions. 
+* Added a Lametyzator constructor which allows custom dictionary stream, + field delimiters and encoding. Added an option for building stand-alone + JAR that does not include the default polish dictionary. -1.0.3 Project code moved to SourceForge (subproject of Morfologik). - LICENSE CHANGED FROM PUBLIC DOMAIN TO BSD (doesn't change much, but clarifies - legal issues). +======================= morfologik-stemming 1.0.1 ======================= -1.0.2 Added a Lametyzator constructor which allows custom dictionary stream, field - delimiters and encoding. Added an option for building stand-alone - JAR that does not include the default polish dictionary. +* Code cleanups. Added a method that returns the third automaton's column + (form). -1.0.1 Code cleanups. Added a method that returns the third automaton's column (form). +======================= morfologik-stemming 1.0 ========================= -1.0 Initial release \ No newline at end of file +* Initial release diff --git a/CONTRIBUTOR b/CONTRIBUTOR new file mode 100644 index 0000000..2e3fa2f --- /dev/null +++ b/CONTRIBUTOR @@ -0,0 +1,62 @@ + +Compiling +========= + +You will need maven. Then, in the top folder of the checkout: + +mvn clean install + +or + +mvn clean install -Pquick # no tests + + +Eclipse +======= + +We highly recommend using m2eclipse plugin and importing all projects directly +into Eclipse as Maven projects ("maven nature"). + + +Compiling dictionaries +====================== + +A pipeline for compiling plain text dictionary data into automata: + +1) Prepare tab-delimited input file with the following columns: + inflected-form base-form annotation. 
An example: + +niebabińscy niebabiński adj:pl:nom.voc:m1.p1:pos +niebabińska niebabiński adj:sg:nom.voc:f:pos +niebabiński niebabiński adj:sg:acc:m3:pos + +2) The above tab-delimited input can be preprocessed + to conflate shared affixes (helps in subsequent FSA compression): + +java -jar morfologik-tools-*-standalone.jar tab2morph --coder INFIX --input ~/tmp/input.txt > intermediate.txt + +3) Compile FSA from the intermediate format: + +java -jar morfologik-tools-*-standalone.jar fsa_build --input intermediate.txt --progress > output.fsa + +4) You should add output.info file specifying character encoding and additional + licensing information. See examples (Polish dictionaries). + +More info: +http://languagetool.wikidot.com/developing-a-tagger-dictionary + + +Sonatype/ release push +====================== + +# snapshot deploy, create single-JAR version, javadocs, etc. +mvn clean deploy -Prelease + +# ZIP with full release artifacts +mvn clean deploy -Prelease,distribution + +# ZIP with full release artifacts for sourceforge.net +mvn clean install -Prelease,distribution + +# For final releases, GPG sign. +mvn clean deploy -Prelease,distribution,sign diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 9074aac..0000000 --- a/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ - -Copyright (c) 2006 Dawid Weiss -Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. 
- - * Neither the name of Morfologik nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/README b/README old mode 100644 new mode 100755 index 4ce3038..50a7d1b --- a/README +++ b/README @@ -1,33 +1,62 @@ -CONTENTS --------- +MORFOLOGIK +========== + +FSA (automata), stemming, dictionaries and tools. Tools quickstart: + +java -jar lib/morfologik-tools-${version}-standalone.jar + + +MODULES +======= This project provides: - - Finite state automaton traversal routines for Jan Daciuk's FSA package. - +morfologik-fsa: + - Creation of byte-based, efficient finite state automata in Java, including - custom, efficient data storage formats (not compatible with Daciuk's FSA package). + custom, efficient data storage formats. + + - Compatibility with FSA5, binary format of finite state automata produced by + Jan Daciuk's "fsa" package. + +morfologik-stemming: - - A stemming engine for the Polish language built on top of a large dictionary - of inflected forms, stems and grammatical annotations. + - FSA-based stemming interfaces and dictionary metadata. 
-There are a few command-line tools you may find useful. Type: +morfologik-polish: - java -jar morfologik-stemming-*.jar + - Precompiled dictionary of inflected forms, stems and tags for the Polish + language built on top of a large dictionary. -for an up-to-date list of all tools. +morfologik-tools: + + - Command line tools to preprocess, build and dump FSA automata and dictionaries. + + - There are a few command-line tools you may find useful. Type: + java -jar lib/morfologik-tools-${version}.jar + for an up-to-date list of all tools. + +morfologik-speller: + + - Simplistic automaton-based spelling correction (suggester). AUTHORS ======= -Marcin Miłkowski (http://marcinmilkowski.pl) [linguistic data lead] -Dawid Weiss (http://www.dawidweiss.com) [fsa lead] -Grzegorz Słowikowski [maven bundles maintenance] +Marcin Miłkowski (http://marcinmilkowski.pl) [linguistic data lead, code] +Dawid Weiss (http://www.dawidweiss.com) [fsa lead, code] + + +CONTRIBUTORS +============ + +Grzegorz Słowikowski [initial maven configs] QUESTIONS, COMMENTS =================== -www.morfologik.blogspot.com +Web site: http://www.morfologik.blogspot.com +Mailing list: morfologik-devel@lists.sourceforge.net diff --git a/TODO b/TODO index eed30fd..e69de29 100644 --- a/TODO +++ b/TODO @@ -1,21 +0,0 @@ - -BUGS? - - -NEXT MAJOR VERSION - -- New binary automaton format; - - allow direct arc lookup on highly fanning nodes, especially root nodes - - allow direct perf. hashing number on arcs of highly fanning nodes, especially root nodes - - variable coding for state numbers - - state flags to encode single-arc-final tails in a compact form (no arcs at all?). - - -NOT EVEN SCHEDULED - -- Add arc reuse (optimisation). - -- Add a set of utility classes for parsing morphological tags. This could be similar - (or even compatible?) with Morfeusz-Java. - -- Suggest and implement a new stemming heuristic based on inflection frames. 
diff --git a/build.xml b/build.xml deleted file mode 100644 index 5961525..0000000 --- a/build.xml +++ /dev/null @@ -1,219 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ---- - - - - - - - - - - - - - - - - - - - - diff --git a/doc/api/allclasses-frame.html b/doc/api/allclasses-frame.html deleted file mode 100644 index 876e191..0000000 --- a/doc/api/allclasses-frame.html +++ /dev/null @@ -1,103 +0,0 @@ - - - - - - -All Classes - - - - - - - - - - - -All Classes -
- - - - - -
Arrays -
-BufferUtils -
-CFSA -
-CFSA2 -
-CFSA2Serializer -
-ConstantArcSizeFSA -
-Dictionary -
-DictionaryIterator -
-DictionaryLookup -
-DictionaryMetadata -
-FileUtils -
-FSA -
-FSA5 -
-FSA5Serializer -
-FSABuilder -
-FSABuilder.InfoEntry -
-FSABuildTool -
-FSABuildTool.Format -
-FSADumpTool -
-FSAFinalStatesIterator -
-FSAFlags -
-FSAInfo -
-FSASerializer -
-FSATraversal -
-FSAUtils -
-FSAUtils.IntIntHolder -
-IMessageLogger -
-InflectionFramesTool -
-IStemmer -
-Launcher -
-MatchResult -
-MorphEncoder -
-PolishStemmer -
-ResourceUtils -
-StateVisitor -
-WordData -
-WriterMessageLogger -
-
- - - diff --git a/doc/api/allclasses-noframe.html b/doc/api/allclasses-noframe.html deleted file mode 100644 index 50dcd62..0000000 --- a/doc/api/allclasses-noframe.html +++ /dev/null @@ -1,103 +0,0 @@ - - - - - - -All Classes - - - - - - - - - - - -All Classes -
- - - - - -
Arrays -
-BufferUtils -
-CFSA -
-CFSA2 -
-CFSA2Serializer -
-ConstantArcSizeFSA -
-Dictionary -
-DictionaryIterator -
-DictionaryLookup -
-DictionaryMetadata -
-FileUtils -
-FSA -
-FSA5 -
-FSA5Serializer -
-FSABuilder -
-FSABuilder.InfoEntry -
-FSABuildTool -
-FSABuildTool.Format -
-FSADumpTool -
-FSAFinalStatesIterator -
-FSAFlags -
-FSAInfo -
-FSASerializer -
-FSATraversal -
-FSAUtils -
-FSAUtils.IntIntHolder -
-IMessageLogger -
-InflectionFramesTool -
-IStemmer -
-Launcher -
-MatchResult -
-MorphEncoder -
-PolishStemmer -
-ResourceUtils -
-StateVisitor -
-WordData -
-WriterMessageLogger -
-
- - - diff --git a/doc/api/constant-values.html b/doc/api/constant-values.html deleted file mode 100644 index e296aa0..0000000 --- a/doc/api/constant-values.html +++ /dev/null @@ -1,446 +0,0 @@ - - - - - - -Constant Field Values - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-Constant Field Values

-
-
-Contents - - - - - - -
-morfologik.fsa.*
- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - -
morfologik.fsa.CFSA
-public static final intBIT_FINAL_ARC1
-public static final intBIT_LAST_ARC2
-public static final intBIT_TARGET_NEXT4
-public static final byteVERSION-59
- -

- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - -
morfologik.fsa.CFSA2
-public static final intBIT_FINAL_ARC32
-public static final intBIT_LAST_ARC64
-public static final intBIT_TARGET_NEXT128
-public static final byteVERSION-58
- -

- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
morfologik.fsa.ConstantArcSizeFSA
-public static final intADDRESS_OFFSET2
-public static final intARC_SIZE6
-public static final intBIT_ARC_FINAL2
-public static final intBIT_ARC_LAST1
-public static final intFLAGS_OFFSET0
-public static final intFLAGS_SIZE1
-public static final intLABEL_OFFSET1
-public static final intLABEL_SIZE1
-public static final intTARGET_ADDRESS_SIZE4
- -

- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
morfologik.fsa.FSA5
-public static final intADDRESS_OFFSET1
-public static final intBIT_FINAL_ARC1
-public static final intBIT_LAST_ARC2
-public static final intBIT_TARGET_NEXT4
-public static final byteDEFAULT_ANNOTATION43
-public static final byteDEFAULT_FILLER95
-public static final byteVERSION5
- -

- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - -
morfologik.fsa.MatchResult
-public static final intAUTOMATON_HAS_PREFIX-3
-public static final intEXACT_MATCH0
-public static final intNO_MATCH-1
-public static final intSEQUENCE_IS_A_PREFIX-4
- -

- -

- - - - - -
-morfologik.stemming.*
- -

- - - - - - - - - - - - -
morfologik.stemming.Dictionary
-public static final java.lang.StringMETADATA_FILE_EXTENSION"info"
- -

- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - -
morfologik.stemming.DictionaryMetadata
-public static final java.lang.StringATTR_NAME_ENCODING"fsa.dict.encoding"
-public static final java.lang.StringATTR_NAME_SEPARATOR"fsa.dict.separator"
-public static final java.lang.StringATTR_NAME_USES_INFIXES"fsa.dict.uses-infixes"
-public static final java.lang.StringATTR_NAME_USES_PREFIXES"fsa.dict.uses-prefixes"
- -

- -

-


- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/deprecated-list.html b/doc/api/deprecated-list.html deleted file mode 100644 index 2b4b8b8..0000000 --- a/doc/api/deprecated-list.html +++ /dev/null @@ -1,144 +0,0 @@ - - - - - - -Deprecated List - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-Deprecated API

-
-
-Contents - -
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/help-doc.html b/doc/api/help-doc.html deleted file mode 100644 index 16c5ae3..0000000 --- a/doc/api/help-doc.html +++ /dev/null @@ -1,217 +0,0 @@ - - - - - - -API Help - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-How This API Document Is Organized

-
-This API (Application Programming Interface) document has pages corresponding to the items in the navigation bar, described as follows.

-Overview

-
- -

-The Overview page is the front page of this API document and provides a list of all packages with a summary for each. This page can also contain an overall description of the set of packages.

-

-Package

-
- -

-Each package has a page that contains a list of its classes and interfaces, with a summary for each. This page can contain four categories:

-
-

-Class/Interface

-
- -

-Each class, interface, nested class and nested interface has its own separate page. Each of these pages has three sections consisting of a class/interface description, summary tables, and detailed member descriptions:

-Each summary entry contains the first sentence from the detailed description for that item. The summary entries are alphabetical, while the detailed descriptions are in the order they appear in the source code. This preserves the logical groupings established by the programmer.
- -

-Annotation Type

-
- -

-Each annotation type has its own separate page with the following sections:

-
- -

-Enum

-
- -

-Each enum has its own separate page with the following sections:

-
-

-Tree (Class Hierarchy)

-
-There is a Class Hierarchy page for all packages, plus a hierarchy for each package. Each hierarchy page contains a list of classes and a list of interfaces. The classes are organized by inheritance structure starting with java.lang.Object. The interfaces do not inherit from java.lang.Object. -
-

-Deprecated API

-
-The Deprecated API page lists all of the API that have been deprecated. A deprecated API is not recommended for use, generally due to improvements, and a replacement API is usually given. Deprecated APIs may be removed in future implementations.
-

-Index

-
-The Index contains an alphabetic list of all classes, interfaces, constructors, methods, and fields.
-

-Prev/Next

-These links take you to the next or previous class, interface, package, or related page.

-Frames/No Frames

-These links show and hide the HTML frames. All pages are available with or without frames. -

-

-Serialized Form

-Each serializable or externalizable class has a description of its serialization fields and methods. This information is of interest to re-implementors, not to developers using the API. While there is no link in the navigation bar, you can get to this information by going to any serialized class and clicking "Serialized Form" in the "See also" section of the class description. -

-

-Constant Field Values

-The Constant Field Values page lists the static final fields and their values. -

- - -This help file applies to API documentation generated using the standard doclet. - -
-


- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/index-all.html b/doc/api/index-all.html deleted file mode 100644 index 6ae27f4..0000000 --- a/doc/api/index-all.html +++ /dev/null @@ -1,1211 +0,0 @@ - - - - - - -Index - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -A B C D E F G H I K L M N O P R S T U V W
-

-A

-
-
a - -Variable in class morfologik.fsa.FSAUtils.IntIntHolder -
  -
accept(int) - -Method in interface morfologik.fsa.StateVisitor -
  -
add(byte[], int, int) - -Method in class morfologik.fsa.FSABuilder -
Add a single sequence of bytes to the FSA. -
ADDRESS_OFFSET - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
Offset of the address field inside an arc. -
ADDRESS_OFFSET - -Static variable in class morfologik.fsa.FSA5 -
An offset in the arc structure, where the address and flags field begins. -
annotation - -Variable in class morfologik.fsa.FSA5 -
Annotation character. -
annotationByte - -Variable in class morfologik.fsa.FSA5Serializer -
  -
ARC_SIZE - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
Size of a single arc structure. -
arcs - -Variable in class morfologik.fsa.CFSA -
An array of bytes with the internal representation of the automaton. -
arcs - -Variable in class morfologik.fsa.CFSA2 -
An array of bytes with the internal representation of the automaton. -
arcs - -Variable in class morfologik.fsa.FSA5 -
An array of bytes with the internal representation of the automaton. -
arcsCount - -Variable in class morfologik.fsa.FSAInfo -
Number of arcs in the automaton, excluding an arcs from the zero node - (initial) and an arc from the start node to the root node. -
arcsCountTotal - -Variable in class morfologik.fsa.FSAInfo -
Total number of arcs, counting arcs that physically overlap due to - merging. -
Arrays - Class in morfologik.util
Compatibility layer for JVM 1.5.
assertExists(File, boolean, boolean) - -Static method in class morfologik.util.FileUtils -
Checks if the given file exists. -
asShort(Set<FSAFlags>) - -Static method in enum morfologik.fsa.FSAFlags -
Returns the set of flags encoded in a single short. -
asString(byte[], String) - -Static method in class morfologik.tools.MorphEncoder -
Converts a byte array to a given encoding. -
ATTR_NAME_ENCODING - -Static variable in class morfologik.stemming.DictionaryMetadata -
Attribute name for DictionaryMetadata.encoding. -
ATTR_NAME_SEPARATOR - -Static variable in class morfologik.stemming.DictionaryMetadata -
Attribute name for DictionaryMetadata.separator. -
ATTR_NAME_USES_INFIXES - -Static variable in class morfologik.stemming.DictionaryMetadata -
Attribute name for DictionaryMetadata.usesInfixes. -
ATTR_NAME_USES_PREFIXES - -Static variable in class morfologik.stemming.DictionaryMetadata -
Attribute name for DictionaryMetadata.usesPrefixes. -
AUTOMATON_HAS_PREFIX - -Static variable in class morfologik.fsa.MatchResult -
The automaton contains a prefix of the input sequence. -
-
-

-B

-
-
b - -Variable in class morfologik.fsa.FSAUtils.IntIntHolder -
  -
BIT_ARC_FINAL - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
An arc flag indicating the target node of an arc corresponds to a final - state. -
BIT_ARC_LAST - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
An arc flag indicating the arc is last within its state. -
BIT_FINAL_ARC - -Static variable in class morfologik.fsa.CFSA -
Bitmask indicating that an arc corresponds to the last character of a - sequence available when building the automaton. -
BIT_FINAL_ARC - -Static variable in class morfologik.fsa.CFSA2 -
The arc corresponds to the last character of a sequence - available when building the automaton (acceptor transition). -
BIT_FINAL_ARC - -Static variable in class morfologik.fsa.FSA5 -
Bit indicating that an arc corresponds to the last character of a - sequence available when building the automaton. -
BIT_LAST_ARC - -Static variable in class morfologik.fsa.CFSA -
Bitmask indicating that an arc is the last one of the node's list and the - following one belongs to another node. -
BIT_LAST_ARC - -Static variable in class morfologik.fsa.CFSA2 -
The arc is the last one from the current node's arcs list. -
BIT_LAST_ARC - -Static variable in class morfologik.fsa.FSA5 -
Bit indicating that an arc is the last one of the node's list and the - following one belongs to another node. -
BIT_TARGET_NEXT - -Static variable in class morfologik.fsa.CFSA -
Bitmask indicating that the target node of this arc follows it in the - compressed automaton structure (no goto field). -
BIT_TARGET_NEXT - -Static variable in class morfologik.fsa.CFSA2 -
The target node of this arc follows the last arc of the current state - (no goto field). -
BIT_TARGET_NEXT - -Static variable in class morfologik.fsa.FSA5 -
Bit indicating that the target node of this arc follows it in the - compressed automaton structure (no goto field). -
bits - -Variable in enum morfologik.fsa.FSAFlags -
Bit mask for the corresponding flag. -
BufferUtils - Class in morfologik.util
Utility functions for buffers.
build(byte[][]) - -Static method in class morfologik.fsa.FSABuilder -
Build a minimal, deterministic automaton from a sorted list of byte sequences. -
build(Iterable<byte[]>) - -Static method in class morfologik.fsa.FSABuilder -
Build a minimal, deterministic automaton from an iterable list of byte sequences. -
-
-

-C

-
-
calculateFanOuts(FSA, int) - -Static method in class morfologik.fsa.FSAUtils -
Calculate fan-out ratio. -
CFSA - Class in morfologik.fsa
CFSA (Compact Finite State Automaton) binary format implementation.
CFSA(InputStream) - -Constructor for class morfologik.fsa.CFSA -
Creates a new automaton, reading it from a file in FSA format, version 5. -
CFSA2 - Class in morfologik.fsa
CFSA (Compact Finite State Automaton) binary format implementation, version 2: - - CFSA2.BIT_TARGET_NEXT applicable on all arcs, not necessarily the last one.
CFSA2(InputStream) - -Constructor for class morfologik.fsa.CFSA2 -
Reads an automaton from a byte stream. -
CFSA2Serializer - Class in morfologik.fsa
Serializes in-memory FSA graphs to CFSA2.
CFSA2Serializer() - -Constructor for class morfologik.fsa.CFSA2Serializer -
  -
checkSingleByte(String) - -Static method in class morfologik.tools.FSABuildTool -
Check if the argument is a single byte after conversion using platform-default - encoding. -
clone() - -Method in class morfologik.stemming.WordData -
Declare a covariant of Object.clone() that returns a deep copy of - this object. -
close(Closeable...) - -Static method in class morfologik.util.FileUtils -
Force any non-null closeables. -
commonPrefix(byte[], byte[]) - -Static method in class morfologik.tools.MorphEncoder -
  -
compare(byte[], int, int, byte[], int, int) - -Static method in class morfologik.fsa.FSABuilder -
Lexicographic order of input sequences. -
complete() - -Method in class morfologik.fsa.FSABuilder -
Complete the automaton. -
ConstantArcSizeFSA - Class in morfologik.fsa
An FSA with constant-size arc representation produced directly - by FSABuilder.
-
-

-D

-
-
decodeStem(ByteBuffer, byte[], int, ByteBuffer, DictionaryMetadata) - -Static method in class morfologik.stemming.DictionaryLookup -
Decode the base form of an inflected word and save its decoded form into - a byte buffer. -
DEFAULT_ANNOTATION - -Static variable in class morfologik.fsa.FSA5 -
Default annotation byte. -
DEFAULT_FILLER - -Static variable in class morfologik.fsa.FSA5 -
Default filler byte. -
defaultDictionaries - -Static variable in class morfologik.stemming.Dictionary -
Default loaded dictionaries. -
Dictionary - Class in morfologik.stemming
A dictionary combines FSA automaton and metadata describing the - internals of dictionary entries' coding (DictionaryMetadata.
Dictionary(FSA, DictionaryMetadata) - -Constructor for class morfologik.stemming.Dictionary -
It is strongly recommended to use static methods in this class for - reading dictionaries. -
DictionaryIterator - Class in morfologik.stemming
An iterator over WordData entries of a Dictionary.
DictionaryIterator(Dictionary, CharsetDecoder, boolean) - -Constructor for class morfologik.stemming.DictionaryIterator -
  -
DictionaryLookup - Class in morfologik.stemming
This class implements a dictionary lookup over an FSA dictionary.
DictionaryLookup(Dictionary) - -Constructor for class morfologik.stemming.DictionaryLookup -
- Creates a new object of this class using the given FSA for word lookups - and encoding for converting characters to bytes. -
DictionaryMetadata - Class in morfologik.stemming
Description of attributes, their types and default values.
DictionaryMetadata(char, String, boolean, boolean, Map<String, String>) - -Constructor for class morfologik.stemming.DictionaryMetadata -
Creates an immutable instance of DictionaryMetadata. -
dumpLine(byte[], int) - -Method in class morfologik.tools.FSABuildTool -
Dump input line, byte-by-byte. -
-
-

-E

-
-
encoding - -Variable in class morfologik.stemming.DictionaryMetadata -
Encoding used for converting bytes to characters and vice versa. -
endPart() - -Method in interface morfologik.tools.IMessageLogger -
  -
endPart() - -Method in class morfologik.tools.WriterMessageLogger -
  -
ensureCapacity(ByteBuffer, int) - -Static method in class morfologik.util.BufferUtils -
Ensure the byte buffer's capacity. -
ensureCapacity(CharBuffer, int) - -Static method in class morfologik.util.BufferUtils -
Ensure the char buffer's capacity. -
equals(Object) - -Method in class morfologik.stemming.WordData -
  -
equals(byte[], int, byte[], int, int) - -Static method in class morfologik.util.Arrays -
Compare two arrays for equality. -
equals(boolean[], int, boolean[], int, int) - -Static method in class morfologik.util.Arrays -
Compare two arrays for equality. -
equals(int[], int, int[], int, int) - -Static method in class morfologik.util.Arrays -
Compare two arrays for equality. -
EXACT_MATCH - -Static variable in class morfologik.fsa.MatchResult -
The automaton has exactly one match for the input sequence. -
-
-

-F

-
-
FileUtils - Class in morfologik.util
Utility functions.
filler - -Variable in class morfologik.fsa.FSA5 -
Filler character. -
fillerByte - -Variable in class morfologik.fsa.FSA5Serializer -
  -
finalStatesCount - -Variable in class morfologik.fsa.FSAInfo -
Number of final states (number of input sequences stored in the automaton). -
FLAGS_OFFSET - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
Offset of the flags field inside an arc. -
FLAGS_SIZE - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
Size of the flags field (constant for the builder). -
FSA - Class in morfologik.fsa
This is a top abstract class for handling finite state automata.
FSA() - -Constructor for class morfologik.fsa.FSA -
  -
fsa - -Variable in class morfologik.stemming.Dictionary -
FSA automaton with the compiled dictionary data. -
FSA5 - Class in morfologik.fsa
FSA binary format implementation for version 5.
FSA5(InputStream) - -Constructor for class morfologik.fsa.FSA5 -
Read and wrap a binary automaton in FSA version 5. -
FSA5Serializer - Class in morfologik.fsa
Serializes in-memory FSA graphs to a binary format compatible with - Jan Daciuk's fsa's package FSA5 format.
FSA5Serializer() - -Constructor for class morfologik.fsa.FSA5Serializer -
  -
FSABuilder - Class in morfologik.fsa
Fast, memory-conservative finite state automaton builder, returning a - byte-serialized ConstantArcSizeFSA (a tradeoff between construction - speed and memory consumption).
FSABuilder() - -Constructor for class morfologik.fsa.FSABuilder -
  -
FSABuilder(int) - -Constructor for class morfologik.fsa.FSABuilder -
  -
FSABuilder.InfoEntry - Enum in morfologik.fsa
Debug and information constants.
FSABuildTool - Class in morfologik.tools
Convert from plain text input to a serialized FSA in any of the - available FSABuildTool.Formats.
FSABuildTool() - -Constructor for class morfologik.tools.FSABuildTool -
  -
FSABuildTool.Format - Enum in morfologik.tools
The serialization format to use for the binary output.
FSADumpTool - Class in morfologik.tools
This utility will dump the information and contents of a given FSA - dictionary.
FSADumpTool() - -Constructor for class morfologik.tools.FSADumpTool -
  -
FSAFinalStatesIterator - Class in morfologik.fsa
An iterator that traverses the right language of a given node (all sequences - reachable from a given node).
FSAFinalStatesIterator(FSA, int) - -Constructor for class morfologik.fsa.FSAFinalStatesIterator -
Create an instance of the iterator for a given node. -
FSAFlags - Enum in morfologik.fsa
FSA automaton flags.
FSAInfo - Class in morfologik.fsa
Compute additional information about an FSA: number of arcs, nodes, etc.
FSAInfo(FSA) - -Constructor for class morfologik.fsa.FSAInfo -
  -
FSAInfo(int, int, int, int) - -Constructor for class morfologik.fsa.FSAInfo -
  -
FSASerializer - Interface in morfologik.fsa
All FSA serializers to binary formats will implement this interface.
FSATraversal - Class in morfologik.fsa
This class implements some common matching and scanning operations on a - generic FSA.
FSATraversal(FSA) - -Constructor for class morfologik.fsa.FSATraversal -
Traversals of the given FSA. -
FSAUtils - Class in morfologik.fsa
Other FSA-related utilities not directly associated with the class hierarchy.
FSAUtils() - -Constructor for class morfologik.fsa.FSAUtils -
  -
FSAUtils.IntIntHolder - Class in morfologik.fsa
 
FSAUtils.IntIntHolder(int, int) - -Constructor for class morfologik.fsa.FSAUtils.IntIntHolder -
  -
FSAUtils.IntIntHolder() - -Constructor for class morfologik.fsa.FSAUtils.IntIntHolder -
  -
-
-

-G

-
-
getArc(int, byte) - -Method in class morfologik.fsa.CFSA -
-
getArc(int, byte) - -Method in class morfologik.fsa.CFSA2 -
-
getArc(int, byte) - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
getArc(int, byte) - -Method in class morfologik.fsa.FSA -
  -
getArc(int, byte) - -Method in class morfologik.fsa.FSA5 -
-
getArcCount(int) - -Method in class morfologik.fsa.FSA -
Calculates the number of arcs of a given node. -
getArcLabel(int) - -Method in class morfologik.fsa.CFSA -
Return the label associated with a given arc. -
getArcLabel(int) - -Method in class morfologik.fsa.CFSA2 -
Return the label associated with a given arc. -
getArcLabel(int) - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
getArcLabel(int) - -Method in class morfologik.fsa.FSA -
Return the label associated with a given arc. -
getArcLabel(int) - -Method in class morfologik.fsa.FSA5 -
Return the label associated with a given arc. -
getDictionary() - -Method in class morfologik.stemming.DictionaryLookup -
  -
getEndNode(int) - -Method in class morfologik.fsa.CFSA -
Return the end node pointed to by a given arc. -
getEndNode(int) - -Method in class morfologik.fsa.CFSA2 -
Return the end node pointed to by a given arc. -
getEndNode(int) - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
getEndNode(int) - -Method in class morfologik.fsa.FSA -
Return the end node pointed to by a given arc. -
getEndNode(int) - -Method in class morfologik.fsa.FSA5 -
Return the end node pointed to by a given arc. -
getExpectedFeaturesName(String) - -Static method in class morfologik.stemming.Dictionary -
Returns the expected name of the metadata file, based on the name of the - FSA dictionary file. -
getFirstArc(int) - -Method in class morfologik.fsa.CFSA -
-
getFirstArc(int) - -Method in class morfologik.fsa.CFSA2 -
-
getFirstArc(int) - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
getFirstArc(int) - -Method in class morfologik.fsa.FSA -
  -
getFirstArc(int) - -Method in class morfologik.fsa.FSA5 -
-
getFlags() - -Method in class morfologik.fsa.CFSA -
Returns a set of flags for this FSA instance. -
getFlags() - -Method in class morfologik.fsa.CFSA2 -
Returns a set of flags for this FSA instance. -
getFlags() - -Method in class morfologik.fsa.CFSA2Serializer -
Return supported flags. -
getFlags() - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
getFlags() - -Method in class morfologik.fsa.FSA -
Returns a set of flags for this FSA instance. -
getFlags() - -Method in class morfologik.fsa.FSA5 -
Returns a set of flags for this FSA instance. -
getFlags() - -Method in class morfologik.fsa.FSA5Serializer -
Return supported flags. -
getFlags() - -Method in interface morfologik.fsa.FSASerializer -
Returns the set of flags supported by the serializer (and the output automaton). -
getForLanguage(String) - -Static method in class morfologik.stemming.Dictionary -
Return a built-in dictionary for a given ISO language code. -
getInfo() - -Method in class morfologik.fsa.FSABuilder -
Return various statistics concerning the FSA and its compilation. -
getNextArc(int) - -Method in class morfologik.fsa.CFSA -
-
getNextArc(int) - -Method in class morfologik.fsa.CFSA2 -
-
getNextArc(int) - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
getNextArc(int) - -Method in class morfologik.fsa.FSA -
  -
getNextArc(int) - -Method in class morfologik.fsa.FSA5 -
-
getRightLanguageCount(int) - -Method in class morfologik.fsa.CFSA -
-
getRightLanguageCount(int) - -Method in class morfologik.fsa.CFSA2 -
-
getRightLanguageCount(int) - -Method in class morfologik.fsa.FSA -
  -
getRightLanguageCount(int) - -Method in class morfologik.fsa.FSA5 -
Returns the number encoded at the given node. -
getRootNode() - -Method in class morfologik.fsa.CFSA -
Returns the start node of this automaton. -
getRootNode() - -Method in class morfologik.fsa.CFSA2 -
-
getRootNode() - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
getRootNode() - -Method in class morfologik.fsa.FSA -
  -
getRootNode() - -Method in class morfologik.fsa.FSA5 -
Returns the start node of this automaton. -
getSequences(int) - -Method in class morfologik.fsa.FSA -
Returns an iterator over all binary sequences starting at the given FSA - state (node) and ending in final nodes. -
getSequences() - -Method in class morfologik.fsa.FSA -
An alias of calling FSA.iterator() directly (FSA is also - Iterable). -
getSerializer() - -Method in enum morfologik.tools.FSABuildTool.Format -
  -
getStem() - -Method in class morfologik.stemming.WordData -
  -
getStemBytes(ByteBuffer) - -Method in class morfologik.stemming.WordData -
Copy the stem's binary data (no charset decoding) to a custom byte - buffer. -
getTag() - -Method in class morfologik.stemming.WordData -
  -
getTagBytes(ByteBuffer) - -Method in class morfologik.stemming.WordData -
Copy the tag's binary data (no charset decoding) to a custom byte buffer. -
getWord() - -Method in class morfologik.stemming.WordData -
  -
getWordBytes(ByteBuffer) - -Method in class morfologik.stemming.WordData -
Copy the inflected word's binary data (no charset decoding) to a custom - byte buffer. -
go(CommandLine) - -Method in class morfologik.tools.FSABuildTool -
Command line entry point after parsing arguments. -
go(CommandLine) - -Method in class morfologik.tools.FSADumpTool -
Command line entry point after parsing arguments. -
gtl - -Variable in class morfologik.fsa.CFSA -
Number of bytes each address takes in full, expanded form (goto length). -
gtl - -Variable in class morfologik.fsa.FSA5 -
Number of bytes each address takes in full, expanded form (goto length). -
-
-

-H

-
-
hashCode() - -Method in class morfologik.stemming.WordData -
  -
hasNext() - -Method in class morfologik.fsa.FSAFinalStatesIterator -
Returns true if there are still elements in this iterator. -
hasNext() - -Method in class morfologik.stemming.DictionaryIterator -
  -
-
-

-I

-
-
IMessageLogger - Interface in morfologik.tools
 
index - -Variable in class morfologik.fsa.MatchResult -
Input sequence's index, interpretation depends on MatchResult.kind. -
infixEncode(byte[], byte[], byte[]) - -Method in class morfologik.tools.MorphEncoder -
This method converts wordform, wordLemma and the tag to the form: - - inflected_form + MLKending + tags - - - where '+' is a separator, M is the position of characters to be deleted - towards the beginning of the inflected form ("A" means from the - beginning, "B" from the second character, "C" - from the third one, and - so on), L is the number of characters to be deleted from the position - specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a - character that specifies how many characters should be deleted from the - end of the inflected form to produce the lexeme by concatenating the - stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so - on). -
infixEncodeUTF8(String, String, String) - -Method in class morfologik.tools.MorphEncoder -
A UTF-8 variant of MorphEncoder.infixEncode(byte[], byte[], byte[]). -
inflectionFrames() - -Method in class morfologik.tools.InflectionFramesTool -
  -
InflectionFramesTool - Class in morfologik.tools
Calculate inflection frames from the Polish dictionary.
InflectionFramesTool() - -Constructor for class morfologik.tools.InflectionFramesTool -
  -
initializeOptions(Options) - -Method in class morfologik.tools.FSABuildTool -
  -
initializeOptions(Options) - -Method in class morfologik.tools.FSADumpTool -
Command line options for the tool. -
isArcFinal(int) - -Method in class morfologik.fsa.CFSA -
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -
isArcFinal(int) - -Method in class morfologik.fsa.CFSA2 -
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -
isArcFinal(int) - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
isArcFinal(int) - -Method in class morfologik.fsa.FSA -
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -
isArcFinal(int) - -Method in class morfologik.fsa.FSA5 -
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -
isArcLast(int) - -Method in class morfologik.fsa.CFSA -
Returns true if this arc has NEXT bit set. -
isArcLast(int) - -Method in class morfologik.fsa.CFSA2 -
Returns true if this arc has NEXT bit set. -
isArcLast(int) - -Method in class morfologik.fsa.FSA5 -
Returns true if this arc has LAST bit set. -
isArcTerminal(int) - -Method in class morfologik.fsa.CFSA -
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). -
isArcTerminal(int) - -Method in class morfologik.fsa.CFSA2 -
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). -
isArcTerminal(int) - -Method in class morfologik.fsa.ConstantArcSizeFSA -
  -
isArcTerminal(int) - -Method in class morfologik.fsa.FSA -
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). -
isArcTerminal(int) - -Method in class morfologik.fsa.FSA5 -
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). -
isLabelCompressed(int) - -Method in class morfologik.fsa.CFSA -
Returns true if the label is compressed inside flags byte. -
isNextSet(int) - -Method in class morfologik.fsa.CFSA -
  -
isNextSet(int) - -Method in class morfologik.fsa.CFSA2 -
  -
isNextSet(int) - -Method in class morfologik.fsa.FSA5 -
  -
isSet(int, FSAFlags) - -Static method in enum morfologik.fsa.FSAFlags -
Returns true if the corresponding flag is set in the bit set. -
IStemmer - Interface in morfologik.stemming
A generic "stemmer" interface in Morfologik.
iterator() - -Method in class morfologik.fsa.FSA -
Returns an iterator over all binary sequences starting from the initial - FSA state (node) and ending in final nodes. -
iterator() - -Method in class morfologik.stemming.DictionaryLookup -
Return an iterator over all WordData entries available in the - embedded Dictionary. -
iterator() - -Method in class morfologik.stemming.PolishStemmer -
Iterates over all dictionary forms stored in this stemmer. -
-
-

-K

-
-
kind - -Variable in class morfologik.fsa.MatchResult -
One of the match kind constants defined in this class. -
-
-

-L

-
-
LABEL_OFFSET - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
Offset of the label field inside an arc. -
LABEL_SIZE - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
Size of the label field (constant for the builder). -
labelMapping - -Variable in class morfologik.fsa.CFSA -
Label mapping for arcs of type (1) (see class documentation). -
labelMapping - -Variable in class morfologik.fsa.CFSA2 -
Label mapping for M-indexed labels. -
Launcher - Class in morfologik.tools
A launcher for other command-line tools.
Launcher() - -Constructor for class morfologik.tools.Launcher -
  -
LEXICAL_ORDERING - -Static variable in class morfologik.fsa.FSABuilder -
Comparator comparing full byte arrays consistently with - FSABuilder.compare(byte[], int, int, byte[], int, int). -
log(String) - -Method in interface morfologik.tools.IMessageLogger -
Log progress to the console. -
log(String, Object) - -Method in interface morfologik.tools.IMessageLogger -
Log a two-part message. -
log(String) - -Method in class morfologik.tools.WriterMessageLogger -
  -
log(String, Object) - -Method in class morfologik.tools.WriterMessageLogger -
  -
lookup(CharSequence) - -Method in class morfologik.stemming.DictionaryLookup -
Searches the automaton for a symbol sequence equal to word, - followed by a separator. -
lookup(CharSequence) - -Method in interface morfologik.stemming.IStemmer -
Returns a list of WordData entries for a given word. -
lookup(CharSequence) - -Method in class morfologik.stemming.PolishStemmer -
Returns a list of WordData entries for a given word. -
-
-

-M

-
-
main(String[]) - -Static method in class morfologik.tools.FSABuildTool -
Command line entry point. -
main(String[]) - -Static method in class morfologik.tools.FSADumpTool -
Command line entry point. -
main(String[]) - -Static method in class morfologik.tools.InflectionFramesTool -
  -
main(String[]) - -Static method in class morfologik.tools.Launcher -
Command line entry point. -
match(MatchResult, byte[], int, int, int) - -Method in class morfologik.fsa.FSATraversal -
Same as FSATraversal.match(byte[], int, int, int), but allows passing - a reusable MatchResult object so that no intermediate garbage is - produced. -
match(byte[], int, int, int) - -Method in class morfologik.fsa.FSATraversal -
Finds a matching path in the dictionary for a given sequence of labels - from sequence and starting at node node. -
match(byte[], int) - -Method in class morfologik.fsa.FSATraversal -
  -
match(byte[]) - -Method in class morfologik.fsa.FSATraversal -
  -
MatchResult - Class in morfologik.fsa
A matching result returned from FSATraversal.
MatchResult() - -Constructor for class morfologik.fsa.MatchResult -
  -
metadata - -Variable in class morfologik.stemming.Dictionary -
Metadata associated with the dictionary. -
metadata - -Variable in class morfologik.stemming.DictionaryMetadata -
Other meta data not included above. -
METADATA_FILE_EXTENSION - -Static variable in class morfologik.stemming.Dictionary -
Expected metadata file extension. -
morfologik.fsa - package morfologik.fsa
 
morfologik.stemming - package morfologik.stemming
 
morfologik.tools - package morfologik.tools
 
morfologik.util - package morfologik.util
 
MorphEncoder - Class in morfologik.tools
A class that converts tabular data to fsa morphological format.
MorphEncoder() - -Constructor for class morfologik.tools.MorphEncoder -
  -
MorphEncoder(byte) - -Constructor for class morfologik.tools.MorphEncoder -
  -
-
-

-N

-
-
next() - -Method in class morfologik.fsa.FSAFinalStatesIterator -
  -
next() - -Method in class morfologik.stemming.DictionaryIterator -
  -
NO_MATCH - -Static variable in class morfologik.fsa.MatchResult -
The automaton has no match for the input sequence. -
node - -Variable in class morfologik.fsa.MatchResult -
Automaton node, interpretation depends on the MatchResult.kind. -
nodeCount - -Variable in class morfologik.fsa.FSAInfo -
Number of nodes in the automaton. -
nodeDataLength - -Variable in class morfologik.fsa.CFSA -
The length of the node header structure (if the automaton was compiled with - NUMBERS option). -
nodeDataLength - -Variable in class morfologik.fsa.FSA5 -
The length of the node header structure (if the automaton was compiled with - NUMBERS option). -
-
-

-O

-
-
openInputStream(String) - -Static method in class morfologik.util.ResourceUtils -
Returns an input stream to the resource. -
-
-

-P

-
-
perfectHash(byte[], int, int, int) - -Method in class morfologik.fsa.FSATraversal -
Calculate perfect hash for a given input sequence of bytes. -
perfectHash(byte[]) - -Method in class morfologik.fsa.FSATraversal -
  -
PolishStemmer - Class in morfologik.stemming
A dictionary-based stemmer for the Polish language.
PolishStemmer() - -Constructor for class morfologik.stemming.PolishStemmer -
This constructor is initialized with a built-in dictionary or fails with - a runtime exception if the dictionary is not available. -
prefixEncode(byte[], byte[], byte[]) - -Method in class morfologik.tools.MorphEncoder -
This method converts wordform, wordLemma and the tag to the form: - - - - inflected_form + LKending + tags - - - where '+' is a separator, L is the number of characters to be deleted - from the beginning of the word ("A" means none, "B" means one, "C" - 2, - etc.), K is a character that specifies how many characters should be - deleted from the end of the inflected form to produce the lexeme by - concatenating the stripped string with the ending ("A" means none, - "B' - 1, "C" - 2, and so on). -
prefixEncodeUTF8(String, String, String) - -Method in class morfologik.tools.MorphEncoder -
A UTF-8 variant of MorphEncoder.prefixEncode(byte[], byte[], byte[]) This - method converts wordform, wordLemma and the tag to the form: - - inflected_form + LKending + tags - - - where '+' is a separator, L is the number of characters to be deleted - from the beginning of the word ("A" means none, "B" means one, "C" - 2, - etc.), K is a character that specifies how many characters should be - deleted from the end of the inflected form to produce the lexeme by - concatenating the stripped string with the ending ("A" means none, - "B' - 1, "C" - 2, and so on). -
printUsage() - -Method in class morfologik.tools.FSABuildTool -
  -
-
-

-R

-
-
read(InputStream) - -Static method in class morfologik.fsa.FSA -
A factory for reading automata in any of the supported versions. -
read(File) - -Static method in class morfologik.stemming.Dictionary -
Attempts to load a dictionary using the path to the FSA file and the - expected metadata extension. -
read(URL) - -Static method in class morfologik.stemming.Dictionary -
- Attempts to load a dictionary using the URL to the FSA file and the - expected metadata extension. -
readAndClose(InputStream, InputStream) - -Static method in class morfologik.stemming.Dictionary -
Attempts to load a dictionary from opened streams of FSA dictionary data - and associated metadata. -
readByte(InputStream) - -Static method in class morfologik.util.FileUtils -
Read exactly one byte from the input stream. -
readFully(InputStream) - -Static method in class morfologik.util.FileUtils -
Reads all bytes from an input stream (until EOF). -
readFully(InputStream, byte[]) - -Static method in class morfologik.util.FileUtils -
Read enough bytes to fill array If there are not enough - bytes, throw an exception. -
readInt(InputStream) - -Static method in class morfologik.util.FileUtils -
Read exactly 4 bytes from the input stream. -
readShort(InputStream) - -Static method in class morfologik.util.FileUtils -
Read exactly 2 bytes from the input stream. -
referenceEquals(Object[], int, Object[], int, int) - -Static method in class morfologik.util.Arrays -
Compare two lists of objects for reference-equality. -
remove() - -Method in class morfologik.fsa.FSAFinalStatesIterator -
Not implemented in this iterator. -
remove() - -Method in class morfologik.stemming.DictionaryIterator -
  -
ResourceUtils - Class in morfologik.util
Resource management utilities.
restartFrom(int) - -Method in class morfologik.fsa.FSAFinalStatesIterator -
Restart walking from node. -
rightLanguage(FSA, int) - -Static method in class morfologik.fsa.FSAUtils -
All byte sequences generated as the right language of state. -
rightLanguageForAllStates(FSA) - -Static method in class morfologik.fsa.FSAUtils -
Calculate the size of right language for each state in an FSA. -
-
-

-S

-
-
separator - -Variable in class morfologik.stemming.DictionaryMetadata -
A separator character between fields (stem, lemma, form). -
SEQUENCE_IS_A_PREFIX - -Static variable in class morfologik.fsa.MatchResult -
The sequence is a prefix of at least one sequence in the automaton. -
serialize(FSA, T) - -Method in class morfologik.fsa.CFSA2Serializer -
Serializes any FSA to CFSA2 stream. -
serialize(FSA, T) - -Method in class morfologik.fsa.FSA5Serializer -
Serialize root state s to an output stream in - FSA5 format. -
serialize(FSA, T) - -Method in interface morfologik.fsa.FSASerializer -
Serialize a finite state automaton to an output stream. -
size - -Variable in class morfologik.fsa.FSAInfo -
Arcs size (in serialized form). -
standardEncode(byte[], byte[], byte[]) - -Method in class morfologik.tools.MorphEncoder -
This method converts the wordForm, wordLemma and tag to the form: - - - wordForm + Kending + tags - - - where '+' is a separator, K is a character that specifies how many - characters should be deleted from the end of the inflected form to - produce the lexeme by concatenating the stripped string with the ending. -
standardEncodeUTF8(String, String, String) - -Method in class morfologik.tools.MorphEncoder -
A UTF-8 variant of MorphEncoder.standardEncode(byte[], byte[], byte[]) This - method converts the wordForm, wordLemma and tag to the form: - - - wordForm + Kending + tags - - - where '+' is a separator, K is a character that specifies how many - characters should be deleted from the end of the inflected form to - produce the lexeme by concatenating the stripped string with the ending. -
startPart(String) - -Method in interface morfologik.tools.IMessageLogger -
Log message header and save current time. -
startPart(String) - -Method in class morfologik.tools.WriterMessageLogger -
  -
StateVisitor - Interface in morfologik.fsa
State visitor.
-
-

-T

-
-
TARGET_ADDRESS_SIZE - -Static variable in class morfologik.fsa.ConstantArcSizeFSA -
Size of the target address field (constant for the builder). -
toDot(FSA, int) - -Static method in class morfologik.fsa.FSAUtils -
Returns the right-language reachable from a given FSA node, formatted - as an input for the graphviz package (expressed in the dot - language). -
toDot(Writer, FSA, int) - -Static method in class morfologik.fsa.FSAUtils -
Saves the right-language reachable from a given FSA node, formatted - as an input for the graphviz package (expressed in the dot - language), to the given writer. -
toString() - -Method in enum morfologik.fsa.FSABuilder.InfoEntry -
  -
toString() - -Method in class morfologik.fsa.FSAInfo -
  -
toString(byte[], int, int) - -Static method in class morfologik.util.Arrays -
Convert an array of strings to bytes. -
toString(ByteBuffer) - -Static method in class morfologik.util.BufferUtils -
Convert a byte buffer to a string in platform default encoding. -
-
-

-U

-
-
usesInfixes - -Variable in class morfologik.stemming.DictionaryMetadata -
True if the dictionary was compiled with infix compression. -
usesPrefixes - -Variable in class morfologik.stemming.DictionaryMetadata -
True if the dictionary was compiled with prefix compression. -
-
-

-V

-
-
valueOf(String) - -Static method in enum morfologik.fsa.FSABuilder.InfoEntry -
Returns the enum constant of this type with the specified name. -
valueOf(String) - -Static method in enum morfologik.fsa.FSAFlags -
Returns the enum constant of this type with the specified name. -
valueOf(String) - -Static method in enum morfologik.tools.FSABuildTool.Format -
Returns the enum constant of this type with the specified name. -
values() - -Static method in enum morfologik.fsa.FSABuilder.InfoEntry -
Returns an array containing the constants of this enum type, in -the order they are declared. -
values() - -Static method in enum morfologik.fsa.FSAFlags -
Returns an array containing the constants of this enum type, in -the order they are declared. -
values() - -Static method in enum morfologik.tools.FSABuildTool.Format -
Returns an array containing the constants of this enum type, in -the order they are declared. -
VERSION - -Static variable in class morfologik.fsa.CFSA -
Automaton header version value. -
VERSION - -Static variable in class morfologik.fsa.CFSA2 -
Automaton header version value. -
VERSION - -Static variable in class morfologik.fsa.FSA5 -
Automaton version as in the file header. -
visitAllStates(T) - -Method in class morfologik.fsa.FSA -
Visit all states. -
visitInPostOrder(T) - -Method in class morfologik.fsa.FSA -
Same as FSA.visitInPostOrder(StateVisitor, int), - starting from root automaton node. -
visitInPostOrder(T, int) - -Method in class morfologik.fsa.FSA -
Visits all states reachable from node in postorder. -
visitInPreOrder(T) - -Method in class morfologik.fsa.FSA -
Same as FSA.visitInPreOrder(StateVisitor, int), starting from root automaton node. -
visitInPreOrder(T, int) - -Method in class morfologik.fsa.FSA -
Visits all states in preorder. -
-
-

-W

-
-
withAnnotationSeparator(byte) - -Method in class morfologik.fsa.CFSA2Serializer -
  -
withAnnotationSeparator(byte) - -Method in class morfologik.fsa.FSA5Serializer -
Supports built-in annotation separator. -
withAnnotationSeparator(byte) - -Method in interface morfologik.fsa.FSASerializer -
Supports built-in annotation separator. -
withFiller(byte) - -Method in class morfologik.fsa.CFSA2Serializer -
  -
withFiller(byte) - -Method in class morfologik.fsa.FSA5Serializer -
Supports built-in filler separator. -
withFiller(byte) - -Method in interface morfologik.fsa.FSASerializer -
Supports built-in filler separator. -
withLogger(IMessageLogger) - -Method in class morfologik.fsa.CFSA2Serializer -
  -
withLogger(IMessageLogger) - -Method in class morfologik.fsa.FSA5Serializer -
Log extra messages during construction. -
withLogger(IMessageLogger) - -Method in interface morfologik.fsa.FSASerializer -
Log extra messages during construction. -
withNumbers() - -Method in class morfologik.fsa.CFSA2Serializer -
Serialize the automaton with the number of right-language sequences in - each node. -
withNumbers() - -Method in class morfologik.fsa.FSA5Serializer -
Serialize the automaton with the number of right-language sequences in - each node. -
withNumbers() - -Method in interface morfologik.fsa.FSASerializer -
Supports built-in right language count on nodes, speeding up perfect hash counts. -
WordData - Class in morfologik.stemming
Stem and tag data associated with a given word.
writeInt(OutputStream, int) - -Static method in class morfologik.util.FileUtils -
  -
WriterMessageLogger - Class in morfologik.tools
A logger dumping info to System.err.
WriterMessageLogger(PrintWriter) - -Constructor for class morfologik.tools.WriterMessageLogger -
  -
writeShort(OutputStream, short) - -Static method in class morfologik.util.FileUtils -
  -
-
-A B C D E F G H I K L M N O P R S T U V W - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/index.html b/doc/api/index.html deleted file mode 100644 index 866e191..0000000 --- a/doc/api/index.html +++ /dev/null @@ -1,39 +0,0 @@ - - - - - - -Generated Documentation (Untitled) - - - - - - - - - - - -<H2> -Frame Alert</H2> - -<P> -This document is designed to be viewed using the frames feature. If you see this message, you are using a non-frame-capable web client. -<BR> -Link to<A HREF="overview-summary.html">Non-frame version.</A> - - - diff --git a/doc/api/morfologik/fsa/CFSA.html b/doc/api/morfologik/fsa/CFSA.html deleted file mode 100644 index 1b0b2ff..0000000 --- a/doc/api/morfologik/fsa/CFSA.html +++ /dev/null @@ -1,871 +0,0 @@ - - - - - - -CFSA - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class CFSA

-
-java.lang.Object
-  extended by morfologik.fsa.FSA
-      extended by morfologik.fsa.CFSA
-
-
-
All Implemented Interfaces:
java.lang.Iterable<java.nio.ByteBuffer>
-
-
-
-
public final class CFSA
extends FSA
- - -

-CFSA (Compact Finite State Automaton) binary format implementation. This is a - slightly reorganized version of FSA5 offering smaller automata size - at some (minor) performance penalty. - -

Note: Serialize to CFSA2 for new code.

- -

The encoding of automaton body is as follows.

- -
- ---- FSA header (standard)
- Byte                            Description 
-       +-+-+-+-+-+-+-+-+\
-     0 | | | | | | | | | +------ '\'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     1 | | | | | | | | | +------ 'f'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     2 | | | | | | | | | +------ 's'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     3 | | | | | | | | | +------ 'a'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     4 | | | | | | | | | +------ version (fixed 0xc5)
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     5 | | | | | | | | | +------ filler character
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     6 | | | | | | | | | +------ annot character
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     7 |C|C|C|C|G|G|G|G| +------ C - node data size (ctl), G - address size (gotoLength)
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-  8-32 | | | | | | | | | +------ labels mapped for type (1) of arc encoding. 
-       : : : : : : : : : |
-       +-+-+-+-+-+-+-+-+/
- 
- ---- Start of a node; only if automaton was compiled with NUMBERS option.
- 
- Byte
-        +-+-+-+-+-+-+-+-+\
-      0 | | | | | | | | | \  LSB
-        +-+-+-+-+-+-+-+-+  +
-      1 | | | | | | | | |  |      number of strings recognized
-        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
-        : : : : : : : : :  |      from this node.
-        +-+-+-+-+-+-+-+-+  +
-  ctl-1 | | | | | | | | | /  MSB
-        +-+-+-+-+-+-+-+-+/
-        
- ---- A vector of node's arcs. Conditional format, depending on flags.
- 
- 1) NEXT bit set, mapped arc label. 
- 
-                +--------------- arc's label mapped in M bits if M's field value > 0
-                | +------------- node pointed to is next
-                | | +----------- the last arc of the node
-         _______| | | +--------- the arc is final
-        /       | | | |
-       +-+-+-+-+-+-+-+-+\
-     0 |M|M|M|M|M|1|L|F| +------ flags + (M) index of the mapped label.
-       +-+-+-+-+-+-+-+-+/
- 
- 2) NEXT bit set, label separate.
- 
-                +--------------- arc's label stored separately (M's field is zero).
-                | +------------- node pointed to is next
-                | | +----------- the last arc of the node
-                | | | +--------- the arc is final
-                | | | |
-       +-+-+-+-+-+-+-+-+\
-     0 |0|0|0|0|0|1|L|F| +------ flags
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     1 | | | | | | | | | +------ label
-       +-+-+-+-+-+-+-+-+/
- 
- 3) NEXT bit not set. Full arc.
- 
-                  +------------- node pointed to is next
-                  | +----------- the last arc of the node
-                  | | +--------- the arc is final
-                  | | |
-       +-+-+-+-+-+-+-+-+\
-     0 |A|A|A|A|A|0|L|F| +------ flags + (A) address field, lower bits
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     1 | | | | | | | | | +------ label
-       +-+-+-+-+-+-+-+-+/
-       : : : : : : : : :       
-       +-+-+-+-+-+-+-+-+\
- gtl-1 |A|A|A|A|A|A|A|A| +------ address, continuation (MSB)
-       +-+-+-+-+-+-+-+-+/
- 
-

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
- byte[]arcs - -
-          An array of bytes with the internal representation of the automaton.
-static intBIT_FINAL_ARC - -
-          Bitmask indicating that an arc corresponds to the last character of a - sequence available when building the automaton.
-static intBIT_LAST_ARC - -
-          Bitmask indicating that an arc is the last one of the node's list and the - following one belongs to another node.
-static intBIT_TARGET_NEXT - -
-          Bitmask indicating that the target node of this arc follows it in the - compressed automaton structure (no goto field).
- intgtl - -
-          Number of bytes each address takes in full, expanded form (goto length).
- byte[]labelMapping - -
-          Label mapping for arcs of type (1) (see class documentation).
- intnodeDataLength - -
-          The length of the node header structure (if the automaton was compiled with - NUMBERS option).
-static byteVERSION - -
-          Automaton header version value.
-  - - - - - - - - - - -
-Constructor Summary
CFSA(java.io.InputStream fsaStream) - -
-          Creates a new automaton, reading it from a file in FSA format, version 5.
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- intgetArc(int node, - byte label) - -
-          
- bytegetArcLabel(int arc) - -
-          Return the label associated with a given arc.
- intgetEndNode(int arc) - -
-          Return the end node pointed to by a given arc.
- intgetFirstArc(int node) - -
-          
- java.util.Set<FSAFlags>getFlags() - -
-          Returns a set of flags for this FSA instance.
- intgetNextArc(int arc) - -
-          
- intgetRightLanguageCount(int node) - -
-          
- intgetRootNode() - -
-          Returns the start node of this automaton.
- booleanisArcFinal(int arc) - -
-          Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton.
- booleanisArcLast(int arc) - -
-          Returns true if this arc has NEXT bit set.
- booleanisArcTerminal(int arc) - -
-          Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception).
- booleanisLabelCompressed(int arc) - -
-          Returns true if the label is compressed inside flags byte.
- booleanisNextSet(int arc) - -
-           
- - - - - - - -
Methods inherited from class morfologik.fsa.FSA
getArcCount, getSequences, getSequences, iterator, read, visitAllStates, visitInPostOrder, visitInPostOrder, visitInPreOrder, visitInPreOrder
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-VERSION

-
-public static final byte VERSION
-
-
Automaton header version value. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_FINAL_ARC

-
-public static final int BIT_FINAL_ARC
-
-
Bitmask indicating that an arc corresponds to the last character of a - sequence available when building the automaton. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_LAST_ARC

-
-public static final int BIT_LAST_ARC
-
-
Bitmask indicating that an arc is the last one of the node's list and the - following one belongs to another node. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_TARGET_NEXT

-
-public static final int BIT_TARGET_NEXT
-
-
Bitmask indicating that the target node of this arc follows it in the - compressed automaton structure (no goto field). -

-

-
See Also:
Constant Field Values
-
-
- -

-arcs

-
-public byte[] arcs
-
-
An array of bytes with the internal representation of the automaton. - Please see the documentation of this class for more information on how - this structure is organized. -

-

-
-
-
- -

-nodeDataLength

-
-public final int nodeDataLength
-
-
The length of the node header structure (if the automaton was compiled with - NUMBERS option). Otherwise zero. -

-

-
-
-
- -

-gtl

-
-public final int gtl
-
-
Number of bytes each address takes in full, expanded form (goto length). -

-

-
-
-
- -

-labelMapping

-
-public final byte[] labelMapping
-
-
Label mapping for arcs of type (1) (see class documentation). The array - is indexed by mapped label's value and contains the original label. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-CFSA

-
-public CFSA(java.io.InputStream fsaStream)
-     throws java.io.IOException
-
-
Creates a new automaton, reading it from a file in FSA format, version 5. -

-

- -
Throws: -
java.io.IOException
-
- - - - - - - - -
-Method Detail
- -

-getRootNode

-
-public int getRootNode()
-
-
Returns the start node of this automaton. May return 0 if - the start node is also an end node. -

-

-
Specified by:
getRootNode in class FSA
-
-
- -
Returns:
Returns the identifier of the root node of this automaton. - Returns 0 if the start node is also the end node (the automaton - is empty).
-
-
-
- -

-getFirstArc

-
-public final int getFirstArc(int node)
-
-
-

-

-
Specified by:
getFirstArc in class FSA
-
-
- -
Returns:
Returns the identifier of the first arc leaving node - or 0 if the node has no outgoing arcs.
-
-
-
- -

-getNextArc

-
-public final int getNextArc(int arc)
-
-
-

-

-
Specified by:
getNextArc in class FSA
-
-
- -
Returns:
Returns the identifier of the next arc after arc and - leaving node. Zero is returned if no more arcs are - available for the node.
-
-
-
- -

-getArc

-
-public int getArc(int node,
-                  byte label)
-
-
-

-

-
Specified by:
getArc in class FSA
-
-
- -
Returns:
Returns the identifier of an arc leaving node and - labeled with label. An identifier equal to 0 means - the node has no outgoing arc labeled label.
-
-
-
- -

-getEndNode

-
-public int getEndNode(int arc)
-
-
Return the end node pointed to by a given arc. Terminal arcs - (those that point to a terminal state) have no end node representation - and throw a runtime exception. -

-

-
Specified by:
getEndNode in class FSA
-
-
-
-
-
-
- -

-getArcLabel

-
-public byte getArcLabel(int arc)
-
-
Return the label associated with a given arc. -

-

-
Specified by:
getArcLabel in class FSA
-
-
-
-
-
-
- -

-getRightLanguageCount

-
-public int getRightLanguageCount(int node)
-
-
-

-

-
Overrides:
getRightLanguageCount in class FSA
-
-
- -
Returns:
Returns the number of sequences reachable from the given state if - the automaton was compiled with FSAFlags.NUMBERS. The size of - the right language of the state, in other words.
-
-
-
- -

-isArcFinal

-
-public boolean isArcFinal(int arc)
-
-
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -

-

-
Specified by:
isArcFinal in class FSA
-
-
-
-
-
-
- -

-isArcTerminal

-
-public boolean isArcTerminal(int arc)
-
-
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). Implies FSA.isArcFinal(int). -

-

-
Specified by:
isArcTerminal in class FSA
-
-
-
-
-
-
- -

-isArcLast

-
-public boolean isArcLast(int arc)
-
-
Returns true if this arc has NEXT bit set. -

-

-
See Also:
BIT_LAST_ARC
-
-
-
- -

-isNextSet

-
-public boolean isNextSet(int arc)
-
-
-
See Also:
BIT_TARGET_NEXT
-
-
-
- -

-isLabelCompressed

-
-public boolean isLabelCompressed(int arc)
-
-
Returns true if the label is compressed inside flags byte. -

-

-
-
-
-
- -

-getFlags

-
-public java.util.Set<FSAFlags> getFlags()
-
-
Returns a set of flags for this FSA instance. - -

For this automaton version, an additional FSAFlags.NUMBERS flag - may be set to indicate the automaton contains extra fields for each node.

-

-

-
Specified by:
getFlags in class FSA
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/CFSA2.html b/doc/api/morfologik/fsa/CFSA2.html deleted file mode 100644 index 78c00ef..0000000 --- a/doc/api/morfologik/fsa/CFSA2.html +++ /dev/null @@ -1,779 +0,0 @@ - - - - - - -CFSA2 - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class CFSA2

-
-java.lang.Object
-  extended by morfologik.fsa.FSA
-      extended by morfologik.fsa.CFSA2
-
-
-
All Implemented Interfaces:
java.lang.Iterable<java.nio.ByteBuffer>
-
-
-
-
public final class CFSA2
extends FSA
- - -

-CFSA (Compact Finite State Automaton) binary format implementation, version 2: -

- -

The encoding of automaton body is as follows.

- -
- ---- CFSA header
- Byte                            Description 
-       +-+-+-+-+-+-+-+-+\
-     0 | | | | | | | | | +------ '\'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     1 | | | | | | | | | +------ 'f'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     2 | | | | | | | | | +------ 's'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     3 | | | | | | | | | +------ 'a'
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     4 | | | | | | | | | +------ version (fixed 0xc6)
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     5 | | | | | | | | | +----\
-       +-+-+-+-+-+-+-+-+/      \ flags [MSB first]
-       +-+-+-+-+-+-+-+-+\      /
-     6 | | | | | | | | | +----/
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     7 | | | | | | | | | +------ label lookup table size
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-  8-32 | | | | | | | | | +------ label value lookup table 
-       : : : : : : : : : |
-       +-+-+-+-+-+-+-+-+/
- 
- ---- Start of a node; only if automaton was compiled with NUMBERS option.
- 
- Byte
-        +-+-+-+-+-+-+-+-+\
-      0 | | | | | | | | | \  
-        +-+-+-+-+-+-+-+-+  +
-      1 | | | | | | | | |  |      number of strings recognized
-        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
-        : : : : : : : : :  |      from this node. v-coding
-        +-+-+-+-+-+-+-+-+  +
-        | | | | | | | | | /  
-        +-+-+-+-+-+-+-+-+/
-
- ---- A vector of this node's arcs. An arc's layout depends on the combination of flags.
- 
- 1) NEXT bit set, mapped arc label. 
- 
-        +----------------------- node pointed to is next
-        | +--------------------- the last arc of the node
-        | | +------------------- this arc leads to a final state (acceptor)
-        | | |  _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows
-        | | | / | | | |
-       +-+-+-+-+-+-+-+-+\
-     0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label.
-       +-+-+-+-+-+-+-+-+/
-       +-+-+-+-+-+-+-+-+\
-     1 | | | | | | | | | +------ optional label if M == 0
-       +-+-+-+-+-+-+-+-+/
-       : : : : : : : : :
-       +-+-+-+-+-+-+-+-+\
-       |A|A|A|A|A|A|A|A| +------ v-coded goto address
-       +-+-+-+-+-+-+-+-+/
- 
-

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
- byte[]arcs - -
-          An array of bytes with the internal representation of the automaton.
-static intBIT_FINAL_ARC - -
-          The arc corresponds to the last character of a sequence - available when building the automaton (acceptor transition).
-static intBIT_LAST_ARC - -
-          The arc is the last one from the current node's arcs list.
-static intBIT_TARGET_NEXT - -
-          The target node of this arc follows the last arc of the current state - (no goto field).
- byte[]labelMapping - -
-          Label mapping for M-indexed labels.
-static byteVERSION - -
-          Automaton header version value.
-  - - - - - - - - - - -
-Constructor Summary
CFSA2(java.io.InputStream in) - -
-          Reads an automaton from a byte stream.
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- intgetArc(int node, - byte label) - -
-          
- bytegetArcLabel(int arc) - -
-          Return the label associated with a given arc.
- intgetEndNode(int arc) - -
-          Return the end node pointed to by a given arc.
- intgetFirstArc(int node) - -
-          
- java.util.Set<FSAFlags>getFlags() - -
-          Returns a set of flags for this FSA instance.
- intgetNextArc(int arc) - -
-          
- intgetRightLanguageCount(int node) - -
-          
- intgetRootNode() - -
-          
- booleanisArcFinal(int arc) - -
-          Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton.
- booleanisArcLast(int arc) - -
-          Returns true if this arc has NEXT bit set.
- booleanisArcTerminal(int arc) - -
-          Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception).
- booleanisNextSet(int arc) - -
-           
- - - - - - - -
Methods inherited from class morfologik.fsa.FSA
getArcCount, getSequences, getSequences, iterator, read, visitAllStates, visitInPostOrder, visitInPostOrder, visitInPreOrder, visitInPreOrder
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-VERSION

-
-public static final byte VERSION
-
-
Automaton header version value. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_TARGET_NEXT

-
-public static final int BIT_TARGET_NEXT
-
-
The target node of this arc follows the last arc of the current state - (no goto field). -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_LAST_ARC

-
-public static final int BIT_LAST_ARC
-
-
The arc is the last one from the current node's arcs list. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_FINAL_ARC

-
-public static final int BIT_FINAL_ARC
-
-
The arc corresponds to the last character of a sequence - available when building the automaton (acceptor transition). -

-

-
See Also:
Constant Field Values
-
-
- -

-arcs

-
-public byte[] arcs
-
-
An array of bytes with the internal representation of the automaton. - Please see the documentation of this class for more information on how - this structure is organized. -

-

-
-
-
- -

-labelMapping

-
-public final byte[] labelMapping
-
-
Label mapping for M-indexed labels. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-CFSA2

-
-public CFSA2(java.io.InputStream in)
-      throws java.io.IOException
-
-
Reads an automaton from a byte stream. -

-

- -
Throws: -
java.io.IOException
-
- - - - - - - - -
-Method Detail
- -

-getRootNode

-
-public int getRootNode()
-
-
-

-

-
Specified by:
getRootNode in class FSA
-
-
- -
Returns:
Returns the identifier of the root node of this automaton. - Returns 0 if the start node is also the end node (the automaton - is empty).
-
-
-
- -

-getFirstArc

-
-public final int getFirstArc(int node)
-
-
-

-

-
Specified by:
getFirstArc in class FSA
-
-
- -
Returns:
Returns the identifier of the first arc leaving node - or 0 if the node has no outgoing arcs.
-
-
-
- -

-getNextArc

-
-public final int getNextArc(int arc)
-
-
-

-

-
Specified by:
getNextArc in class FSA
-
-
- -
Returns:
Returns the identifier of the next arc after arc and - leaving node. Zero is returned if no more arcs are - available for the node.
-
-
-
- -

-getArc

-
-public int getArc(int node,
-                  byte label)
-
-
-

-

-
Specified by:
getArc in class FSA
-
-
- -
Returns:
Returns the identifier of an arc leaving node and - labeled with label. An identifier equal to 0 means - the node has no outgoing arc labeled label.
-
-
-
- -

-getEndNode

-
-public int getEndNode(int arc)
-
-
Return the end node pointed to by a given arc. Terminal arcs - (those that point to a terminal state) have no end node representation - and throw a runtime exception. -

-

-
Specified by:
getEndNode in class FSA
-
-
-
-
-
-
- -

-getArcLabel

-
-public byte getArcLabel(int arc)
-
-
Return the label associated with a given arc. -

-

-
Specified by:
getArcLabel in class FSA
-
-
-
-
-
-
- -

-getRightLanguageCount

-
-public int getRightLanguageCount(int node)
-
-
-

-

-
Overrides:
getRightLanguageCount in class FSA
-
-
- -
Returns:
Returns the number of sequences reachable from the given state if - the automaton was compiled with FSAFlags.NUMBERS. The size of - the right language of the state, in other words.
-
-
-
- -

-isArcFinal

-
-public boolean isArcFinal(int arc)
-
-
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -

-

-
Specified by:
isArcFinal in class FSA
-
-
-
-
-
-
- -

-isArcTerminal

-
-public boolean isArcTerminal(int arc)
-
-
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). Implies FSA.isArcFinal(int). -

-

-
Specified by:
isArcTerminal in class FSA
-
-
-
-
-
-
- -

-isArcLast

-
-public boolean isArcLast(int arc)
-
-
Returns true if this arc has NEXT bit set. -

-

-
See Also:
BIT_LAST_ARC
-
-
-
- -

-isNextSet

-
-public boolean isNextSet(int arc)
-
-
-
See Also:
BIT_TARGET_NEXT
-
-
-
- -

-getFlags

-
-public java.util.Set<FSAFlags> getFlags()
-
-
Returns a set of flags for this FSA instance. -

-

-
Specified by:
getFlags in class FSA
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/CFSA2Serializer.html b/doc/api/morfologik/fsa/CFSA2Serializer.html deleted file mode 100644 index 159fe3e..0000000 --- a/doc/api/morfologik/fsa/CFSA2Serializer.html +++ /dev/null @@ -1,414 +0,0 @@ - - - - - - -CFSA2Serializer - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class CFSA2Serializer

-
-java.lang.Object
-  extended by morfologik.fsa.CFSA2Serializer
-
-
-
All Implemented Interfaces:
FSASerializer
-
-
-
-
public final class CFSA2Serializer
extends java.lang.Object
implements FSASerializer
- - -

-Serializes in-memory FSA graphs to CFSA2. - -

- It is possible to serialize the automaton with numbers required for perfect - hashing. See withNumbers() method. -

-

- -

-

-
See Also:
CFSA2, -FSA.read(java.io.InputStream)
-
- -

- - - - - - - - - - - -
-Constructor Summary
CFSA2Serializer() - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- java.util.Set<FSAFlags>getFlags() - -
-          Return supported flags.
- - - - - -
-<T extends java.io.OutputStream> -
-T
-
serialize(FSA fsa, - T os) - -
-          Serializes any FSA to CFSA2 stream.
- CFSA2SerializerwithAnnotationSeparator(byte annotationSeparator) - -
-          Supports built-in annotation separator.
- CFSA2SerializerwithFiller(byte filler) - -
-          Supports built-in filler separator.
- CFSA2SerializerwithLogger(IMessageLogger logger) - -
-          Log extra messages during construction.
- CFSA2SerializerwithNumbers() - -
-          Serialize the automaton with the number of right-language sequences in - each node.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-CFSA2Serializer

-
-public CFSA2Serializer()
-
-
- - - - - - - - -
-Method Detail
- -

-withNumbers

-
-public CFSA2Serializer withNumbers()
-
-
Serialize the automaton with the number of right-language sequences in - each node. This is required to implement perfect hashing. The numbering - also preserves the order of input sequences. -

-

-
Specified by:
withNumbers in interface FSASerializer
-
-
- -
Returns:
Returns the same object for easier call chaining.
-
-
-
- -

-serialize

-
-public <T extends java.io.OutputStream> T serialize(FSA fsa,
-                                                    T os)
-                                         throws java.io.IOException
-
-
Serializes any FSA to CFSA2 stream. -

-

-
Specified by:
serialize in interface FSASerializer
-
-
- -
Returns:
Returns os for chaining. -
Throws: -
java.io.IOException
See Also:
withNumbers
-
-
-
- -

-getFlags

-
-public java.util.Set<FSAFlags> getFlags()
-
-
Return supported flags. -

-

-
Specified by:
getFlags in interface FSASerializer
-
-
-
-
-
-
- -

-withFiller

-
-public CFSA2Serializer withFiller(byte filler)
-
-
Description copied from interface: FSASerializer
-
Supports built-in filler separator. Only if FSASerializer.getFlags() returns - FSAFlags.SEPARATORS. -

-

-
Specified by:
withFiller in interface FSASerializer
-
-
-
-
-
-
- -

-withAnnotationSeparator

-
-public CFSA2Serializer withAnnotationSeparator(byte annotationSeparator)
-
-
Description copied from interface: FSASerializer
-
Supports built-in annotation separator. Only if FSASerializer.getFlags() returns - FSAFlags.SEPARATORS. -

-

-
Specified by:
withAnnotationSeparator in interface FSASerializer
-
-
-
-
-
-
- -

-withLogger

-
-public CFSA2Serializer withLogger(IMessageLogger logger)
-
-
Description copied from interface: FSASerializer
-
Log extra messages during construction. -

-

-
Specified by:
withLogger in interface FSASerializer
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/ConstantArcSizeFSA.html b/doc/api/morfologik/fsa/ConstantArcSizeFSA.html deleted file mode 100644 index e73b13a..0000000 --- a/doc/api/morfologik/fsa/ConstantArcSizeFSA.html +++ /dev/null @@ -1,654 +0,0 @@ - - - - - - -ConstantArcSizeFSA - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class ConstantArcSizeFSA

-
-java.lang.Object
-  extended by morfologik.fsa.FSA
-      extended by morfologik.fsa.ConstantArcSizeFSA
-
-
-
All Implemented Interfaces:
java.lang.Iterable<java.nio.ByteBuffer>
-
-
-
-
public final class ConstantArcSizeFSA
extends FSA
- - -

-An FSA with constant-size arc representation produced directly - by FSABuilder. -

- -

-

-
See Also:
FSABuilder
-
- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
-static intADDRESS_OFFSET - -
-          Offset of the address field inside an arc.
-static intARC_SIZE - -
-          Size of a single arc structure.
-static intBIT_ARC_FINAL - -
-          An arc flag indicating the target node of an arc corresponds to a final - state.
-static intBIT_ARC_LAST - -
-          An arc flag indicating the arc is last within its state.
-static intFLAGS_OFFSET - -
-          Offset of the flags field inside an arc.
-static intFLAGS_SIZE - -
-          Size of the flags field (constant for the builder).
-static intLABEL_OFFSET - -
-          Offset of the label field inside an arc.
-static intLABEL_SIZE - -
-          Size of the label field (constant for the builder).
-static intTARGET_ADDRESS_SIZE - -
-          Size of the target address field (constant for the builder).
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- intgetArc(int node, - byte label) - -
-           
- bytegetArcLabel(int arc) - -
-          Return the label associated with a given arc.
- intgetEndNode(int arc) - -
-          Return the end node pointed to by a given arc.
- intgetFirstArc(int node) - -
-           
- java.util.Set<FSAFlags>getFlags() - -
-          Returns a set of flags for this FSA instance.
- intgetNextArc(int arc) - -
-           
- intgetRootNode() - -
-           
- booleanisArcFinal(int arc) - -
-          Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton.
- booleanisArcTerminal(int arc) - -
-          Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception).
- - - - - - - -
Methods inherited from class morfologik.fsa.FSA
getArcCount, getRightLanguageCount, getSequences, getSequences, iterator, read, visitAllStates, visitInPostOrder, visitInPostOrder, visitInPreOrder, visitInPreOrder
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-TARGET_ADDRESS_SIZE

-
-public static final int TARGET_ADDRESS_SIZE
-
-
Size of the target address field (constant for the builder). -

-

-
See Also:
Constant Field Values
-
-
- -

-FLAGS_SIZE

-
-public static final int FLAGS_SIZE
-
-
Size of the flags field (constant for the builder). -

-

-
See Also:
Constant Field Values
-
-
- -

-LABEL_SIZE

-
-public static final int LABEL_SIZE
-
-
Size of the label field (constant for the builder). -

-

-
See Also:
Constant Field Values
-
-
- -

-ARC_SIZE

-
-public static final int ARC_SIZE
-
-
Size of a single arc structure. -

-

-
See Also:
Constant Field Values
-
-
- -

-FLAGS_OFFSET

-
-public static final int FLAGS_OFFSET
-
-
Offset of the flags field inside an arc. -

-

-
See Also:
Constant Field Values
-
-
- -

-LABEL_OFFSET

-
-public static final int LABEL_OFFSET
-
-
Offset of the label field inside an arc. -

-

-
See Also:
Constant Field Values
-
-
- -

-ADDRESS_OFFSET

-
-public static final int ADDRESS_OFFSET
-
-
Offset of the address field inside an arc. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_ARC_FINAL

-
-public static final int BIT_ARC_FINAL
-
-
An arc flag indicating the target node of an arc corresponds to a final - state. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_ARC_LAST

-
-public static final int BIT_ARC_LAST
-
-
An arc flag indicating the arc is last within its state. -

-

-
See Also:
Constant Field Values
-
- - - - - - - - -
-Method Detail
- -

-getRootNode

-
-public int getRootNode()
-
-
-
Specified by:
getRootNode in class FSA
-
-
- -
Returns:
Returns the identifier of the root node of this automaton. - Returns 0 if the start node is also the end node (the automaton - is empty).
-
-
-
- -

-getFirstArc

-
-public int getFirstArc(int node)
-
-
-
Specified by:
getFirstArc in class FSA
-
-
- -
Returns:
Returns the identifier of the first arc leaving node - or 0 if the node has no outgoing arcs.
-
-
-
- -

-getArc

-
-public int getArc(int node,
-                  byte label)
-
-
-
Specified by:
getArc in class FSA
-
-
- -
Returns:
Returns the identifier of an arc leaving node and - labeled with label. An identifier equal to 0 means - the node has no outgoing arc labeled label.
-
-
-
- -

-getNextArc

-
-public int getNextArc(int arc)
-
-
-
Specified by:
getNextArc in class FSA
-
-
- -
Returns:
Returns the identifier of the next arc after arc and - leaving node. Zero is returned if no more arcs are - available for the node.
-
-
-
- -

-getArcLabel

-
-public byte getArcLabel(int arc)
-
-
Description copied from class: FSA
-
Return the label associated with a given arc. -

-

-
Specified by:
getArcLabel in class FSA
-
-
-
-
-
-
- -

-isArcFinal

-
-public boolean isArcFinal(int arc)
-
-
Description copied from class: FSA
-
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -

-

-
Specified by:
isArcFinal in class FSA
-
-
-
-
-
-
- -

-isArcTerminal

-
-public boolean isArcTerminal(int arc)
-
-
Description copied from class: FSA
-
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). Implies FSA.isArcFinal(int). -

-

-
Specified by:
isArcTerminal in class FSA
-
-
-
-
-
-
- -

-getEndNode

-
-public int getEndNode(int arc)
-
-
Description copied from class: FSA
-
Return the end node pointed to by a given arc. Terminal arcs - (those that point to a terminal state) have no end node representation - and throw a runtime exception. -

-

-
Specified by:
getEndNode in class FSA
-
-
-
-
-
-
- -

-getFlags

-
-public java.util.Set<FSAFlags> getFlags()
-
-
Description copied from class: FSA
-
Returns a set of flags for this FSA instance. -

-

-
Specified by:
getFlags in class FSA
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSA.html b/doc/api/morfologik/fsa/FSA.html deleted file mode 100644 index b75dd1a..0000000 --- a/doc/api/morfologik/fsa/FSA.html +++ /dev/null @@ -1,855 +0,0 @@ - - - - - - -FSA - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSA

-
-java.lang.Object
-  extended by morfologik.fsa.FSA
-
-
-
All Implemented Interfaces:
java.lang.Iterable<java.nio.ByteBuffer>
-
-
-
Direct Known Subclasses:
CFSA, CFSA2, ConstantArcSizeFSA, FSA5
-
-
-
-
public abstract class FSA
extends java.lang.Object
implements java.lang.Iterable<java.nio.ByteBuffer>
- - -

-This is a top abstract class for handling finite state automata. These - automata are arc-based, a design described in Jan Daciuk's Incremental - Construction of Finite-State Automata and Transducers, and Their Use in the - Natural Language Processing (PhD thesis, Technical University of Gdansk). - -

- Concrete subclasses (implementations) provide varying tradeoffs and features: - traversal speed vs. memory size, for example. -

-

- -

-

-
See Also:
FSABuilder
-
- -

- - - - - - - - - - - -
-Constructor Summary
FSA() - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-abstract  intgetArc(int node, - byte label) - -
-           
- intgetArcCount(int node) - -
-          Calculates the number of arcs of a given node.
-abstract  bytegetArcLabel(int arc) - -
-          Return the label associated with a given arc.
-abstract  intgetEndNode(int arc) - -
-          Return the end node pointed to by a given arc.
-abstract  intgetFirstArc(int node) - -
-           
-abstract  java.util.Set<FSAFlags>getFlags() - -
-          Returns a set of flags for this FSA instance.
-abstract  intgetNextArc(int arc) - -
-           
- intgetRightLanguageCount(int node) - -
-           
-abstract  intgetRootNode() - -
-           
- java.lang.Iterable<java.nio.ByteBuffer>getSequences() - -
-          An alias of calling iterator() directly (FSA is also - Iterable).
- java.lang.Iterable<java.nio.ByteBuffer>getSequences(int node) - -
-          Returns an iterator over all binary sequences starting at the given FSA - state (node) and ending in final nodes.
-abstract  booleanisArcFinal(int arc) - -
-          Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton.
-abstract  booleanisArcTerminal(int arc) - -
-          Returns true if this arc does not have a - terminating node (@link getEndNode(int) will throw an - exception).
- java.util.Iterator<java.nio.ByteBuffer>iterator() - -
-          Returns an iterator over all binary sequences starting from the initial - FSA state (node) and ending in final nodes.
-static - - - - -
-<T extends FSA> -
-T
-
read(java.io.InputStream in) - -
-          A factory for reading automata in any of the supported versions.
- - - - - -
-<T extends StateVisitor> -
-T
-
visitAllStates(T v) - -
-          Visit all states.
- - - - - -
-<T extends StateVisitor> -
-T
-
visitInPostOrder(T v) - -
-          Same as visitInPostOrder(StateVisitor, int), - starting from root automaton node.
- - - - - -
-<T extends StateVisitor> -
-T
-
visitInPostOrder(T v, - int node) - -
-          Visits all states reachable from node in postorder.
- - - - - -
-<T extends StateVisitor> -
-T
-
visitInPreOrder(T v) - -
-          Same as visitInPreOrder(StateVisitor, int), starting from root automaton node.
- - - - - -
-<T extends StateVisitor> -
-T
-
visitInPreOrder(T v, - int node) - -
-          Visits all states in preorder.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-FSA

-
-public FSA()
-
-
- - - - - - - - -
-Method Detail
- -

-getRootNode

-
-public abstract int getRootNode()
-
-
-
-
-
- -
Returns:
Returns the identifier of the root node of this automaton. - Returns 0 if the start node is also the end node (the automaton - is empty).
-
-
-
- -

-getFirstArc

-
-public abstract int getFirstArc(int node)
-
-
-
-
-
- -
Returns:
Returns the identifier of the first arc leaving node - or 0 if the node has no outgoing arcs.
-
-
-
- -

-getNextArc

-
-public abstract int getNextArc(int arc)
-
-
-
-
-
- -
Returns:
Returns the identifier of the next arc after arc and - leaving node. Zero is returned if no more arcs are - available for the node.
-
-
-
- -

-getArc

-
-public abstract int getArc(int node,
-                           byte label)
-
-
-
-
-
- -
Returns:
Returns the identifier of an arc leaving node and - labeled with label. An identifier equal to 0 means - the node has no outgoing arc labeled label.
-
-
-
- -

-getArcLabel

-
-public abstract byte getArcLabel(int arc)
-
-
Return the label associated with a given arc. -

-

-
-
-
-
-
-
-
- -

-isArcFinal

-
-public abstract boolean isArcFinal(int arc)
-
-
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -

-

-
-
-
-
-
-
-
- -

-isArcTerminal

-
-public abstract boolean isArcTerminal(int arc)
-
-
Returns true if this arc does not have a - terminating node (@link getEndNode(int) will throw an - exception). Implies isArcFinal(int). -

-

-
-
-
-
-
-
-
- -

-getEndNode

-
-public abstract int getEndNode(int arc)
-
-
Return the end node pointed to by a given arc. Terminal arcs - (those that point to a terminal state) have no end node representation - and throw a runtime exception. -

-

-
-
-
-
-
-
-
- -

-getFlags

-
-public abstract java.util.Set<FSAFlags> getFlags()
-
-
Returns a set of flags for this FSA instance. -

-

-
-
-
-
-
-
-
- -

-getArcCount

-
-public int getArcCount(int node)
-
-
Calculates the number of arcs of a given node. Unless really required, - use the following idiom for looping through all arcs: -
- for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
- }
- 
-

-

-
-
-
-
-
-
-
- -

-getRightLanguageCount

-
-public int getRightLanguageCount(int node)
-
-
-
-
-
- -
Returns:
Returns the number of sequences reachable from the given state if - the automaton was compiled with FSAFlags.NUMBERS. The size of - the right language of the state, in other words. -
Throws: -
java.lang.UnsupportedOperationException - If the automaton was not compiled with - FSAFlags.NUMBERS. The value can then be computed by manual count - of getSequences(int).
-
-
-
- -

-getSequences

-
-public java.lang.Iterable<java.nio.ByteBuffer> getSequences(int node)
-
-
Returns an iterator over all binary sequences starting at the given FSA - state (node) and ending in final nodes. This corresponds to a set of - suffixes of a given prefix from all sequences stored in the automaton. - -

- The returned iterator is a ByteBuffer whose contents changes on - each call to Iterator.next(). The keep the contents between calls - to Iterator.next(), one must copy the buffer to some other - location. -

- -

- Important. It is guaranteed that the returned byte buffer is - backed by a byte array and that the content of the byte buffer starts at - the array's index 0. -

-

-

-
-
-
-
See Also:
Iterable
-
-
-
- -

-getSequences

-
-public final java.lang.Iterable<java.nio.ByteBuffer> getSequences()
-
-
An alias of calling iterator() directly (FSA is also - Iterable). -

-

-
-
-
-
-
-
-
- -

-iterator

-
-public final java.util.Iterator<java.nio.ByteBuffer> iterator()
-
-
Returns an iterator over all binary sequences starting from the initial - FSA state (node) and ending in final nodes. The returned iterator is a - ByteBuffer whose contents changes on each call to - Iterator.next(). The keep the contents between calls to - Iterator.next(), one must copy the buffer to some other location. - -

- Important. It is guaranteed that the returned byte buffer is - backed by a byte array and that the content of the byte buffer starts at - the array's index 0. -

-

-

-
Specified by:
iterator in interface java.lang.Iterable<java.nio.ByteBuffer>
-
-
-
See Also:
Iterable
-
-
-
- -

-visitAllStates

-
-public <T extends StateVisitor> T visitAllStates(T v)
-
-
Visit all states. The order of visiting is undefined. This method may be faster - than traversing the automaton in post or preorder since it can scan states - linearly. Returning false from StateVisitor.accept(int) - immediately terminates the traversal. -

-

-
-
-
-
-
-
-
- -

-visitInPostOrder

-
-public <T extends StateVisitor> T visitInPostOrder(T v)
-
-
Same as visitInPostOrder(StateVisitor, int), - starting from root automaton node. -

-

-
-
-
-
-
-
-
- -

-visitInPostOrder

-
-public <T extends StateVisitor> T visitInPostOrder(T v,
-                                                   int node)
-
-
Visits all states reachable from node in postorder. - Returning false from StateVisitor.accept(int) - immediately terminates the traversal. -

-

-
-
-
-
-
-
-
- -

-visitInPreOrder

-
-public <T extends StateVisitor> T visitInPreOrder(T v)
-
-
Same as visitInPreOrder(StateVisitor, int), starting from root automaton node. -

-

-
-
-
-
-
-
-
- -

-visitInPreOrder

-
-public <T extends StateVisitor> T visitInPreOrder(T v,
-                                                  int node)
-
-
Visits all states in preorder. Returning false from StateVisitor.accept(int) - skips traversal of all sub-states of a given state. -

-

-
-
-
-
-
-
-
- -

-read

-
-public static <T extends FSA> T read(java.io.InputStream in)
-                          throws java.io.IOException
-
-
A factory for reading automata in any of the supported versions. If - possible, explicit constructors should be used. -

-

-
-
-
- -
Throws: -
java.io.IOException
See Also:
FSA5.FSA5(InputStream)
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSA5.html b/doc/api/morfologik/fsa/FSA5.html deleted file mode 100644 index c6ae2cf..0000000 --- a/doc/api/morfologik/fsa/FSA5.html +++ /dev/null @@ -1,887 +0,0 @@ - - - - - - -FSA5 - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSA5

-
-java.lang.Object
-  extended by morfologik.fsa.FSA
-      extended by morfologik.fsa.FSA5
-
-
-
All Implemented Interfaces:
java.lang.Iterable<java.nio.ByteBuffer>
-
-
-
-
public final class FSA5
extends FSA
- - -

-FSA binary format implementation for version 5. - -

- Version 5 indicates the dictionary was built with these flags: - FSAFlags.FLEXIBLE, FSAFlags.STOPBIT and - FSAFlags.NEXTBIT. The internal representation of the FSA must - therefore follow this description (please note this format describes only a - single transition (arc), not the entire dictionary file). - -

- ---- this node header present only if automaton was compiled with NUMBERS option.
- Byte
-        +-+-+-+-+-+-+-+-+\
-      0 | | | | | | | | | \  LSB
-        +-+-+-+-+-+-+-+-+  +
-      1 | | | | | | | | |  |      number of strings recognized
-        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
-        : : : : : : : : :  |      from this node.
-        +-+-+-+-+-+-+-+-+  +
-  ctl-1 | | | | | | | | | /  MSB
-        +-+-+-+-+-+-+-+-+/
-        
- ---- remaining part of the node
- 
- Byte
-       +-+-+-+-+-+-+-+-+\
-     0 | | | | | | | | | +------ label
-       +-+-+-+-+-+-+-+-+/
- 
-                  +------------- node pointed to is next
-                  | +----------- the last arc of the node
-                  | | +--------- the arc is final
-                  | | |
-             +-----------+
-             |    | | |  |
-         ___+___  | | |  |
-        /       \ | | |  |
-       MSB           LSB |
-        7 6 5 4 3 2 1 0  |
-       +-+-+-+-+-+-+-+-+ |
-     1 | | | | | | | | | \ \
-       +-+-+-+-+-+-+-+-+  \ \  LSB
-       +-+-+-+-+-+-+-+-+     +
-     2 | | | | | | | | |     |
-       +-+-+-+-+-+-+-+-+     |
-     3 | | | | | | | | |     +----- target node address (in bytes)
-       +-+-+-+-+-+-+-+-+     |      (not present except for the byte
-       : : : : : : : : :     |       with flags if the node pointed to
-       +-+-+-+-+-+-+-+-+     +       is next)
-   gtl | | | | | | | | |    /  MSB
-       +-+-+-+-+-+-+-+-+   /
- gtl+1                           (gtl = gotoLength)
- 
-

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
-static intADDRESS_OFFSET - -
-          An offset in the arc structure, where the address and flags field begins.
- byteannotation - -
-          Annotation character.
- byte[]arcs - -
-          An array of bytes with the internal representation of the automaton.
-static intBIT_FINAL_ARC - -
-          Bit indicating that an arc corresponds to the last character of a - sequence available when building the automaton.
-static intBIT_LAST_ARC - -
-          Bit indicating that an arc is the last one of the node's list and the - following one belongs to another node.
-static intBIT_TARGET_NEXT - -
-          Bit indicating that the target node of this arc follows it in the - compressed automaton structure (no goto field).
-static byteDEFAULT_ANNOTATION - -
-          Default annotation byte.
-static byteDEFAULT_FILLER - -
-          Default filler byte.
- bytefiller - -
-          Filler character.
- intgtl - -
-          Number of bytes each address takes in full, expanded form (goto length).
- intnodeDataLength - -
-          The length of the node header structure (if the automaton was compiled with - NUMBERS option).
-static byteVERSION - -
-          Automaton version as in the file header.
-  - - - - - - - - - - -
-Constructor Summary
FSA5(java.io.InputStream fsaStream) - -
-          Read and wrap a binary automaton in FSA version 5.
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- intgetArc(int node, - byte label) - -
-          
- bytegetArcLabel(int arc) - -
-          Return the label associated with a given arc.
- intgetEndNode(int arc) - -
-          Return the end node pointed to by a given arc.
- intgetFirstArc(int node) - -
-          
- java.util.Set<FSAFlags>getFlags() - -
-          Returns a set of flags for this FSA instance.
- intgetNextArc(int arc) - -
-          
- intgetRightLanguageCount(int node) - -
-          Returns the number encoded at the given node.
- intgetRootNode() - -
-          Returns the start node of this automaton.
- booleanisArcFinal(int arc) - -
-          Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton.
- booleanisArcLast(int arc) - -
-          Returns true if this arc has LAST bit set.
- booleanisArcTerminal(int arc) - -
-          Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception).
- booleanisNextSet(int arc) - -
-           
- - - - - - - -
Methods inherited from class morfologik.fsa.FSA
getArcCount, getSequences, getSequences, iterator, read, visitAllStates, visitInPostOrder, visitInPostOrder, visitInPreOrder, visitInPreOrder
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-DEFAULT_FILLER

-
-public static final byte DEFAULT_FILLER
-
-
Default filler byte. -

-

-
See Also:
Constant Field Values
-
-
- -

-DEFAULT_ANNOTATION

-
-public static final byte DEFAULT_ANNOTATION
-
-
Default annotation byte. -

-

-
See Also:
Constant Field Values
-
-
- -

-VERSION

-
-public static final byte VERSION
-
-
Automaton version as in the file header. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_FINAL_ARC

-
-public static final int BIT_FINAL_ARC
-
-
Bit indicating that an arc corresponds to the last character of a - sequence available when building the automaton. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_LAST_ARC

-
-public static final int BIT_LAST_ARC
-
-
Bit indicating that an arc is the last one of the node's list and the - following one belongs to another node. -

-

-
See Also:
Constant Field Values
-
-
- -

-BIT_TARGET_NEXT

-
-public static final int BIT_TARGET_NEXT
-
-
Bit indicating that the target node of this arc follows it in the - compressed automaton structure (no goto field). -

-

-
See Also:
Constant Field Values
-
-
- -

-ADDRESS_OFFSET

-
-public static final int ADDRESS_OFFSET
-
-
An offset in the arc structure, where the address and flags field begins. - In version 5 of FSA automata, this value is constant (1, skip label). -

-

-
See Also:
Constant Field Values
-
-
- -

-arcs

-
-public final byte[] arcs
-
-
An array of bytes with the internal representation of the automaton. - Please see the documentation of this class for more information on how - this structure is organized. -

-

-
-
-
- -

-nodeDataLength

-
-public final int nodeDataLength
-
-
The length of the node header structure (if the automaton was compiled with - NUMBERS option). Otherwise zero. -

-

-
-
-
- -

-gtl

-
-public final int gtl
-
-
Number of bytes each address takes in full, expanded form (goto length). -

-

-
-
-
- -

-filler

-
-public final byte filler
-
-
Filler character. -

-

-
-
-
- -

-annotation

-
-public final byte annotation
-
-
Annotation character. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-FSA5

-
-public FSA5(java.io.InputStream fsaStream)
-     throws java.io.IOException
-
-
Read and wrap a binary automaton in FSA version 5. -

-

- -
Throws: -
java.io.IOException
-
- - - - - - - - -
-Method Detail
- -

-getRootNode

-
-public int getRootNode()
-
-
Returns the start node of this automaton. -

-

-
Specified by:
getRootNode in class FSA
-
-
- -
Returns:
Returns the identifier of the root node of this automaton. - Returns 0 if the start node is also the end node (the automaton - is empty).
-
-
-
- -

-getFirstArc

-
-public final int getFirstArc(int node)
-
-
-

-

-
Specified by:
getFirstArc in class FSA
-
-
- -
Returns:
Returns the identifier of the first arc leaving node - or 0 if the node has no outgoing arcs.
-
-
-
- -

-getNextArc

-
-public final int getNextArc(int arc)
-
-
-

-

-
Specified by:
getNextArc in class FSA
-
-
- -
Returns:
Returns the identifier of the next arc after arc and - leaving node. Zero is returned if no more arcs are - available for the node.
-
-
-
- -

-getArc

-
-public int getArc(int node,
-                  byte label)
-
-
-

-

-
Specified by:
getArc in class FSA
-
-
- -
Returns:
Returns the identifier of an arc leaving node and - labeled with label. An identifier equal to 0 means - the node has no outgoing arc labeled label.
-
-
-
- -

-getEndNode

-
-public int getEndNode(int arc)
-
-
Return the end node pointed to by a given arc. Terminal arcs - (those that point to a terminal state) have no end node representation - and throw a runtime exception. -

-

-
Specified by:
getEndNode in class FSA
-
-
-
-
-
-
- -

-getArcLabel

-
-public byte getArcLabel(int arc)
-
-
Return the label associated with a given arc. -

-

-
Specified by:
getArcLabel in class FSA
-
-
-
-
-
-
- -

-isArcFinal

-
-public boolean isArcFinal(int arc)
-
-
Returns true if the destination node at the end of this - arc corresponds to an input sequence created when building - this automaton. -

-

-
Specified by:
isArcFinal in class FSA
-
-
-
-
-
-
- -

-isArcTerminal

-
-public boolean isArcTerminal(int arc)
-
-
Returns true if this arc does not have a - terminating node (@link FSA.getEndNode(int) will throw an - exception). Implies FSA.isArcFinal(int). -

-

-
Specified by:
isArcTerminal in class FSA
-
-
-
-
-
-
- -

-getRightLanguageCount

-
-public int getRightLanguageCount(int node)
-
-
Returns the number encoded at the given node. The number equals the count - of the set of suffixes reachable from node (called its right - language). -

-

-
Overrides:
getRightLanguageCount in class FSA
-
-
- -
Returns:
Returns the number of sequences reachable from the given state if - the automaton was compiled with FSAFlags.NUMBERS. The size of - the right language of the state, in other words.
-
-
-
- -

-getFlags

-
-public java.util.Set<FSAFlags> getFlags()
-
-
Returns a set of flags for this FSA instance. - -

For this automaton version, an additional FSAFlags.NUMBERS flag - may be set to indicate the automaton contains extra fields for each node.

-

-

-
Specified by:
getFlags in class FSA
-
-
-
-
-
-
- -

-isArcLast

-
-public boolean isArcLast(int arc)
-
-
Returns true if this arc has LAST bit set. -

-

-
See Also:
BIT_LAST_ARC
-
-
-
- -

-isNextSet

-
-public boolean isNextSet(int arc)
-
-
-
See Also:
BIT_TARGET_NEXT
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSA5Serializer.html b/doc/api/morfologik/fsa/FSA5Serializer.html deleted file mode 100644 index 8a7a237..0000000 --- a/doc/api/morfologik/fsa/FSA5Serializer.html +++ /dev/null @@ -1,468 +0,0 @@ - - - - - - -FSA5Serializer - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSA5Serializer

-
-java.lang.Object
-  extended by morfologik.fsa.FSA5Serializer
-
-
-
All Implemented Interfaces:
FSASerializer
-
-
-
-
public final class FSA5Serializer
extends java.lang.Object
implements FSASerializer
- - -

-Serializes in-memory FSA graphs to a binary format compatible with - Jan Daciuk's fsa's package FSA5 format. - -

- It is possible to serialize the automaton with numbers required for perfect - hashing. See withNumbers() method. -

-

- -

-

-
See Also:
FSA5, -FSA.read(java.io.InputStream)
-
- -

- - - - - - - - - - - - - - - -
-Field Summary
- byteannotationByte - -
-           
- bytefillerByte - -
-           
-  - - - - - - - - - - -
-Constructor Summary
FSA5Serializer() - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- java.util.Set<FSAFlags>getFlags() - -
-          Return supported flags.
- - - - - -
-<T extends java.io.OutputStream> -
-T
-
serialize(FSA fsa, - T os) - -
-          Serialize root state s to an output stream in - FSA5 format.
- FSA5SerializerwithAnnotationSeparator(byte annotationSeparator) - -
-          Supports built-in annotation separator.
- FSA5SerializerwithFiller(byte filler) - -
-          Supports built-in filler separator.
- FSASerializerwithLogger(IMessageLogger logger) - -
-          Log extra messages during construction.
- FSA5SerializerwithNumbers() - -
-          Serialize the automaton with the number of right-language sequences in - each node.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-fillerByte

-
-public byte fillerByte
-
-
-
See Also:
FSA5.filler
-
-
- -

-annotationByte

-
-public byte annotationByte
-
-
-
See Also:
FSA5.annotation
-
- - - - - - - - -
-Constructor Detail
- -

-FSA5Serializer

-
-public FSA5Serializer()
-
-
- - - - - - - - -
-Method Detail
- -

-withNumbers

-
-public FSA5Serializer withNumbers()
-
-
Serialize the automaton with the number of right-language sequences in - each node. This is required to implement perfect hashing. The numbering - also preserves the order of input sequences. -

-

-
Specified by:
withNumbers in interface FSASerializer
-
-
- -
Returns:
Returns the same object for easier call chaining.
-
-
-
- -

-withFiller

-
-public FSA5Serializer withFiller(byte filler)
-
-
Supports built-in filler separator. Only if FSASerializer.getFlags() returns - FSAFlags.SEPARATORS. -

-

-
Specified by:
withFiller in interface FSASerializer
-
-
-
-
-
-
- -

-withAnnotationSeparator

-
-public FSA5Serializer withAnnotationSeparator(byte annotationSeparator)
-
-
Supports built-in annotation separator. Only if FSASerializer.getFlags() returns - FSAFlags.SEPARATORS. -

-

-
Specified by:
withAnnotationSeparator in interface FSASerializer
-
-
-
-
-
-
- -

-withLogger

-
-public FSASerializer withLogger(IMessageLogger logger)
-
-
Log extra messages during construction. -

-

-
Specified by:
withLogger in interface FSASerializer
-
-
-
-
-
-
- -

-serialize

-
-public <T extends java.io.OutputStream> T serialize(FSA fsa,
-                                                    T os)
-                                         throws java.io.IOException
-
-
Serialize root state s to an output stream in - FSA5 format. -

-

-
Specified by:
serialize in interface FSASerializer
-
-
- -
Returns:
Returns os for chaining. -
Throws: -
java.io.IOException
See Also:
withNumbers
-
-
-
- -

-getFlags

-
-public java.util.Set<FSAFlags> getFlags()
-
-
Return supported flags. -

-

-
Specified by:
getFlags in interface FSASerializer
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSABuilder.InfoEntry.html b/doc/api/morfologik/fsa/FSABuilder.InfoEntry.html deleted file mode 100644 index 67fc1e3..0000000 --- a/doc/api/morfologik/fsa/FSABuilder.InfoEntry.html +++ /dev/null @@ -1,431 +0,0 @@ - - - - - - -FSABuilder.InfoEntry - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Enum FSABuilder.InfoEntry

-
-java.lang.Object
-  extended by java.lang.Enum<FSABuilder.InfoEntry>
-      extended by morfologik.fsa.FSABuilder.InfoEntry
-
-
-
All Implemented Interfaces:
java.io.Serializable, java.lang.Comparable<FSABuilder.InfoEntry>
-
-
-
Enclosing class:
FSABuilder
-
-
-
-
public static enum FSABuilder.InfoEntry
extends java.lang.Enum<FSABuilder.InfoEntry>
- - -

-Debug and information constants. -

- -

-

-
See Also:
FSABuilder.getInfo()
-
- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Enum Constant Summary
CONSTANT_ARC_AUTOMATON_SIZE - -
-           
ESTIMATED_MEMORY_CONSUMPTION_MB - -
-           
MAX_ACTIVE_PATH_LENGTH - -
-           
SERIALIZATION_BUFFER_REALLOCATIONS - -
-           
SERIALIZATION_BUFFER_SIZE - -
-           
STATE_REGISTRY_SIZE - -
-           
STATE_REGISTRY_TABLE_SLOTS - -
-           
-  - - - - - - - - - - - - - - - - - - - -
-Method Summary
- java.lang.StringtoString() - -
-           
-static FSABuilder.InfoEntryvalueOf(java.lang.String name) - -
-          Returns the enum constant of this type with the specified name.
-static FSABuilder.InfoEntry[]values() - -
-          Returns an array containing the constants of this enum type, in -the order they are declared.
- - - - - - - -
Methods inherited from class java.lang.Enum
clone, compareTo, equals, finalize, getDeclaringClass, hashCode, name, ordinal, valueOf
- - - - - - - -
Methods inherited from class java.lang.Object
getClass, notify, notifyAll, wait, wait, wait
-  -

- - - - - - - - -
-Enum Constant Detail
- -

-SERIALIZATION_BUFFER_SIZE

-
-public static final FSABuilder.InfoEntry SERIALIZATION_BUFFER_SIZE
-
-
-
-
-
- -

-SERIALIZATION_BUFFER_REALLOCATIONS

-
-public static final FSABuilder.InfoEntry SERIALIZATION_BUFFER_REALLOCATIONS
-
-
-
-
-
- -

-CONSTANT_ARC_AUTOMATON_SIZE

-
-public static final FSABuilder.InfoEntry CONSTANT_ARC_AUTOMATON_SIZE
-
-
-
-
-
- -

-MAX_ACTIVE_PATH_LENGTH

-
-public static final FSABuilder.InfoEntry MAX_ACTIVE_PATH_LENGTH
-
-
-
-
-
- -

-STATE_REGISTRY_TABLE_SLOTS

-
-public static final FSABuilder.InfoEntry STATE_REGISTRY_TABLE_SLOTS
-
-
-
-
-
- -

-STATE_REGISTRY_SIZE

-
-public static final FSABuilder.InfoEntry STATE_REGISTRY_SIZE
-
-
-
-
-
- -

-ESTIMATED_MEMORY_CONSUMPTION_MB

-
-public static final FSABuilder.InfoEntry ESTIMATED_MEMORY_CONSUMPTION_MB
-
-
-
-
- - - - - - - - -
-Method Detail
- -

-values

-
-public static FSABuilder.InfoEntry[] values()
-
-
Returns an array containing the constants of this enum type, in -the order they are declared. This method may be used to iterate -over the constants as follows: -
-for (FSABuilder.InfoEntry c : FSABuilder.InfoEntry.values())
-    System.out.println(c);
-
-

-

- -
Returns:
an array containing the constants of this enum type, in -the order they are declared
-
-
-
- -

-valueOf

-
-public static FSABuilder.InfoEntry valueOf(java.lang.String name)
-
-
Returns the enum constant of this type with the specified name. -The string must match exactly an identifier used to declare an -enum constant in this type. (Extraneous whitespace characters are -not permitted.) -

-

-
Parameters:
name - the name of the enum constant to be returned. -
Returns:
the enum constant with the specified name -
Throws: -
java.lang.IllegalArgumentException - if this enum type has no constant -with the specified name -
java.lang.NullPointerException - if the argument is null
-
-
-
- -

-toString

-
-public java.lang.String toString()
-
-
-
Overrides:
toString in class java.lang.Enum<FSABuilder.InfoEntry>
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSABuilder.html b/doc/api/morfologik/fsa/FSABuilder.html deleted file mode 100644 index b862e6c..0000000 --- a/doc/api/morfologik/fsa/FSABuilder.html +++ /dev/null @@ -1,450 +0,0 @@ - - - - - - -FSABuilder - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSABuilder

-
-java.lang.Object
-  extended by morfologik.fsa.FSABuilder
-
-
-
-
public final class FSABuilder
extends java.lang.Object
- - -

-Fast, memory-conservative finite state automaton builder, returning a - byte-serialized ConstantArcSizeFSA (a tradeoff between construction - speed and memory consumption). -

- -

-


- -

- - - - - - - - - - - -
-Nested Class Summary
-static classFSABuilder.InfoEntry - -
-          Debug and information constants.
- - - - - - - - - - -
-Field Summary
-static java.util.Comparator<byte[]>LEXICAL_ORDERING - -
-          Comparator comparing full byte arrays consistently with - compare(byte[], int, int, byte[], int, int).
-  - - - - - - - - - - - - - -
-Constructor Summary
FSABuilder() - -
-           
FSABuilder(int bufferGrowthSize) - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- voidadd(byte[] sequence, - int start, - int len) - -
-          Add a single sequence of bytes to the FSA.
-static FSAbuild(byte[][] input) - -
-          Build a minimal, deterministic automaton from a sorted list of byte sequences.
-static FSAbuild(java.lang.Iterable<byte[]> input) - -
-          Build a minimal, deterministic automaton from an iterable list of byte sequences.
-static intcompare(byte[] s1, - int start1, - int lens1, - byte[] s2, - int start2, - int lens2) - -
-          Lexicographic order of input sequences.
- FSAcomplete() - -
-          Complete the automaton.
- java.util.Map<FSABuilder.InfoEntry,java.lang.Object>getInfo() - -
-          Return various statistics concerning the FSA and its compilation.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-LEXICAL_ORDERING

-
-public static final java.util.Comparator<byte[]> LEXICAL_ORDERING
-
-
Comparator comparing full byte arrays consistently with - compare(byte[], int, int, byte[], int, int). -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-FSABuilder

-
-public FSABuilder()
-
-
-
- -

-FSABuilder

-
-public FSABuilder(int bufferGrowthSize)
-
-
- - - - - - - - -
-Method Detail
- -

-add

-
-public void add(byte[] sequence,
-                int start,
-                int len)
-
-
Add a single sequence of bytes to the FSA. The input must be lexicographically greater - than any previously added sequence. -

-

-
-
-
-
- -

-complete

-
-public FSA complete()
-
-
Complete the automaton. -

-

-
-
-
-
- -

-build

-
-public static FSA build(byte[][] input)
-
-
Build a minimal, deterministic automaton from a sorted list of byte sequences. -

-

-
-
-
-
- -

-build

-
-public static FSA build(java.lang.Iterable<byte[]> input)
-
-
Build a minimal, deterministic automaton from an iterable list of byte sequences. -

-

-
-
-
-
- -

-getInfo

-
-public java.util.Map<FSABuilder.InfoEntry,java.lang.Object> getInfo()
-
-
Return various statistics concerning the FSA and its compilation. -

-

-
-
-
-
- -

-compare

-
-public static int compare(byte[] s1,
-                          int start1,
-                          int lens1,
-                          byte[] s2,
-                          int start2,
-                          int lens2)
-
-
Lexicographic order of input sequences. By default, consistent with the "C" sort - (absolute value of bytes, 0-255). -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSAFinalStatesIterator.html b/doc/api/morfologik/fsa/FSAFinalStatesIterator.html deleted file mode 100644 index 796922a..0000000 --- a/doc/api/morfologik/fsa/FSAFinalStatesIterator.html +++ /dev/null @@ -1,339 +0,0 @@ - - - - - - -FSAFinalStatesIterator - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSAFinalStatesIterator

-
-java.lang.Object
-  extended by morfologik.fsa.FSAFinalStatesIterator
-
-
-
All Implemented Interfaces:
java.util.Iterator<java.nio.ByteBuffer>
-
-
-
-
public final class FSAFinalStatesIterator
extends java.lang.Object
implements java.util.Iterator<java.nio.ByteBuffer>
- - -

-An iterator that traverses the right language of a given node (all sequences - reachable from a given node). -

- -

-


- -

- - - - - - - - - - - -
-Constructor Summary
FSAFinalStatesIterator(FSA fsa, - int node) - -
-          Create an instance of the iterator for a given node.
-  - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- booleanhasNext() - -
-          Returns true if there are still elements in this iterator.
- java.nio.ByteBuffernext() - -
-           
- voidremove() - -
-          Not implemented in this iterator.
- voidrestartFrom(int node) - -
-          Restart walking from node.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-FSAFinalStatesIterator

-
-public FSAFinalStatesIterator(FSA fsa,
-                              int node)
-
-
Create an instance of the iterator for a given node. -

-

- - - - - - - - -
-Method Detail
- -

-restartFrom

-
-public void restartFrom(int node)
-
-
Restart walking from node. Allows iterator reuse. -

-

-
-
-
-
-
-
-
- -

-hasNext

-
-public boolean hasNext()
-
-
Returns true if there are still elements in this iterator. -

-

-
Specified by:
hasNext in interface java.util.Iterator<java.nio.ByteBuffer>
-
-
-
-
-
-
- -

-next

-
-public java.nio.ByteBuffer next()
-
-
-
Specified by:
next in interface java.util.Iterator<java.nio.ByteBuffer>
-
-
- -
Returns:
Returns a ByteBuffer with the sequence corresponding to - the next final state in the automaton.
-
-
-
- -

-remove

-
-public void remove()
-
-
Not implemented in this iterator. -

-

-
Specified by:
remove in interface java.util.Iterator<java.nio.ByteBuffer>
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSAFlags.html b/doc/api/morfologik/fsa/FSAFlags.html deleted file mode 100644 index 4e24416..0000000 --- a/doc/api/morfologik/fsa/FSAFlags.html +++ /dev/null @@ -1,484 +0,0 @@ - - - - - - -FSAFlags - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Enum FSAFlags

-
-java.lang.Object
-  extended by java.lang.Enum<FSAFlags>
-      extended by morfologik.fsa.FSAFlags
-
-
-
All Implemented Interfaces:
java.io.Serializable, java.lang.Comparable<FSAFlags>
-
-
-
-
public enum FSAFlags
extends java.lang.Enum<FSAFlags>
- - -

-FSA automaton flags. Where applicable, flags follow Daciuk's fsa package. -

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - -
-Enum Constant Summary
FLEXIBLE - -
-          Daciuk: flexible FSA encoding.
NEXTBIT - -
-          Daciuk: next bit in use.
NUMBERS - -
-          The FSA contains right-language count numbers on states.
SEPARATORS - -
-          The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package - compatibility).
STOPBIT - -
-          Daciuk: stop bit in use.
TAILS - -
-          Daciuk: tails compression.
- - - - - - - - - - -
-Field Summary
- intbits - -
-          Bit mask for the corresponding flag.
-  - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static shortasShort(java.util.Set<FSAFlags> flags) - -
-          Returns the set of flags encoded in a single short.
-static booleanisSet(int flags, - FSAFlags flag) - -
-          Returns true if the corresponding flag is set in the bit set.
-static FSAFlagsvalueOf(java.lang.String name) - -
-          Returns the enum constant of this type with the specified name.
-static FSAFlags[]values() - -
-          Returns an array containing the constants of this enum type, in -the order they are declared.
- - - - - - - -
Methods inherited from class java.lang.Enum
clone, compareTo, equals, finalize, getDeclaringClass, hashCode, name, ordinal, toString, valueOf
- - - - - - - -
Methods inherited from class java.lang.Object
getClass, notify, notifyAll, wait, wait, wait
-  -

- - - - - - - - -
-Enum Constant Detail
- -

-FLEXIBLE

-
-public static final FSAFlags FLEXIBLE
-
-
Daciuk: flexible FSA encoding. -

-

-
-
-
- -

-STOPBIT

-
-public static final FSAFlags STOPBIT
-
-
Daciuk: stop bit in use. -

-

-
-
-
- -

-NEXTBIT

-
-public static final FSAFlags NEXTBIT
-
-
Daciuk: next bit in use. -

-

-
-
-
- -

-TAILS

-
-public static final FSAFlags TAILS
-
-
Daciuk: tails compression. -

-

-
-
-
- -

-NUMBERS

-
-public static final FSAFlags NUMBERS
-
-
The FSA contains right-language count numbers on states. -

-

-
See Also:
FSA.getRightLanguageCount(int)
-
-
- -

-SEPARATORS

-
-public static final FSAFlags SEPARATORS
-
-
The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package - compatibility). -

-

-
-
- - - - - - - - -
-Field Detail
- -

-bits

-
-public final int bits
-
-
Bit mask for the corresponding flag. -

-

-
-
- - - - - - - - -
-Method Detail
- -

-values

-
-public static FSAFlags[] values()
-
-
Returns an array containing the constants of this enum type, in -the order they are declared. This method may be used to iterate -over the constants as follows: -
-for (FSAFlags c : FSAFlags.values())
-    System.out.println(c);
-
-

-

- -
Returns:
an array containing the constants of this enum type, in -the order they are declared
-
-
-
- -

-valueOf

-
-public static FSAFlags valueOf(java.lang.String name)
-
-
Returns the enum constant of this type with the specified name. -The string must match exactly an identifier used to declare an -enum constant in this type. (Extraneous whitespace characters are -not permitted.) -

-

-
Parameters:
name - the name of the enum constant to be returned. -
Returns:
the enum constant with the specified name -
Throws: -
java.lang.IllegalArgumentException - if this enum type has no constant -with the specified name -
java.lang.NullPointerException - if the argument is null
-
-
-
- -

-isSet

-
-public static boolean isSet(int flags,
-                            FSAFlags flag)
-
-
Returns true if the corresponding flag is set in the bit set. -

-

-
-
-
-
- -

-asShort

-
-public static short asShort(java.util.Set<FSAFlags> flags)
-
-
Returns the set of flags encoded in a single short. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSAInfo.html b/doc/api/morfologik/fsa/FSAInfo.html deleted file mode 100644 index 9f08c90..0000000 --- a/doc/api/morfologik/fsa/FSAInfo.html +++ /dev/null @@ -1,399 +0,0 @@ - - - - - - -FSAInfo - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSAInfo

-
-java.lang.Object
-  extended by morfologik.fsa.FSAInfo
-
-
-
-
public final class FSAInfo
extends java.lang.Object
- - -

-Compute additional information about an FSA: number of arcs, nodes, etc. -

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
- intarcsCount - -
-          Number of arcs in the automaton, excluding an arcs from the zero node - (initial) and an arc from the start node to the root node.
- intarcsCountTotal - -
-          Total number of arcs, counting arcs that physically overlap due to - merging.
- intfinalStatesCount - -
-          Number of final states (number of input sequences stored in the automaton).
- intnodeCount - -
-          Number of nodes in the automaton.
- intsize - -
-          Arcs size (in serialized form).
-  - - - - - - - - - - - - - -
-Constructor Summary
FSAInfo(FSA fsa) - -
-           
FSAInfo(int nodeCount, - int arcsCount, - int arcsCountTotal, - int finalStatesCount) - -
-           
-  - - - - - - - - - - - -
-Method Summary
- java.lang.StringtoString() - -
-           
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-nodeCount

-
-public final int nodeCount
-
-
Number of nodes in the automaton. -

-

-
-
-
- -

-arcsCount

-
-public final int arcsCount
-
-
Number of arcs in the automaton, excluding an arcs from the zero node - (initial) and an arc from the start node to the root node. -

-

-
-
-
- -

-arcsCountTotal

-
-public final int arcsCountTotal
-
-
Total number of arcs, counting arcs that physically overlap due to - merging. -

-

-
-
-
- -

-finalStatesCount

-
-public final int finalStatesCount
-
-
Number of final states (number of input sequences stored in the automaton). -

-

-
-
-
- -

-size

-
-public final int size
-
-
Arcs size (in serialized form). -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-FSAInfo

-
-public FSAInfo(FSA fsa)
-
-
-
- -

-FSAInfo

-
-public FSAInfo(int nodeCount,
-               int arcsCount,
-               int arcsCountTotal,
-               int finalStatesCount)
-
-
- - - - - - - - -
-Method Detail
- -

-toString

-
-public java.lang.String toString()
-
-
-
Overrides:
toString in class java.lang.Object
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSASerializer.html b/doc/api/morfologik/fsa/FSASerializer.html deleted file mode 100644 index 5e57fae..0000000 --- a/doc/api/morfologik/fsa/FSASerializer.html +++ /dev/null @@ -1,335 +0,0 @@ - - - - - - -FSASerializer - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Interface FSASerializer

-
-
All Known Implementing Classes:
CFSA2Serializer, FSA5Serializer
-
-
-
-
public interface FSASerializer
- - -

-All FSA serializers to binary formats will implement this interface. -

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- java.util.Set<FSAFlags>getFlags() - -
-          Returns the set of flags supported by the serializer (and the output automaton).
- - - - - -
-<T extends java.io.OutputStream> -
-T
-
serialize(FSA fsa, - T os) - -
-          Serialize a finite state automaton to an output stream.
- FSASerializerwithAnnotationSeparator(byte annotationSeparator) - -
-          Supports built-in annotation separator.
- FSASerializerwithFiller(byte filler) - -
-          Supports built-in filler separator.
- FSASerializerwithLogger(IMessageLogger logger) - -
-          Log extra messages during construction.
- FSASerializerwithNumbers() - -
-          Supports built-in right language count on nodes, speeding up perfect hash counts.
-  -

- - - - - - - - -
-Method Detail
- -

-serialize

-
-<T extends java.io.OutputStream> T serialize(FSA fsa,
-                                             T os)
-                                         throws java.io.IOException
-
-
Serialize a finite state automaton to an output stream. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-getFlags

-
-java.util.Set<FSAFlags> getFlags()
-
-
Returns the set of flags supported by the serializer (and the output automaton). -

-

-
-
-
-
- -

-withLogger

-
-FSASerializer withLogger(IMessageLogger logger)
-
-
Log extra messages during construction. -

-

-
-
-
-
- -

-withFiller

-
-FSASerializer withFiller(byte filler)
-
-
Supports built-in filler separator. Only if getFlags() returns - FSAFlags.SEPARATORS. -

-

-
-
-
-
- -

-withAnnotationSeparator

-
-FSASerializer withAnnotationSeparator(byte annotationSeparator)
-
-
Supports built-in annotation separator. Only if getFlags() returns - FSAFlags.SEPARATORS. -

-

-
-
-
-
- -

-withNumbers

-
-FSASerializer withNumbers()
-
-
Supports built-in right language count on nodes, speeding up perfect hash counts. - Only if getFlags() returns FSAFlags.NUMBERS. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSATraversal.html b/doc/api/morfologik/fsa/FSATraversal.html deleted file mode 100644 index 65a778d..0000000 --- a/doc/api/morfologik/fsa/FSATraversal.html +++ /dev/null @@ -1,394 +0,0 @@ - - - - - - -FSATraversal - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSATraversal

-
-java.lang.Object
-  extended by morfologik.fsa.FSATraversal
-
-
-
-
public final class FSATraversal
extends java.lang.Object
- - -

-This class implements some common matching and scanning operations on a - generic FSA. -

- -

-


- -

- - - - - - - - - - - -
-Constructor Summary
FSATraversal(FSA fsa) - -
-          Traversals of the given FSA.
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- MatchResultmatch(byte[] sequence) - -
-           
- MatchResultmatch(byte[] sequence, - int node) - -
-           
- MatchResultmatch(byte[] sequence, - int start, - int length, - int node) - -
-          Finds a matching path in the dictionary for a given sequence of labels - from sequence and starting at node node.
- MatchResultmatch(MatchResult result, - byte[] sequence, - int start, - int length, - int node) - -
-          Same as match(byte[], int, int, int), but allows passing - a reusable MatchResult object so that no intermediate garbage is - produced.
- intperfectHash(byte[] sequence) - -
-           
- intperfectHash(byte[] sequence, - int start, - int length, - int node) - -
-          Calculate perfect hash for a given input sequence of bytes.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-FSATraversal

-
-public FSATraversal(FSA fsa)
-
-
Traversals of the given FSA. -

-

- - - - - - - - -
-Method Detail
- -

-perfectHash

-
-public int perfectHash(byte[] sequence,
-                       int start,
-                       int length,
-                       int node)
-
-
Calculate perfect hash for a given input sequence of bytes. The perfect hash requires - that FSA is built with FSAFlags.NUMBERS and corresponds to the sequential - order of input sequences used at automaton construction time. -

-

-
Parameters:
start - Start index in the sequence array.
length - Length of the byte sequence, must be at least 1. -
Returns:
Returns a unique integer assigned to the input sequence in the automaton (reflecting - the number of that sequence in the input used to build the automaton). Returns a negative - integer if the input sequence was not part of the input from which the automaton was created. - The type of mismatch is a constant defined in MatchResult.
-
-
-
- -

-perfectHash

-
-public int perfectHash(byte[] sequence)
-
-
-
See Also:
perfectHash(byte[], int, int, int)
-
-
-
- -

-match

-
-public MatchResult match(MatchResult result,
-                         byte[] sequence,
-                         int start,
-                         int length,
-                         int node)
-
-
Same as match(byte[], int, int, int), but allows passing - a reusable MatchResult object so that no intermediate garbage is - produced. -

-

- -
Returns:
The same object as result, but with reset internal - type and other fields.
-
-
-
- -

-match

-
-public MatchResult match(byte[] sequence,
-                         int start,
-                         int length,
-                         int node)
-
-
Finds a matching path in the dictionary for a given sequence of labels - from sequence and starting at node node. -

-

-
Parameters:
sequence - An array of labels to follow in the FSA.
start - Starting index in sequence.
length - How many symbols to consider from sequence?
node - Start node identifier in the FSA.
See Also:
match(byte [], int)
-
-
-
- -

-match

-
-public MatchResult match(byte[] sequence,
-                         int node)
-
-
-
See Also:
match(byte[], int, int, int)
-
-
-
- -

-match

-
-public MatchResult match(byte[] sequence)
-
-
-
See Also:
match(byte[], int, int, int)
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSAUtils.IntIntHolder.html b/doc/api/morfologik/fsa/FSAUtils.IntIntHolder.html deleted file mode 100644 index 871d878..0000000 --- a/doc/api/morfologik/fsa/FSAUtils.IntIntHolder.html +++ /dev/null @@ -1,295 +0,0 @@ - - - - - - -FSAUtils.IntIntHolder - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSAUtils.IntIntHolder

-
-java.lang.Object
-  extended by morfologik.fsa.FSAUtils.IntIntHolder
-
-
-
Enclosing class:
FSAUtils
-
-
-
-
public static final class FSAUtils.IntIntHolder
extends java.lang.Object
- - -

-


- -

- - - - - - - - - - - - - - - -
-Field Summary
- inta - -
-           
- intb - -
-           
-  - - - - - - - - - - - - - -
-Constructor Summary
FSAUtils.IntIntHolder() - -
-           
FSAUtils.IntIntHolder(int a, - int b) - -
-           
-  - - - - - - - -
-Method Summary
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-a

-
-public int a
-
-
-
-
-
- -

-b

-
-public int b
-
-
-
-
- - - - - - - - -
-Constructor Detail
- -

-FSAUtils.IntIntHolder

-
-public FSAUtils.IntIntHolder(int a,
-                             int b)
-
-
-
- -

-FSAUtils.IntIntHolder

-
-public FSAUtils.IntIntHolder()
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/FSAUtils.html b/doc/api/morfologik/fsa/FSAUtils.html deleted file mode 100644 index 53b4725..0000000 --- a/doc/api/morfologik/fsa/FSAUtils.html +++ /dev/null @@ -1,379 +0,0 @@ - - - - - - -FSAUtils - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class FSAUtils

-
-java.lang.Object
-  extended by morfologik.fsa.FSAUtils
-
-
-
-
public final class FSAUtils
extends java.lang.Object
- - -

-Other FSA-related utilities not directly associated with the class hierarchy. -

- -

-


- -

- - - - - - - - - - - -
-Nested Class Summary
-static classFSAUtils.IntIntHolder - -
-           
-  - - - - - - - - - - -
-Constructor Summary
FSAUtils() - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static java.util.TreeMap<java.lang.Integer,java.lang.Integer>calculateFanOuts(FSA fsa, - int root) - -
-          Calculate fan-out ratio.
-static java.util.ArrayList<byte[]>rightLanguage(FSA fsa, - int state) - -
-          All byte sequences generated as the right language of state.
-static com.carrotsearch.hppc.IntIntOpenHashMaprightLanguageForAllStates(FSA fsa) - -
-          Calculate the size of right language for each state in an FSA.
-static java.lang.StringtoDot(FSA fsa, - int node) - -
-          Returns the right-language reachable from a given FSA node, formatted - as an input for the graphviz package (expressed in the dot - language).
-static voidtoDot(java.io.Writer w, - FSA fsa, - int node) - -
-          Saves the right-language reachable from a given FSA node, formatted - as an input for the graphviz package (expressed in the dot - language), to the given writer.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-FSAUtils

-
-public FSAUtils()
-
-
- - - - - - - - -
-Method Detail
- -

-toDot

-
-public static java.lang.String toDot(FSA fsa,
-                                     int node)
-
-
Returns the right-language reachable from a given FSA node, formatted - as an input for the graphviz package (expressed in the dot - language). -

-

-
-
-
-
- -

-toDot

-
-public static void toDot(java.io.Writer w,
-                         FSA fsa,
-                         int node)
-                  throws java.io.IOException
-
-
Saves the right-language reachable from a given FSA node, formatted - as an input for the graphviz package (expressed in the dot - language), to the given writer. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-rightLanguage

-
-public static java.util.ArrayList<byte[]> rightLanguage(FSA fsa,
-                                                        int state)
-
-
All byte sequences generated as the right language of state. -

-

-
-
-
-
- -

-calculateFanOuts

-
-public static java.util.TreeMap<java.lang.Integer,java.lang.Integer> calculateFanOuts(FSA fsa,
-                                                                                      int root)
-
-
Calculate fan-out ratio. -

-

- -
Returns:
The returned array: result[outgoing-arcs]
-
-
-
- -

-rightLanguageForAllStates

-
-public static com.carrotsearch.hppc.IntIntOpenHashMap rightLanguageForAllStates(FSA fsa)
-
-
Calculate the size of right language for each state in an FSA. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/MatchResult.html b/doc/api/morfologik/fsa/MatchResult.html deleted file mode 100644 index 889c378..0000000 --- a/doc/api/morfologik/fsa/MatchResult.html +++ /dev/null @@ -1,397 +0,0 @@ - - - - - - -MatchResult - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Class MatchResult

-
-java.lang.Object
-  extended by morfologik.fsa.MatchResult
-
-
-
-
public final class MatchResult
extends java.lang.Object
- - -

-A matching result returned from FSATraversal. -

- -

-

-
See Also:
FSATraversal
-
- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
-static intAUTOMATON_HAS_PREFIX - -
-          The automaton contains a prefix of the input sequence.
-static intEXACT_MATCH - -
-          The automaton has exactly one match for the input sequence.
- intindex - -
-          Input sequence's index, interpretation depends on kind.
- intkind - -
-          One of the match kind constants defined in this class.
-static intNO_MATCH - -
-          The automaton has no match for the input sequence.
- intnode - -
-          Automaton node, interpretation depends on the kind.
-static intSEQUENCE_IS_A_PREFIX - -
-          The sequence is a prefix of at least one sequence in the automaton.
-  - - - - - - - - - - -
-Constructor Summary
MatchResult() - -
-           
-  - - - - - - - -
-Method Summary
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-EXACT_MATCH

-
-public static final int EXACT_MATCH
-
-
The automaton has exactly one match for the input sequence. -

-

-
See Also:
Constant Field Values
-
-
- -

-NO_MATCH

-
-public static final int NO_MATCH
-
-
The automaton has no match for the input sequence. -

-

-
See Also:
Constant Field Values
-
-
- -

-AUTOMATON_HAS_PREFIX

-
-public static final int AUTOMATON_HAS_PREFIX
-
-
The automaton contains a prefix of the input sequence. That is: - one of the input sequences used to build the automaton is a - prefix of the input sequence that is shorter than the sequence. - -

index will contain an index of the - first character of the input sequence not present in the - dictionary.

-

-

-
See Also:
Constant Field Values
-
-
- -

-SEQUENCE_IS_A_PREFIX

-
-public static final int SEQUENCE_IS_A_PREFIX
-
-
The sequence is a prefix of at least one sequence in the automaton. - node returns the node from which all sequences - with the given prefix start in the automaton. -

-

-
See Also:
Constant Field Values
-
-
- -

-kind

-
-public int kind
-
-
One of the match kind constants defined in this class. -

-

-
See Also:
NO_MATCH, -EXACT_MATCH, -AUTOMATON_HAS_PREFIX, -SEQUENCE_IS_A_PREFIX
-
-
- -

-index

-
-public int index
-
-
Input sequence's index, interpretation depends on kind. -

-

-
-
-
- -

-node

-
-public int node
-
-
Automaton node, interpretation depends on the kind. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-MatchResult

-
-public MatchResult()
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/StateVisitor.html b/doc/api/morfologik/fsa/StateVisitor.html deleted file mode 100644 index a8bbf0b..0000000 --- a/doc/api/morfologik/fsa/StateVisitor.html +++ /dev/null @@ -1,211 +0,0 @@ - - - - - - -StateVisitor - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.fsa -
-Interface StateVisitor

-
-
-
public interface StateVisitor
- - -

-State visitor. -

- -

-

-
See Also:
FSA.visitInPostOrder(StateVisitor), -FSA.visitInPreOrder(StateVisitor)
-
- -

- - - - - - - - - - - - -
-Method Summary
- booleanaccept(int state) - -
-           
-  -

- - - - - - - - -
-Method Detail
- -

-accept

-
-boolean accept(int state)
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/package-frame.html b/doc/api/morfologik/fsa/package-frame.html deleted file mode 100644 index e4fec52..0000000 --- a/doc/api/morfologik/fsa/package-frame.html +++ /dev/null @@ -1,84 +0,0 @@ - - - - - - -morfologik.fsa - - - - - - - - - - - -morfologik.fsa - - - - -
-Interfaces  - -
-FSASerializer -
-StateVisitor
- - - - - - -
-Classes  - -
-CFSA -
-CFSA2 -
-CFSA2Serializer -
-ConstantArcSizeFSA -
-FSA -
-FSA5 -
-FSA5Serializer -
-FSABuilder -
-FSAFinalStatesIterator -
-FSAInfo -
-FSATraversal -
-FSAUtils -
-FSAUtils.IntIntHolder -
-MatchResult
- - - - - - -
-Enums  - -
-FSABuilder.InfoEntry -
-FSAFlags
- - - - diff --git a/doc/api/morfologik/fsa/package-summary.html b/doc/api/morfologik/fsa/package-summary.html deleted file mode 100644 index aba058d..0000000 --- a/doc/api/morfologik/fsa/package-summary.html +++ /dev/null @@ -1,251 +0,0 @@ - - - - - - -morfologik.fsa - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-

-Package morfologik.fsa -

- - - - - - - - - - - - - -
-Interface Summary
FSASerializerAll FSA serializers to binary formats will implement this interface.
StateVisitorState visitor.
-  - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Class Summary
CFSACFSA (Compact Finite State Automaton) binary format implementation.
CFSA2CFSA (Compact Finite State Automaton) binary format implementation, version 2: - - CFSA2.BIT_TARGET_NEXT applicable on all arcs, not necessarily the last one.
CFSA2SerializerSerializes in-memory FSA graphs to CFSA2.
ConstantArcSizeFSAAn FSA with constant-size arc representation produced directly - by FSABuilder.
FSAThis is a top abstract class for handling finite state automata.
FSA5FSA binary format implementation for version 5.
FSA5SerializerSerializes in-memory FSA graphs to a binary format compatible with - Jan Daciuk's fsa's package FSA5 format.
FSABuilderFast, memory-conservative finite state automaton builder, returning a - byte-serialized ConstantArcSizeFSA (a tradeoff between construction - speed and memory consumption).
FSAFinalStatesIteratorAn iterator that traverses the right language of a given node (all sequences - reachable from a given node).
FSAInfoCompute additional information about an FSA: number of arcs, nodes, etc.
FSATraversalThis class implements some common matching and scanning operations on a - generic FSA.
FSAUtilsOther FSA-related utilities not directly associated with the class hierarchy.
FSAUtils.IntIntHolder 
MatchResultA matching result returned from FSATraversal.
-  - -

- - - - - - - - - - - - - -
-Enum Summary
FSABuilder.InfoEntryDebug and information constants.
FSAFlagsFSA automaton flags.
-  - -

-

-
-
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/fsa/package-tree.html b/doc/api/morfologik/fsa/package-tree.html deleted file mode 100644 index be565e3..0000000 --- a/doc/api/morfologik/fsa/package-tree.html +++ /dev/null @@ -1,172 +0,0 @@ - - - - - - -morfologik.fsa Class Hierarchy - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-Hierarchy For Package morfologik.fsa -

-
-
-
Package Hierarchies:
All Packages
-
-

-Class Hierarchy -

- -

-Interface Hierarchy -

- -

-Enum Hierarchy -

- -
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/Dictionary.html b/doc/api/morfologik/stemming/Dictionary.html deleted file mode 100644 index 9ed04ce..0000000 --- a/doc/api/morfologik/stemming/Dictionary.html +++ /dev/null @@ -1,489 +0,0 @@ - - - - - - -Dictionary - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.stemming -
-Class Dictionary

-
-java.lang.Object
-  extended by morfologik.stemming.Dictionary
-
-
-
-
public final class Dictionary
extends java.lang.Object
- - -

-A dictionary combines FSA automaton and metadata describing the - internals of dictionary entries' coding (DictionaryMetadata. - -

- A dictionary consists of two files: -

- Use static methods in this class to read dictionaries and their metadata. -

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
-static java.util.WeakHashMap<java.lang.String,Dictionary>defaultDictionaries - -
-          Default loaded dictionaries.
- FSAfsa - -
-          FSA automaton with the compiled dictionary data.
- DictionaryMetadatametadata - -
-          Metadata associated with the dictionary.
-static java.lang.StringMETADATA_FILE_EXTENSION - -
-          Expected metadata file extension.
-  - - - - - - - - - - -
-Constructor Summary
Dictionary(FSA fsa, - DictionaryMetadata metadata) - -
-          It is strongly recommended to use static methods in this class for - reading dictionaries.
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static java.lang.StringgetExpectedFeaturesName(java.lang.String name) - -
-          Returns the expected name of the metadata file, based on the name of the - FSA dictionary file.
-static DictionarygetForLanguage(java.lang.String languageCode) - -
-          Return a built-in dictionary for a given ISO language code.
-static Dictionaryread(java.io.File fsaFile) - -
-          Attempts to load a dictionary using the path to the FSA file and the - expected metadata extension.
-static Dictionaryread(java.net.URL fsaURL) - -
-           - Attempts to load a dictionary using the URL to the FSA file and the - expected metadata extension.
-static DictionaryreadAndClose(java.io.InputStream fsaData, - java.io.InputStream featuresData) - -
-          Attempts to load a dictionary from opened streams of FSA dictionary data - and associated metadata.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-METADATA_FILE_EXTENSION

-
-public static final java.lang.String METADATA_FILE_EXTENSION
-
-
Expected metadata file extension. -

-

-
See Also:
Constant Field Values
-
-
- -

-fsa

-
-public final FSA fsa
-
-
FSA automaton with the compiled dictionary data. -

-

-
-
-
- -

-metadata

-
-public final DictionaryMetadata metadata
-
-
Metadata associated with the dictionary. -

-

-
-
-
- -

-defaultDictionaries

-
-public static final java.util.WeakHashMap<java.lang.String,Dictionary> defaultDictionaries
-
-
Default loaded dictionaries. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-Dictionary

-
-public Dictionary(FSA fsa,
-                  DictionaryMetadata metadata)
-
-
It is strongly recommended to use static methods in this class for - reading dictionaries. -

-

-
Parameters:
fsa - An instantiated FSA instance.
metadata - A map of attributes describing the compression format and - other settings not contained in the FSA automaton. For an - explanation of available attributes and their possible values, - see DictionaryMetadata.
-
- - - - - - - - -
-Method Detail
- -

-read

-
-public static Dictionary read(java.io.File fsaFile)
-                       throws java.io.IOException
-
-
Attempts to load a dictionary using the path to the FSA file and the - expected metadata extension. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-read

-
-public static Dictionary read(java.net.URL fsaURL)
-                       throws java.io.IOException
-
-

- Attempts to load a dictionary using the URL to the FSA file and the - expected metadata extension. - -

- This method can be used to load resource-based dictionaries, but be aware - of JAR resource-locking issues that arise from resource URLs. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-readAndClose

-
-public static Dictionary readAndClose(java.io.InputStream fsaData,
-                                      java.io.InputStream featuresData)
-                               throws java.io.IOException
-
-
Attempts to load a dictionary from opened streams of FSA dictionary data - and associated metadata. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-getExpectedFeaturesName

-
-public static java.lang.String getExpectedFeaturesName(java.lang.String name)
-
-
Returns the expected name of the metadata file, based on the name of the - FSA dictionary file. The expected name is resolved by truncating any - suffix of name and appending - METADATA_FILE_EXTENSION. -

-

-
-
-
-
- -

-getForLanguage

-
-public static Dictionary getForLanguage(java.lang.String languageCode)
-
-
Return a built-in dictionary for a given ISO language code. Dictionaries - are cached internally for potential reuse. -

-

- -
Throws: -
java.lang.RuntimeException - Throws a RuntimeException if the dictionary is not - bundled with the library.
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/DictionaryIterator.html b/doc/api/morfologik/stemming/DictionaryIterator.html deleted file mode 100644 index f857393..0000000 --- a/doc/api/morfologik/stemming/DictionaryIterator.html +++ /dev/null @@ -1,310 +0,0 @@ - - - - - - -DictionaryIterator - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.stemming -
-Class DictionaryIterator

-
-java.lang.Object
-  extended by morfologik.stemming.DictionaryIterator
-
-
-
All Implemented Interfaces:
java.util.Iterator<WordData>
-
-
-
-
public final class DictionaryIterator
extends java.lang.Object
implements java.util.Iterator<WordData>
- - -

-An iterator over WordData entries of a Dictionary. The stems - can be decoded from compressed format or the compressed form can be - preserved. -

- -

-


- -

- - - - - - - - - - - -
-Constructor Summary
DictionaryIterator(Dictionary dictionary, - java.nio.charset.CharsetDecoder decoder, - boolean decodeStems) - -
-           
-  - - - - - - - - - - - - - - - - - - - -
-Method Summary
- booleanhasNext() - -
-           
- WordDatanext() - -
-           
- voidremove() - -
-           
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-DictionaryIterator

-
-public DictionaryIterator(Dictionary dictionary,
-                          java.nio.charset.CharsetDecoder decoder,
-                          boolean decodeStems)
-
-
- - - - - - - - -
-Method Detail
- -

-hasNext

-
-public boolean hasNext()
-
-
-
Specified by:
hasNext in interface java.util.Iterator<WordData>
-
-
-
-
-
-
- -

-next

-
-public WordData next()
-
-
-
Specified by:
next in interface java.util.Iterator<WordData>
-
-
-
-
-
-
- -

-remove

-
-public void remove()
-
-
-
Specified by:
remove in interface java.util.Iterator<WordData>
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/DictionaryLookup.html b/doc/api/morfologik/stemming/DictionaryLookup.html deleted file mode 100644 index 423b6ac..0000000 --- a/doc/api/morfologik/stemming/DictionaryLookup.html +++ /dev/null @@ -1,376 +0,0 @@ - - - - - - -DictionaryLookup - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.stemming -
-Class DictionaryLookup

-
-java.lang.Object
-  extended by morfologik.stemming.DictionaryLookup
-
-
-
All Implemented Interfaces:
java.lang.Iterable<WordData>, IStemmer
-
-
-
-
public final class DictionaryLookup
extends java.lang.Object
implements IStemmer, java.lang.Iterable<WordData>
- - -

-This class implements a dictionary lookup over an FSA dictionary. The - dictionary for this class should be prepared from a text file using Jan - Daciuk's FSA package (see link below). - -

- Important: finite state automatons in Jan Daciuk's implementation use - bytes not unicode characters. Therefore objects of this class always - have to be constructed with an encoding used to convert Java strings to byte - arrays and the other way around. You can use UTF-8 encoding, as it - should not conflict with any control sequences and separator characters. -

- -

-

-
See Also:
FSA package Web - site
-
- -

- - - - - - - - - - - -
-Constructor Summary
DictionaryLookup(Dictionary dictionary) - -
-           - Creates a new object of this class using the given FSA for word lookups - and encoding for converting characters to bytes.
-  - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static java.nio.ByteBufferdecodeStem(java.nio.ByteBuffer bb, - byte[] bytes, - int len, - java.nio.ByteBuffer inflectedBuffer, - DictionaryMetadata metadata) - -
-          Decode the base form of an inflected word and save its decoded form into - a byte buffer.
- DictionarygetDictionary() - -
-           
- java.util.Iterator<WordData>iterator() - -
-          Return an iterator over all WordData entries available in the - embedded Dictionary.
- java.util.List<WordData>lookup(java.lang.CharSequence word) - -
-          Searches the automaton for a symbol sequence equal to word, - followed by a separator.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-DictionaryLookup

-
-public DictionaryLookup(Dictionary dictionary)
-                 throws java.lang.IllegalArgumentException
-
-

- Creates a new object of this class using the given FSA for word lookups - and encoding for converting characters to bytes. -

-

- -
Throws: -
java.lang.IllegalArgumentException - if FSA's root node cannot be acquired (dictionary is empty).
-
- - - - - - - - -
-Method Detail
- -

-lookup

-
-public java.util.List<WordData> lookup(java.lang.CharSequence word)
-
-
Searches the automaton for a symbol sequence equal to word, - followed by a separator. The result is a stem (decompressed accordingly - to the dictionary's specification) and an optional tag data. -

-

-
Specified by:
lookup in interface IStemmer
-
-
-
-
-
-
- -

-decodeStem

-
-public static java.nio.ByteBuffer decodeStem(java.nio.ByteBuffer bb,
-                                             byte[] bytes,
-                                             int len,
-                                             java.nio.ByteBuffer inflectedBuffer,
-                                             DictionaryMetadata metadata)
-
-
Decode the base form of an inflected word and save its decoded form into - a byte buffer. -

-

-
-
-
-
Parameters:
bb - The byte buffer to save the result to. A new buffer may be - allocated if the capacity of bb is not large - enough to store the result. The buffer is not flipped upon - return.
inflectedBuffer - Inflected form's bytes (decoded properly).
bytes - Bytes of the encoded base form, starting at 0 index.
len - Length of the encode base form. -
Returns:
Returns either bb or a new buffer whose capacity is - large enough to store the output of the decoded data.
-
-
-
- -

-iterator

-
-public java.util.Iterator<WordData> iterator()
-
-
Return an iterator over all WordData entries available in the - embedded Dictionary. -

-

-
Specified by:
iterator in interface java.lang.Iterable<WordData>
-
-
-
-
-
-
- -

-getDictionary

-
-public Dictionary getDictionary()
-
-
-
-
-
- -
Returns:
Return the Dictionary used by this object.
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/DictionaryMetadata.html b/doc/api/morfologik/stemming/DictionaryMetadata.html deleted file mode 100644 index 63b289f..0000000 --- a/doc/api/morfologik/stemming/DictionaryMetadata.html +++ /dev/null @@ -1,437 +0,0 @@ - - - - - - -DictionaryMetadata - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.stemming -
-Class DictionaryMetadata

-
-java.lang.Object
-  extended by morfologik.stemming.DictionaryMetadata
-
-
-
-
public final class DictionaryMetadata
extends java.lang.Object
- - -

-Description of attributes, their types and default values. -

- -

-

-
See Also:
Dictionary
-
- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Field Summary
-static java.lang.StringATTR_NAME_ENCODING - -
-          Attribute name for encoding.
-static java.lang.StringATTR_NAME_SEPARATOR - -
-          Attribute name for separator.
-static java.lang.StringATTR_NAME_USES_INFIXES - -
-          Attribute name for usesInfixes.
-static java.lang.StringATTR_NAME_USES_PREFIXES - -
-          Attribute name for usesPrefixes.
- java.lang.Stringencoding - -
-          Encoding used for converting bytes to characters and vice versa.
- java.util.Map<java.lang.String,java.lang.String>metadata - -
-          Other meta data not included above.
- byteseparator - -
-          A separator character between fields (stem, lemma, form).
- booleanusesInfixes - -
-          True if the dictionary was compiled with infix compression.
- booleanusesPrefixes - -
-          True if the dictionary was compiled with prefix compression.
-  - - - - - - - - - - -
-Constructor Summary
DictionaryMetadata(char separator, - java.lang.String encoding, - boolean usesPrefixes, - boolean usesInfixes, - java.util.Map<java.lang.String,java.lang.String> metadata) - -
-          Creates an immutable instance of DictionaryMetadata.
-  - - - - - - - -
-Method Summary
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-ATTR_NAME_SEPARATOR

-
-public static final java.lang.String ATTR_NAME_SEPARATOR
-
-
Attribute name for separator. -

-

-
See Also:
Constant Field Values
-
-
- -

-ATTR_NAME_ENCODING

-
-public static final java.lang.String ATTR_NAME_ENCODING
-
-
Attribute name for encoding. -

-

-
See Also:
Constant Field Values
-
-
- -

-ATTR_NAME_USES_PREFIXES

-
-public static final java.lang.String ATTR_NAME_USES_PREFIXES
-
-
Attribute name for usesPrefixes. -

-

-
See Also:
Constant Field Values
-
-
- -

-ATTR_NAME_USES_INFIXES

-
-public static final java.lang.String ATTR_NAME_USES_INFIXES
-
-
Attribute name for usesInfixes. -

-

-
See Also:
Constant Field Values
-
-
- -

-separator

-
-public final byte separator
-
-
A separator character between fields (stem, lemma, form). The character - must be within byte range (FSA uses bytes internally). -

-

-
-
-
- -

-encoding

-
-public final java.lang.String encoding
-
-
Encoding used for converting bytes to characters and vice versa. -

-

-
-
-
- -

-usesPrefixes

-
-public final boolean usesPrefixes
-
-
True if the dictionary was compiled with prefix compression. -

-

-
-
-
- -

-usesInfixes

-
-public final boolean usesInfixes
-
-
True if the dictionary was compiled with infix compression. -

-

-
-
-
- -

-metadata

-
-public final java.util.Map<java.lang.String,java.lang.String> metadata
-
-
Other meta data not included above. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-DictionaryMetadata

-
-public DictionaryMetadata(char separator,
-                          java.lang.String encoding,
-                          boolean usesPrefixes,
-                          boolean usesInfixes,
-                          java.util.Map<java.lang.String,java.lang.String> metadata)
-
-
Creates an immutable instance of DictionaryMetadata. -

-

- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/IStemmer.html b/doc/api/morfologik/stemming/IStemmer.html deleted file mode 100644 index 8d901f9..0000000 --- a/doc/api/morfologik/stemming/IStemmer.html +++ /dev/null @@ -1,220 +0,0 @@ - - - - - - -IStemmer - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.stemming -
-Interface IStemmer

-
-
All Known Implementing Classes:
DictionaryLookup, PolishStemmer
-
-
-
-
public interface IStemmer
- - -

-A generic "stemmer" interface in Morfologik. -

- -

-


- -

- - - - - - - - - - - - -
-Method Summary
- java.util.List<WordData>lookup(java.lang.CharSequence word) - -
-          Returns a list of WordData entries for a given word.
-  -

- - - - - - - - -
-Method Detail
- -

-lookup

-
-java.util.List<WordData> lookup(java.lang.CharSequence word)
-
-
Returns a list of WordData entries for a given word. The returned - list is never null. Depending on the stemmer's - implementation the WordData may carry the stem and additional - information (tag) or just the stem. -

- The returned list and any object it contains are not usable after a - subsequent call to this method. Any data that should be stored in between - must be copied by the caller. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/PolishStemmer.html b/doc/api/morfologik/stemming/PolishStemmer.html deleted file mode 100644 index ccaa8f4..0000000 --- a/doc/api/morfologik/stemming/PolishStemmer.html +++ /dev/null @@ -1,302 +0,0 @@ - - - - - - -PolishStemmer - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.stemming -
-Class PolishStemmer

-
-java.lang.Object
-  extended by morfologik.stemming.PolishStemmer
-
-
-
All Implemented Interfaces:
java.lang.Iterable<WordData>, IStemmer
-
-
-
-
public final class PolishStemmer
extends java.lang.Object
implements IStemmer, java.lang.Iterable<WordData>
- - -

-A dictionary-based stemmer for the Polish language. This stemmer requires an - FSA-compiled dictionary to be present in classpath resources. - - Objects of this class are not thread safe. -

- -

-

-
See Also:
DictionaryLookup
-
- -

- - - - - - - - - - - -
-Constructor Summary
PolishStemmer() - -
-          This constructor is initialized with a built-in dictionary or fails with - a runtime exception if the dictionary is not available.
-  - - - - - - - - - - - - - - - -
-Method Summary
- java.util.Iterator<WordData>iterator() - -
-          Iterates over all dictionary forms stored in this stemmer.
- java.util.List<WordData>lookup(java.lang.CharSequence word) - -
-          Returns a list of WordData entries for a given word.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-PolishStemmer

-
-public PolishStemmer()
-
-
This constructor is initialized with a built-in dictionary or fails with - a runtime exception if the dictionary is not available. -

-

- - - - - - - - -
-Method Detail
- -

-lookup

-
-public java.util.List<WordData> lookup(java.lang.CharSequence word)
-
-
Returns a list of WordData entries for a given word. The returned - list is never null. Depending on the stemmer's - implementation the WordData may carry the stem and additional - information (tag) or just the stem. -

- The returned list and any object it contains are not usable after a - subsequent call to this method. Any data that should be stored in between - must be copied by the caller. -

-

-
Specified by:
lookup in interface IStemmer
-
-
-
-
-
-
- -

-iterator

-
-public java.util.Iterator<WordData> iterator()
-
-
Iterates over all dictionary forms stored in this stemmer. -

-

-
Specified by:
iterator in interface java.lang.Iterable<WordData>
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/WordData.html b/doc/api/morfologik/stemming/WordData.html deleted file mode 100644 index 42d956b..0000000 --- a/doc/api/morfologik/stemming/WordData.html +++ /dev/null @@ -1,447 +0,0 @@ - - - - - - -WordData - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.stemming -
-Class WordData

-
-java.lang.Object
-  extended by morfologik.stemming.WordData
-
-
-
All Implemented Interfaces:
java.lang.Cloneable
-
-
-
-
public final class WordData
extends java.lang.Object
implements java.lang.Cloneable
- - -

-Stem and tag data associated with a given word. - -

- Important notes: -

-

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-protected  WordDataclone() - -
-          Declare a covariant of Object.clone() that returns a deep copy of - this object.
- booleanequals(java.lang.Object obj) - -
-           
- java.lang.CharSequencegetStem() - -
-           
- java.nio.ByteBuffergetStemBytes(java.nio.ByteBuffer target) - -
-          Copy the stem's binary data (no charset decoding) to a custom byte - buffer.
- java.lang.CharSequencegetTag() - -
-           
- java.nio.ByteBuffergetTagBytes(java.nio.ByteBuffer target) - -
-          Copy the tag's binary data (no charset decoding) to a custom byte buffer.
- java.lang.CharSequencegetWord() - -
-           
- java.nio.ByteBuffergetWordBytes(java.nio.ByteBuffer target) - -
-          Copy the inflected word's binary data (no charset decoding) to a custom - byte buffer.
- inthashCode() - -
-           
- - - - - - - -
Methods inherited from class java.lang.Object
finalize, getClass, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Method Detail
- -

-getStemBytes

-
-public java.nio.ByteBuffer getStemBytes(java.nio.ByteBuffer target)
-
-
Copy the stem's binary data (no charset decoding) to a custom byte - buffer. If the buffer is null or not large enough to hold the result, a - new buffer is allocated. -

-

-
-
-
-
Parameters:
target - Target byte buffer to copy the stem buffer to or - null if a new buffer should be allocated. -
Returns:
Returns target or the new reallocated buffer.
-
-
-
- -

-getTagBytes

-
-public java.nio.ByteBuffer getTagBytes(java.nio.ByteBuffer target)
-
-
Copy the tag's binary data (no charset decoding) to a custom byte buffer. - If the buffer is null or not large enough to hold the result, a new - buffer is allocated. -

-

-
-
-
-
Parameters:
target - Target byte buffer to copy the tag buffer to or - null if a new buffer should be allocated. -
Returns:
Returns target or the new reallocated buffer.
-
-
-
- -

-getWordBytes

-
-public java.nio.ByteBuffer getWordBytes(java.nio.ByteBuffer target)
-
-
Copy the inflected word's binary data (no charset decoding) to a custom - byte buffer. If the buffer is null or not large enough to hold the - result, a new buffer is allocated. -

-

-
-
-
-
Parameters:
target - Target byte buffer to copy the word buffer to or - null if a new buffer should be allocated. -
Returns:
Returns target or the new reallocated buffer.
-
-
-
- -

-getTag

-
-public java.lang.CharSequence getTag()
-
-
-
-
-
- -
Returns:
Return tag data decoded to a character sequence or - null if no associated tag data exists.
-
-
-
- -

-getStem

-
-public java.lang.CharSequence getStem()
-
-
-
-
-
- -
Returns:
Return stem data decoded to a character sequence or - null if no associated stem data exists.
-
-
-
- -

-getWord

-
-public java.lang.CharSequence getWord()
-
-
-
-
-
- -
Returns:
Return inflected word form data. Usually the parameter passed to - DictionaryLookup.lookup(CharSequence).
-
-
-
- -

-equals

-
-public boolean equals(java.lang.Object obj)
-
-
-
Overrides:
equals in class java.lang.Object
-
-
-
-
-
-
- -

-hashCode

-
-public int hashCode()
-
-
-
Overrides:
hashCode in class java.lang.Object
-
-
-
-
-
-
- -

-clone

-
-protected WordData clone()
-
-
Declare a covariant of Object.clone() that returns a deep copy of - this object. The content of all internal buffers is copied. -

-

-
Overrides:
clone in class java.lang.Object
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/package-frame.html b/doc/api/morfologik/stemming/package-frame.html deleted file mode 100644 index b8d12fb..0000000 --- a/doc/api/morfologik/stemming/package-frame.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - -morfologik.stemming - - - - - - - - - - - -morfologik.stemming - - - - -
-Interfaces  - -
-IStemmer
- - - - - - -
-Classes  - -
-Dictionary -
-DictionaryIterator -
-DictionaryLookup -
-DictionaryMetadata -
-PolishStemmer -
-WordData
- - - - diff --git a/doc/api/morfologik/stemming/package-summary.html b/doc/api/morfologik/stemming/package-summary.html deleted file mode 100644 index d439020..0000000 --- a/doc/api/morfologik/stemming/package-summary.html +++ /dev/null @@ -1,190 +0,0 @@ - - - - - - -morfologik.stemming - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-

-Package morfologik.stemming -

- - - - - - - - - -
-Interface Summary
IStemmerA generic "stemmer" interface in Morfologik.
-  - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Class Summary
DictionaryA dictionary combines FSA automaton and metadata describing the - internals of dictionary entries' coding (DictionaryMetadata.
DictionaryIteratorAn iterator over WordData entries of a Dictionary.
DictionaryLookupThis class implements a dictionary lookup over an FSA dictionary.
DictionaryMetadataDescription of attributes, their types and default values.
PolishStemmerA dictionary-based stemmer for the Polish language.
WordDataStem and tag data associated with a given word.
-  - -

-

-
-
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/stemming/package-tree.html b/doc/api/morfologik/stemming/package-tree.html deleted file mode 100644 index 73b3e3d..0000000 --- a/doc/api/morfologik/stemming/package-tree.html +++ /dev/null @@ -1,160 +0,0 @@ - - - - - - -morfologik.stemming Class Hierarchy - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-Hierarchy For Package morfologik.stemming -

-
-
-
Package Hierarchies:
All Packages
-
-

-Class Hierarchy -

- -

-Interface Hierarchy -

- -
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/FSABuildTool.Format.html b/doc/api/morfologik/tools/FSABuildTool.Format.html deleted file mode 100644 index af79179..0000000 --- a/doc/api/morfologik/tools/FSABuildTool.Format.html +++ /dev/null @@ -1,346 +0,0 @@ - - - - - - -FSABuildTool.Format - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Enum FSABuildTool.Format

-
-java.lang.Object
-  extended by java.lang.Enum<FSABuildTool.Format>
-      extended by morfologik.tools.FSABuildTool.Format
-
-
-
All Implemented Interfaces:
java.io.Serializable, java.lang.Comparable<FSABuildTool.Format>
-
-
-
Enclosing class:
FSABuildTool
-
-
-
-
public static enum FSABuildTool.Format
extends java.lang.Enum<FSABuildTool.Format>
- - -

-The serialization format to use for the binary output. -

- -

-


- -

- - - - - - - - - - - - - -
-Enum Constant Summary
CFSA2 - -
-           
FSA5 - -
-           
-  - - - - - - - - - - - - - - - - - - - -
-Method Summary
- FSASerializergetSerializer() - -
-           
-static FSABuildTool.FormatvalueOf(java.lang.String name) - -
-          Returns the enum constant of this type with the specified name.
-static FSABuildTool.Format[]values() - -
-          Returns an array containing the constants of this enum type, in -the order they are declared.
- - - - - - - -
Methods inherited from class java.lang.Enum
clone, compareTo, equals, finalize, getDeclaringClass, hashCode, name, ordinal, toString, valueOf
- - - - - - - -
Methods inherited from class java.lang.Object
getClass, notify, notifyAll, wait, wait, wait
-  -

- - - - - - - - -
-Enum Constant Detail
- -

-FSA5

-
-public static final FSABuildTool.Format FSA5
-
-
-
-
-
- -

-CFSA2

-
-public static final FSABuildTool.Format CFSA2
-
-
-
-
- - - - - - - - -
-Method Detail
- -

-values

-
-public static FSABuildTool.Format[] values()
-
-
Returns an array containing the constants of this enum type, in -the order they are declared. This method may be used to iterate -over the constants as follows: -
-for (FSABuildTool.Format c : FSABuildTool.Format.values())
-    System.out.println(c);
-
-

-

- -
Returns:
an array containing the constants of this enum type, in -the order they are declared
-
-
-
- -

-valueOf

-
-public static FSABuildTool.Format valueOf(java.lang.String name)
-
-
Returns the enum constant of this type with the specified name. -The string must match exactly an identifier used to declare an -enum constant in this type. (Extraneous whitespace characters are -not permitted.) -

-

-
Parameters:
name - the name of the enum constant to be returned. -
Returns:
the enum constant with the specified name -
Throws: -
java.lang.IllegalArgumentException - if this enum type has no constant -with the specified name -
java.lang.NullPointerException - if the argument is null
-
-
-
- -

-getSerializer

-
-public FSASerializer getSerializer()
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/FSABuildTool.html b/doc/api/morfologik/tools/FSABuildTool.html deleted file mode 100644 index 74a0465..0000000 --- a/doc/api/morfologik/tools/FSABuildTool.html +++ /dev/null @@ -1,522 +0,0 @@ - - - - - - -FSABuildTool - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Class FSABuildTool

-
-java.lang.Object
-  extended by morfologik.tools.FSABuildTool
-
-
-
-
public final class FSABuildTool
extends java.lang.Object
- - -

-Convert from plain text input to a serialized FSA in any of the - available FSABuildTool.Formats. -

- -

-


- -

- - - - - - - - - - - -
-Nested Class Summary
-static classFSABuildTool.Format - -
-          The serialization format to use for the binary output.
- - - - - - - - - - -
-Field Summary
-protected  org.apache.commons.cli.Optionsoptions - -
-          Command line options.
-  - - - - - - - - - - -
-Constructor Summary
FSABuildTool() - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static voidcheckSingleByte(java.lang.String chr) - -
-          Check if the argument is a single byte after conversion using platform-default - encoding.
-protected  java.lang.StringdumpLine(byte[] line, - int length) - -
-          Dump input line, byte-by-byte.
-protected  voidgo(org.apache.commons.cli.CommandLine line) - -
-          Command line entry point after parsing arguments.
-protected  voidgo(java.lang.String[] args) - -
-          Initializes application context.
-protected  voidinitializeOptions(org.apache.commons.cli.Options options) - -
-          Override and initialize options.
-protected  booleanisAvailable() - -
-          Is the tool available?
-static voidmain(java.lang.String[] args) - -
-          Command line entry point.
-protected  voidprintError(java.lang.String msg) - -
-          Print an error without an exception.
-protected  voidprintError(java.lang.String msg, - java.lang.Throwable t) - -
-          Print an error and an associated exception.
-protected  voidprintUsage() - -
-          Prints usage (options).
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-options

-
-protected final org.apache.commons.cli.Options options
-
-
Command line options. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-FSABuildTool

-
-public FSABuildTool()
-
-
- - - - - - - - -
-Method Detail
- -

-go

-
-protected void go(org.apache.commons.cli.CommandLine line)
-           throws java.lang.Exception
-
-
Command line entry point after parsing arguments. -

-

-
-
-
- -
Throws: -
java.lang.Exception
-
-
-
- -

-dumpLine

-
-protected java.lang.String dumpLine(byte[] line,
-                                    int length)
-
-
Dump input line, byte-by-byte. -

-

-
-
-
-
- -

-checkSingleByte

-
-public static void checkSingleByte(java.lang.String chr)
-
-
Check if the argument is a single byte after conversion using platform-default - encoding. -

-

-
-
-
-
- -

-printUsage

-
-protected void printUsage()
-
-
Prints usage (options). -

-

-
-
-
-
-
-
-
- -

-initializeOptions

-
-protected void initializeOptions(org.apache.commons.cli.Options options)
-
-
Override and initialize options. -

-

-
-
-
-
-
-
-
- -

-main

-
-public static void main(java.lang.String[] args)
-                 throws java.lang.Exception
-
-
Command line entry point. -

-

- -
Throws: -
java.lang.Exception
-
-
-
- -

-go

-
-protected final void go(java.lang.String[] args)
-
-
Initializes application context. -

-

-
-
-
-
- -

-printError

-
-protected void printError(java.lang.String msg,
-                          java.lang.Throwable t)
-
-
Print an error and an associated exception. -

-

-
-
-
-
- -

-printError

-
-protected void printError(java.lang.String msg)
-
-
Print an error without an exception. -

-

-
-
-
-
- -

-isAvailable

-
-protected boolean isAvailable()
-
-
Is the tool available? true by default. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/FSADumpTool.html b/doc/api/morfologik/tools/FSADumpTool.html deleted file mode 100644 index e7b9098..0000000 --- a/doc/api/morfologik/tools/FSADumpTool.html +++ /dev/null @@ -1,457 +0,0 @@ - - - - - - -FSADumpTool - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Class FSADumpTool

-
-java.lang.Object
-  extended by morfologik.tools.FSADumpTool
-
-
-
-
public final class FSADumpTool
extends java.lang.Object
- - -

-This utility will dump the information and contents of a given FSA - dictionary. It can dump dictionaries in the raw form (as fed to the - fsa_build program) or decoding compressed stem forms. -

- -

-


- -

- - - - - - - - - - - -
-Field Summary
-protected  org.apache.commons.cli.Optionsoptions - -
-          Command line options.
-  - - - - - - - - - - -
-Constructor Summary
FSADumpTool() - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-protected  voidgo(org.apache.commons.cli.CommandLine line) - -
-          Command line entry point after parsing arguments.
-protected  voidgo(java.lang.String[] args) - -
-          Initializes application context.
-protected  voidinitializeOptions(org.apache.commons.cli.Options options) - -
-          Command line options for the tool.
-protected  booleanisAvailable() - -
-          Is the tool available?
-static voidmain(java.lang.String[] args) - -
-          Command line entry point.
-protected  voidprintError(java.lang.String msg) - -
-          Print an error without an exception.
-protected  voidprintError(java.lang.String msg, - java.lang.Throwable t) - -
-          Print an error and an associated exception.
-protected  voidprintUsage() - -
-          Prints usage (options).
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Field Detail
- -

-options

-
-protected final org.apache.commons.cli.Options options
-
-
Command line options. -

-

-
-
- - - - - - - - -
-Constructor Detail
- -

-FSADumpTool

-
-public FSADumpTool()
-
-
- - - - - - - - -
-Method Detail
- -

-go

-
-protected void go(org.apache.commons.cli.CommandLine line)
-           throws java.lang.Exception
-
-
Command line entry point after parsing arguments. -

-

-
-
-
- -
Throws: -
java.lang.Exception
-
-
-
- -

-initializeOptions

-
-protected void initializeOptions(org.apache.commons.cli.Options options)
-
-
Command line options for the tool. -

-

-
-
-
-
-
-
-
- -

-main

-
-public static void main(java.lang.String[] args)
-                 throws java.lang.Exception
-
-
Command line entry point. -

-

- -
Throws: -
java.lang.Exception
-
-
-
- -

-go

-
-protected final void go(java.lang.String[] args)
-
-
Initializes application context. -

-

-
-
-
-
- -

-printError

-
-protected void printError(java.lang.String msg,
-                          java.lang.Throwable t)
-
-
Print an error and an associated exception. -

-

-
-
-
-
- -

-printError

-
-protected void printError(java.lang.String msg)
-
-
Print an error without an exception. -

-

-
-
-
-
- -

-printUsage

-
-protected void printUsage()
-
-
Prints usage (options). -

-

-
-
-
-
- -

-isAvailable

-
-protected boolean isAvailable()
-
-
Is the tool available? true by default. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/IMessageLogger.html b/doc/api/morfologik/tools/IMessageLogger.html deleted file mode 100644 index d84023a..0000000 --- a/doc/api/morfologik/tools/IMessageLogger.html +++ /dev/null @@ -1,272 +0,0 @@ - - - - - - -IMessageLogger - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Interface IMessageLogger

-
-
All Known Implementing Classes:
WriterMessageLogger
-
-
-
-
public interface IMessageLogger
- - -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- voidendPart() - -
-           
- voidlog(java.lang.String msg) - -
-          Log progress to the console.
- voidlog(java.lang.String header, - java.lang.Object v) - -
-          Log a two-part message.
- voidstartPart(java.lang.String header) - -
-          Log message header and save current time.
-  -

- - - - - - - - -
-Method Detail
- -

-log

-
-void log(java.lang.String msg)
-
-
Log progress to the console. -

-

-
-
-
-
- -

-startPart

-
-void startPart(java.lang.String header)
-
-
Log message header and save current time. -

-

-
-
-
-
- -

-endPart

-
-void endPart()
-
-
-
-
-
-
- -

-log

-
-void log(java.lang.String header,
-         java.lang.Object v)
-
-
Log a two-part message. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/InflectionFramesTool.html b/doc/api/morfologik/tools/InflectionFramesTool.html deleted file mode 100644 index 6d3d792..0000000 --- a/doc/api/morfologik/tools/InflectionFramesTool.html +++ /dev/null @@ -1,279 +0,0 @@ - - - - - - -InflectionFramesTool - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Class InflectionFramesTool

-
-java.lang.Object
-  extended by morfologik.tools.InflectionFramesTool
-
-
-
-
public class InflectionFramesTool
extends java.lang.Object
- - -

-Calculate inflection frames from the Polish dictionary. -

- -

-


- -

- - - - - - - - - - - -
-Constructor Summary
InflectionFramesTool() - -
-           
-  - - - - - - - - - - - - - - - -
-Method Summary
- voidinflectionFrames() - -
-           
-static voidmain(java.lang.String[] args) - -
-           
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-InflectionFramesTool

-
-public InflectionFramesTool()
-
-
- - - - - - - - -
-Method Detail
- -

-main

-
-public static void main(java.lang.String[] args)
-                 throws java.io.IOException
-
-
- -
Throws: -
java.io.IOException
-
-
-
- -

-inflectionFrames

-
-public void inflectionFrames()
-                      throws java.io.IOException
-
-
- -
Throws: -
java.io.IOException
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/Launcher.html b/doc/api/morfologik/tools/Launcher.html deleted file mode 100644 index da29ad9..0000000 --- a/doc/api/morfologik/tools/Launcher.html +++ /dev/null @@ -1,259 +0,0 @@ - - - - - - -Launcher - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Class Launcher

-
-java.lang.Object
-  extended by morfologik.tools.Launcher
-
-
-
-
public final class Launcher
extends java.lang.Object
- - -

-A launcher for other command-line tools. -

- -

-


- -

- - - - - - - - - - - -
-Constructor Summary
Launcher() - -
-           
-  - - - - - - - - - - - -
-Method Summary
-static voidmain(java.lang.String[] args) - -
-          Command line entry point.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-Launcher

-
-public Launcher()
-
-
- - - - - - - - -
-Method Detail
- -

-main

-
-public static void main(java.lang.String[] args)
-                 throws java.lang.Exception
-
-
Command line entry point. -

-

- -
Throws: -
java.lang.Exception
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/MorphEncoder.html b/doc/api/morfologik/tools/MorphEncoder.html deleted file mode 100644 index fc9fd18..0000000 --- a/doc/api/morfologik/tools/MorphEncoder.html +++ /dev/null @@ -1,584 +0,0 @@ - - - - - - -MorphEncoder - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Class MorphEncoder

-
-java.lang.Object
-  extended by morfologik.tools.MorphEncoder
-
-
-
-
public final class MorphEncoder
extends java.lang.Object
- - -

-A class that converts tabular data to fsa morphological format. Three formats - are supported: -

-

- -

-


- -

- - - - - - - - - - - - - - -
-Constructor Summary
MorphEncoder() - -
-           
MorphEncoder(byte annotationSeparator) - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-protected static java.lang.StringasString(byte[] str, - java.lang.String encoding) - -
-          Converts a byte array to a given encoding.
-static intcommonPrefix(byte[] s1, - byte[] s2) - -
-           
- byte[]infixEncode(byte[] wordForm, - byte[] wordLemma, - byte[] wordTag) - -
-          This method converts wordform, wordLemma and the tag to the form: - - inflected_form + MLKending + tags - - - where '+' is a separator, M is the position of characters to be deleted - towards the beginning of the inflected form ("A" means from the - beginning, "B" from the second character, "C" - from the third one, and - so on), L is the number of characters to be deleted from the position - specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a - character that specifies how many characters should be deleted from the - end of the inflected form to produce the lexeme by concatenating the - stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so - on).
- java.lang.StringinfixEncodeUTF8(java.lang.String wordForm, - java.lang.String wordLemma, - java.lang.String wordTag) - -
-          A UTF-8 variant of infixEncode(byte[], byte[], byte[]).
- byte[]prefixEncode(byte[] wordForm, - byte[] wordLemma, - byte[] wordTag) - -
-          This method converts wordform, wordLemma and the tag to the form: - - - - inflected_form + LKending + tags - - - where '+' is a separator, L is the number of characters to be deleted - from the beginning of the word ("A" means none, "B" means one, "C" - 2, - etc.), K is a character that specifies how many characters should be - deleted from the end of the inflected form to produce the lexeme by - concatenating the stripped string with the ending ("A" means none, - "B' - 1, "C" - 2, and so on).
- java.lang.StringprefixEncodeUTF8(java.lang.String wordForm, - java.lang.String wordLemma, - java.lang.String wordTag) - -
-          A UTF-8 variant of prefixEncode(byte[], byte[], byte[]) This - method converts wordform, wordLemma and the tag to the form: - - inflected_form + LKending + tags - - - where '+' is a separator, L is the number of characters to be deleted - from the beginning of the word ("A" means none, "B" means one, "C" - 2, - etc.), K is a character that specifies how many characters should be - deleted from the end of the inflected form to produce the lexeme by - concatenating the stripped string with the ending ("A" means none, - "B' - 1, "C" - 2, and so on).
- byte[]standardEncode(byte[] wordForm, - byte[] wordLemma, - byte[] wordTag) - -
-          This method converts the wordForm, wordLemma and tag to the form: - - - wordForm + Kending + tags - - - where '+' is a separator, K is a character that specifies how many - characters should be deleted from the end of the inflected form to - produce the lexeme by concatenating the stripped string with the ending.
- java.lang.StringstandardEncodeUTF8(java.lang.String wordForm, - java.lang.String wordLemma, - java.lang.String wordTag) - -
-          A UTF-8 variant of standardEncode(byte[], byte[], byte[]) This - method converts the wordForm, wordLemma and tag to the form: - - - wordForm + Kending + tags - - - where '+' is a separator, K is a character that specifies how many - characters should be deleted from the end of the inflected form to - produce the lexeme by concatenating the stripped string with the ending.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-MorphEncoder

-
-public MorphEncoder()
-
-
-
- -

-MorphEncoder

-
-public MorphEncoder(byte annotationSeparator)
-
-
- - - - - - - - -
-Method Detail
- -

-commonPrefix

-
-public static int commonPrefix(byte[] s1,
-                               byte[] s2)
-
-
-
-
-
-
- -

-standardEncode

-
-public byte[] standardEncode(byte[] wordForm,
-                             byte[] wordLemma,
-                             byte[] wordTag)
-
-
This method converts the wordForm, wordLemma and tag to the form: - -
- wordForm + Kending + tags
- 
- - where '+' is a separator, K is a character that specifies how many - characters should be deleted from the end of the inflected form to - produce the lexeme by concatenating the stripped string with the ending. -

-

-
-
-
-
- -

-prefixEncode

-
-public byte[] prefixEncode(byte[] wordForm,
-                           byte[] wordLemma,
-                           byte[] wordTag)
-
-
This method converts wordform, wordLemma and the tag to the form: -

- -

- inflected_form + LKending + tags
- 
-

- where '+' is a separator, L is the number of characters to be deleted - from the beginning of the word ("A" means none, "B" means one, "C" - 2, - etc.), K is a character that specifies how many characters should be - deleted from the end of the inflected form to produce the lexeme by - concatenating the stripped string with the ending ("A" means none, - "B' - 1, "C" - 2, and so on). -

-

-
Parameters:
wordForm - - inflected word form
wordLemma - - canonical form
wordTag - - tag -
Returns:
the encoded string
-
-
-
- -

-infixEncode

-
-public byte[] infixEncode(byte[] wordForm,
-                          byte[] wordLemma,
-                          byte[] wordTag)
-
-
This method converts wordform, wordLemma and the tag to the form: -
- inflected_form + MLKending + tags
- 
-

- where '+' is a separator, M is the position of characters to be deleted - towards the beginning of the inflected form ("A" means from the - beginning, "B" from the second character, "C" - from the third one, and - so on), L is the number of characters to be deleted from the position - specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a - character that specifies how many characters should be deleted from the - end of the inflected form to produce the lexeme by concatenating the - stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so - on). -

-

-
Parameters:
wordForm - - inflected word form
wordLemma - - canonical form
wordTag - - tag -
Returns:
the encoded string
-
-
-
- -

-asString

-
-protected static java.lang.String asString(byte[] str,
-                                           java.lang.String encoding)
-
-
Converts a byte array to a given encoding. -

-

-
Parameters:
str - Byte-array to be converted. -
Returns:
Java String. If decoding is unsuccessful, the string is empty.
-
-
-
- -

-standardEncodeUTF8

-
-public java.lang.String standardEncodeUTF8(java.lang.String wordForm,
-                                           java.lang.String wordLemma,
-                                           java.lang.String wordTag)
-                                    throws java.io.UnsupportedEncodingException
-
-
A UTF-8 variant of standardEncode(byte[], byte[], byte[]) This - method converts the wordForm, wordLemma and tag to the form: - -
- wordForm + Kending + tags
- 
- - where '+' is a separator, K is a character that specifies how many - characters should be deleted from the end of the inflected form to - produce the lexeme by concatenating the stripped string with the ending. -

-

- -
Throws: -
java.io.UnsupportedEncodingException
-
-
-
- -

-prefixEncodeUTF8

-
-public java.lang.String prefixEncodeUTF8(java.lang.String wordForm,
-                                         java.lang.String wordLemma,
-                                         java.lang.String wordTag)
-                                  throws java.io.UnsupportedEncodingException
-
-
A UTF-8 variant of prefixEncode(byte[], byte[], byte[]) This - method converts wordform, wordLemma and the tag to the form: -
- inflected_form + LKending + tags
- 
-

- where '+' is a separator, L is the number of characters to be deleted - from the beginning of the word ("A" means none, "B" means one, "C" - 2, - etc.), K is a character that specifies how many characters should be - deleted from the end of the inflected form to produce the lexeme by - concatenating the stripped string with the ending ("A" means none, - "B' - 1, "C" - 2, and so on). -

-

-
Parameters:
wordForm - - inflected word form
wordLemma - - canonical form
wordTag - - tag -
Returns:
the encoded string -
Throws: -
java.io.UnsupportedEncodingException
-
-
-
- -

-infixEncodeUTF8

-
-public java.lang.String infixEncodeUTF8(java.lang.String wordForm,
-                                        java.lang.String wordLemma,
-                                        java.lang.String wordTag)
-                                 throws java.io.UnsupportedEncodingException
-
-
A UTF-8 variant of infixEncode(byte[], byte[], byte[]). - - This method converts wordform, wordLemma and the tag to the form: -
- inflected_form + MLKending + tags
- 
-

- where '+' is a separator, M is the position of characters to be deleted - towards the beginning of the inflected form ("A" means from the - beginning, "B" from the second character, "C" - from the third one, and - so on), L is the number of characters to be deleted from the position - specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a - character that specifies how many characters should be deleted from the - end of the inflected form to produce the lexeme by concatenating the - stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so - on). -

-

-
Parameters:
wordForm - - inflected word form
wordLemma - - canonical form
wordTag - - tag -
Returns:
the encoded string -
Throws: -
java.io.UnsupportedEncodingException
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/WriterMessageLogger.html b/doc/api/morfologik/tools/WriterMessageLogger.html deleted file mode 100644 index 0cd0cf7..0000000 --- a/doc/api/morfologik/tools/WriterMessageLogger.html +++ /dev/null @@ -1,337 +0,0 @@ - - - - - - -WriterMessageLogger - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.tools -
-Class WriterMessageLogger

-
-java.lang.Object
-  extended by morfologik.tools.WriterMessageLogger
-
-
-
All Implemented Interfaces:
IMessageLogger
-
-
-
-
public class WriterMessageLogger
extends java.lang.Object
implements IMessageLogger
- - -

-A logger dumping info to System.err. -

- -

-


- -

- - - - - - - - - - - -
-Constructor Summary
WriterMessageLogger(java.io.PrintWriter w) - -
-           
-  - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
- voidendPart() - -
-           
- voidlog(java.lang.String msg) - -
-          Log progress to the console.
- voidlog(java.lang.String header, - java.lang.Object v) - -
-          Log a two-part message.
- voidstartPart(java.lang.String header) - -
-          Log message header and save current time.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Constructor Detail
- -

-WriterMessageLogger

-
-public WriterMessageLogger(java.io.PrintWriter w)
-
-
- - - - - - - - -
-Method Detail
- -

-log

-
-public void log(java.lang.String msg)
-
-
Description copied from interface: IMessageLogger
-
Log progress to the console. -

-

-
Specified by:
log in interface IMessageLogger
-
-
-
-
-
-
- -

-log

-
-public void log(java.lang.String header,
-                java.lang.Object v)
-
-
Description copied from interface: IMessageLogger
-
Log a two-part message. -

-

-
Specified by:
log in interface IMessageLogger
-
-
-
-
-
-
- -

-startPart

-
-public void startPart(java.lang.String header)
-
-
Description copied from interface: IMessageLogger
-
Log message header and save current time. -

-

-
Specified by:
startPart in interface IMessageLogger
-
-
-
-
-
-
- -

-endPart

-
-public void endPart()
-
-
-
Specified by:
endPart in interface IMessageLogger
-
-
-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/package-frame.html b/doc/api/morfologik/tools/package-frame.html deleted file mode 100644 index 1648f37..0000000 --- a/doc/api/morfologik/tools/package-frame.html +++ /dev/null @@ -1,64 +0,0 @@ - - - - - - -morfologik.tools - - - - - - - - - - - -morfologik.tools - - - - -
-Interfaces  - -
-IMessageLogger
- - - - - - -
-Classes  - -
-FSABuildTool -
-FSADumpTool -
-InflectionFramesTool -
-Launcher -
-MorphEncoder -
-WriterMessageLogger
- - - - - - -
-Enums  - -
-FSABuildTool.Format
- - - - diff --git a/doc/api/morfologik/tools/package-summary.html b/doc/api/morfologik/tools/package-summary.html deleted file mode 100644 index f926b20..0000000 --- a/doc/api/morfologik/tools/package-summary.html +++ /dev/null @@ -1,205 +0,0 @@ - - - - - - -morfologik.tools - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-

-Package morfologik.tools -

- - - - - - - - - -
-Interface Summary
IMessageLogger 
-  - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Class Summary
FSABuildToolConvert from plain text input to a serialized FSA in any of the - available FSABuildTool.Formats.
FSADumpToolThis utility will dump the information and contents of a given FSA - dictionary.
InflectionFramesToolCalculate inflection frames from the Polish dictionary.
LauncherA launcher for other command-line tools.
MorphEncoderA class that converts tabular data to fsa morphological format.
WriterMessageLoggerA logger dumping info to System.err.
-  - -

- - - - - - - - - -
-Enum Summary
FSABuildTool.FormatThe serialization format to use for the binary output.
-  - -

-

-
-
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/tools/package-tree.html b/doc/api/morfologik/tools/package-tree.html deleted file mode 100644 index d4395e1..0000000 --- a/doc/api/morfologik/tools/package-tree.html +++ /dev/null @@ -1,167 +0,0 @@ - - - - - - -morfologik.tools Class Hierarchy - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-Hierarchy For Package morfologik.tools -

-
-
-
Package Hierarchies:
All Packages
-
-

-Class Hierarchy -

- -

-Interface Hierarchy -

- -

-Enum Hierarchy -

- -
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/util/Arrays.html b/doc/api/morfologik/util/Arrays.html deleted file mode 100644 index 9add801..0000000 --- a/doc/api/morfologik/util/Arrays.html +++ /dev/null @@ -1,343 +0,0 @@ - - - - - - -Arrays - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.util -
-Class Arrays

-
-java.lang.Object
-  extended by morfologik.util.Arrays
-
-
-
-
public final class Arrays
extends java.lang.Object
- - -

-Compatibility layer for JVM 1.5. -

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static booleanequals(boolean[] a1, - int a1s, - boolean[] a2, - int a2s, - int length) - -
-          Compare two arrays for equality.
-static booleanequals(byte[] a1, - int a1s, - byte[] a2, - int a2s, - int length) - -
-          Compare two arrays for equality.
-static booleanequals(int[] a1, - int a1s, - int[] a2, - int a2s, - int length) - -
-          Compare two arrays for equality.
-static booleanreferenceEquals(java.lang.Object[] a1, - int a1s, - java.lang.Object[] a2, - int a2s, - int length) - -
-          Compare two lists of objects for reference-equality.
-static java.lang.StringtoString(byte[] bytes, - int start, - int length) - -
-          Convert an array of strings to bytes.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Method Detail
- -

-referenceEquals

-
-public static boolean referenceEquals(java.lang.Object[] a1,
-                                      int a1s,
-                                      java.lang.Object[] a2,
-                                      int a2s,
-                                      int length)
-
-
Compare two lists of objects for reference-equality. -

-

-
-
-
-
- -

-equals

-
-public static boolean equals(byte[] a1,
-                             int a1s,
-                             byte[] a2,
-                             int a2s,
-                             int length)
-
-
Compare two arrays for equality. -

-

-
-
-
-
- -

-equals

-
-public static boolean equals(boolean[] a1,
-                             int a1s,
-                             boolean[] a2,
-                             int a2s,
-                             int length)
-
-
Compare two arrays for equality. -

-

-
-
-
-
- -

-equals

-
-public static boolean equals(int[] a1,
-                             int a1s,
-                             int[] a2,
-                             int a2s,
-                             int length)
-
-
Compare two arrays for equality. -

-

-
-
-
-
- -

-toString

-
-public static java.lang.String toString(byte[] bytes,
-                                        int start,
-                                        int length)
-
-
Convert an array of strings to bytes. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/util/BufferUtils.html b/doc/api/morfologik/util/BufferUtils.html deleted file mode 100644 index 58d8a88..0000000 --- a/doc/api/morfologik/util/BufferUtils.html +++ /dev/null @@ -1,273 +0,0 @@ - - - - - - -BufferUtils - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.util -
-Class BufferUtils

-
-java.lang.Object
-  extended by morfologik.util.BufferUtils
-
-
-
-
public final class BufferUtils
extends java.lang.Object
- - -

-Utility functions for buffers. -

- -

-


- -

- - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static java.nio.ByteBufferensureCapacity(java.nio.ByteBuffer buffer, - int capacity) - -
-          Ensure the byte buffer's capacity.
-static java.nio.CharBufferensureCapacity(java.nio.CharBuffer buffer, - int capacity) - -
-          Ensure the char buffer's capacity.
-static java.lang.StringtoString(java.nio.ByteBuffer sequence) - -
-          Convert a byte buffer to a string in platform default encoding.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Method Detail
- -

-ensureCapacity

-
-public static java.nio.ByteBuffer ensureCapacity(java.nio.ByteBuffer buffer,
-                                                 int capacity)
-
-
Ensure the byte buffer's capacity. If a new buffer is allocated, its - content is empty (the old buffer's contents is not copied). -

-

-
Parameters:
buffer - The buffer to check or null if a new buffer - should be allocated.
-
-
-
- -

-ensureCapacity

-
-public static java.nio.CharBuffer ensureCapacity(java.nio.CharBuffer buffer,
-                                                 int capacity)
-
-
Ensure the char buffer's capacity. If a new buffer is allocated, its - content is empty (the old buffer's contents is not copied). -

-

-
Parameters:
buffer - The buffer to check or null if a new buffer - should be allocated.
-
-
-
- -

-toString

-
-public static java.lang.String toString(java.nio.ByteBuffer sequence)
-
-
Convert a byte buffer to a string in platform default encoding. -

-

-
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/util/FileUtils.html b/doc/api/morfologik/util/FileUtils.html deleted file mode 100644 index 438e851..0000000 --- a/doc/api/morfologik/util/FileUtils.html +++ /dev/null @@ -1,424 +0,0 @@ - - - - - - -FileUtils - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.util -
-Class FileUtils

-
-java.lang.Object
-  extended by morfologik.util.FileUtils
-
-
-
-
public final class FileUtils
extends java.lang.Object
- - -

-Utility functions. -

- -

-


- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Method Summary
-static voidassertExists(java.io.File fsaFile, - boolean requireFile, - boolean requireDirectory) - -
-          Checks if the given file exists.
-static voidclose(java.io.Closeable... closeables) - -
-          Force any non-null closeables.
-static bytereadByte(java.io.InputStream in) - -
-          Read exactly one byte from the input stream.
-static byte[]readFully(java.io.InputStream stream) - -
-          Reads all bytes from an input stream (until EOF).
-static voidreadFully(java.io.InputStream in, - byte[] array) - -
-          Read enough bytes to fill array If there are not enough - bytes, throw an exception.
-static intreadInt(java.io.InputStream in) - -
-          Read exactly 4 bytes from the input stream.
-static shortreadShort(java.io.InputStream in) - -
-          Read exactly 2 bytes from the input stream.
-static voidwriteInt(java.io.OutputStream os, - int v) - -
-           
-static voidwriteShort(java.io.OutputStream os, - short v) - -
-           
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Method Detail
- -

-assertExists

-
-public static void assertExists(java.io.File fsaFile,
-                                boolean requireFile,
-                                boolean requireDirectory)
-                         throws java.io.IOException
-
-
Checks if the given file exists. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-close

-
-public static void close(java.io.Closeable... closeables)
-
-
Force any non-null closeables. -

-

-
-
-
-
- -

-readFully

-
-public static byte[] readFully(java.io.InputStream stream)
-                        throws java.io.IOException
-
-
Reads all bytes from an input stream (until EOF). -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-readFully

-
-public static void readFully(java.io.InputStream in,
-                             byte[] array)
-                      throws java.io.IOException
-
-
Read enough bytes to fill array If there are not enough - bytes, throw an exception. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-readInt

-
-public static int readInt(java.io.InputStream in)
-                   throws java.io.IOException
-
-
Read exactly 4 bytes from the input stream. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-writeInt

-
-public static void writeInt(java.io.OutputStream os,
-                            int v)
-                     throws java.io.IOException
-
-
- -
Throws: -
java.io.IOException
-
-
-
- -

-readShort

-
-public static short readShort(java.io.InputStream in)
-                       throws java.io.IOException
-
-
Read exactly 2 bytes from the input stream. -

-

- -
Throws: -
java.io.IOException
-
-
-
- -

-readByte

-
-public static byte readByte(java.io.InputStream in)
-                     throws java.io.IOException
-
-
Read exactly one byte from the input stream. -

-

- -
Throws: -
java.io.EOFException - if EOF is reached. -
java.io.IOException
-
-
-
- -

-writeShort

-
-public static void writeShort(java.io.OutputStream os,
-                              short v)
-                       throws java.io.IOException
-
-
- -
Throws: -
java.io.IOException
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/util/ResourceUtils.html b/doc/api/morfologik/util/ResourceUtils.html deleted file mode 100644 index b924071..0000000 --- a/doc/api/morfologik/util/ResourceUtils.html +++ /dev/null @@ -1,228 +0,0 @@ - - - - - - -ResourceUtils - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- -

- -morfologik.util -
-Class ResourceUtils

-
-java.lang.Object
-  extended by morfologik.util.ResourceUtils
-
-
-
-
public final class ResourceUtils
extends java.lang.Object
- - -

-Resource management utilities. -

- -

-


- -

- - - - - - - - - - - - -
-Method Summary
-static java.io.InputStreamopenInputStream(java.lang.String resource) - -
-          Returns an input stream to the resource.
- - - - - - - -
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
-  -

- - - - - - - - -
-Method Detail
- -

-openInputStream

-
-public static java.io.InputStream openInputStream(java.lang.String resource)
-                                           throws java.io.IOException
-
-
Returns an input stream to the resource. -

-

-
Parameters:
resource - The path leading to the resource. Can be an URL, a path - leading to a class resource or a File. -
Returns:
InputStream instance. -
Throws: -
java.io.IOException - If the resource could not be found or opened.
-
-
- -
- - - - - - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/util/package-frame.html b/doc/api/morfologik/util/package-frame.html deleted file mode 100644 index 1803ab0..0000000 --- a/doc/api/morfologik/util/package-frame.html +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - -morfologik.util - - - - - - - - - - - -morfologik.util - - - - -
-Classes  - -
-Arrays -
-BufferUtils -
-FileUtils -
-ResourceUtils
- - - - diff --git a/doc/api/morfologik/util/package-summary.html b/doc/api/morfologik/util/package-summary.html deleted file mode 100644 index c3ce8cb..0000000 --- a/doc/api/morfologik/util/package-summary.html +++ /dev/null @@ -1,167 +0,0 @@ - - - - - - -morfologik.util - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-

-Package morfologik.util -

- - - - - - - - - - - - - - - - - - - - - -
-Class Summary
ArraysCompatibility layer for JVM 1.5.
BufferUtilsUtility functions for buffers.
FileUtilsUtility functions.
ResourceUtilsResource management utilities.
-  - -

-

-
-
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/morfologik/util/package-tree.html b/doc/api/morfologik/util/package-tree.html deleted file mode 100644 index c34d1a1..0000000 --- a/doc/api/morfologik/util/package-tree.html +++ /dev/null @@ -1,151 +0,0 @@ - - - - - - -morfologik.util Class Hierarchy - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-Hierarchy For Package morfologik.util -

-
-
-
Package Hierarchies:
All Packages
-
-

-Class Hierarchy -

- -
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/overview-frame.html b/doc/api/overview-frame.html deleted file mode 100644 index 3e3c6be..0000000 --- a/doc/api/overview-frame.html +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - -Overview List - - - - - - - - - - - - - - - -
-
- - - - - -
All Classes -

- -Packages -
-morfologik.fsa -
-morfologik.stemming -
-morfologik.tools -
-morfologik.util -
-

- -

-  - - diff --git a/doc/api/overview-summary.html b/doc/api/overview-summary.html deleted file mode 100644 index 250077d..0000000 --- a/doc/api/overview-summary.html +++ /dev/null @@ -1,161 +0,0 @@ - - - - - - -Overview - - - - - - - - - - - - -


- - - - - - - - - - - - - - - -
- -
- - - -
- - - - - - - - - - - - - - - - - - - - - -
-Packages
morfologik.fsa 
morfologik.stemming 
morfologik.tools 
morfologik.util 
- -


- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/overview-tree.html b/doc/api/overview-tree.html deleted file mode 100644 index 4d068a8..0000000 --- a/doc/api/overview-tree.html +++ /dev/null @@ -1,176 +0,0 @@ - - - - - - -Class Hierarchy - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - -
-
-

-Hierarchy For All Packages

-
-
-
Package Hierarchies:
morfologik.fsa, morfologik.stemming, morfologik.tools, morfologik.util
-
-

-Class Hierarchy -

- -

-Interface Hierarchy -

- -

-Enum Hierarchy -

- -
- - - - - - - - - - - - - - - -
- -
- - - -
- - - diff --git a/doc/api/package-list b/doc/api/package-list deleted file mode 100644 index dd0082d..0000000 --- a/doc/api/package-list +++ /dev/null @@ -1,4 +0,0 @@ -morfologik.fsa -morfologik.stemming -morfologik.tools -morfologik.util diff --git a/doc/api/resources/inherit.gif b/doc/api/resources/inherit.gif deleted file mode 100644 index c814867..0000000 Binary files a/doc/api/resources/inherit.gif and /dev/null differ diff --git a/doc/api/stylesheet.css b/doc/api/stylesheet.css deleted file mode 100644 index cbd3428..0000000 --- a/doc/api/stylesheet.css +++ /dev/null @@ -1,29 +0,0 @@ -/* Javadoc style sheet */ - -/* Define colors, fonts and other style attributes here to override the defaults */ - -/* Page background color */ -body { background-color: #FFFFFF; color:#000000 } - -/* Headings */ -h1 { font-size: 145% } - -/* Table colors */ -.TableHeadingColor { background: #CCCCFF; color:#000000 } /* Dark mauve */ -.TableSubHeadingColor { background: #EEEEFF; color:#000000 } /* Light mauve */ -.TableRowColor { background: #FFFFFF; color:#000000 } /* White */ - -/* Font used in left-hand frame lists */ -.FrameTitleFont { font-size: 100%; font-family: Helvetica, Arial, sans-serif; color:#000000 } -.FrameHeadingFont { font-size: 90%; font-family: Helvetica, Arial, sans-serif; color:#000000 } -.FrameItemFont { font-size: 90%; font-family: Helvetica, Arial, sans-serif; color:#000000 } - -/* Navigation bar fonts and colors */ -.NavBarCell1 { background-color:#EEEEFF; color:#000000} /* Light mauve */ -.NavBarCell1Rev { background-color:#00008B; color:#FFFFFF} /* Dark Blue */ -.NavBarFont1 { font-family: Arial, Helvetica, sans-serif; color:#000000;color:#000000;} -.NavBarFont1Rev { font-family: Arial, Helvetica, sans-serif; color:#FFFFFF;color:#FFFFFF;} - -.NavBarCell2 { font-family: Arial, Helvetica, sans-serif; background-color:#FFFFFF; color:#000000} -.NavBarCell3 { font-family: Arial, Helvetica, sans-serif; background-color:#FFFFFF; color:#000000} - diff --git 
a/lib/commons-cli-1.2.LICENSE b/lib/commons-cli-1.2.LICENSE deleted file mode 100644 index 57bc88a..0000000 --- a/lib/commons-cli-1.2.LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. 
Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative 
Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - diff --git a/lib/commons-cli-1.2.jar b/lib/commons-cli-1.2.jar deleted file mode 100644 index ce4b9ff..0000000 Binary files a/lib/commons-cli-1.2.jar and /dev/null differ diff --git a/lib/hppc-0.3.2.jar b/lib/hppc-0.3.2.jar deleted file mode 100644 index 2392b00..0000000 Binary files a/lib/hppc-0.3.2.jar and /dev/null differ diff --git a/lib/hppc.LICENSE b/lib/hppc.LICENSE deleted file mode 100644 index d645695..0000000 --- a/lib/hppc.LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/lib/junit-4.7.jar b/lib/junit-4.7.jar deleted file mode 100644 index 700ad69..0000000 Binary files a/lib/junit-4.7.jar and /dev/null differ diff --git a/lib/junit-benchmarks-0.1.0.jar b/lib/junit-benchmarks-0.1.0.jar deleted file mode 100644 index 225c2a6..0000000 Binary files a/lib/junit-benchmarks-0.1.0.jar and /dev/null differ diff --git a/lib/junit-benchmarks.LICENSE b/lib/junit-benchmarks.LICENSE deleted file mode 100644 index d645695..0000000 --- a/lib/junit-benchmarks.LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/lib/junit.LICENSE b/lib/junit.LICENSE deleted file mode 100644 index 4d42e47..0000000 --- a/lib/junit.LICENSE +++ /dev/null @@ -1,88 +0,0 @@ -Common Public License - v 1.0 - -THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. - -1. DEFINITIONS - -"Contribution" means: - - a) in the case of the initial Contributor, the initial code and documentation distributed under this Agreement, and - b) in the case of each subsequent Contributor: - - i) changes to the Program, and - - ii) additions to the Program; - - where such changes and/or additions to the Program originate from and are distributed by that particular Contributor. 
A Contribution 'originates' from a Contributor if it was added to the Program by such Contributor itself or anyone acting on such Contributor's behalf. Contributions do not include additions to the Program which: (i) are separate modules of software distributed in conjunction with the Program under their own license agreement, and (ii) are not derivative works of the Program. - -"Contributor" means any person or entity that distributes the Program. - -"Licensed Patents " mean patent claims licensable by a Contributor which are necessarily infringed by the use or sale of its Contribution alone or when combined with the Program. - -"Program" means the Contributions distributed in accordance with this Agreement. - -"Recipient" means anyone who receives the Program under this Agreement, including all Contributors. - -2. GRANT OF RIGHTS - - a) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, distribute and sublicense the Contribution of such Contributor, if any, and such derivative works, in source code and object code form. - - b) Subject to the terms of this Agreement, each Contributor hereby grants Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed Patents to make, use, sell, offer to sell, import and otherwise transfer the Contribution of such Contributor, if any, in source code and object code form. This patent license shall apply to the combination of the Contribution and the Program if, at the time the Contribution is added by the Contributor, such addition of the Contribution causes such combination to be covered by the Licensed Patents. The patent license shall not apply to any other combinations which include the Contribution. No hardware per se is licensed hereunder. 
- - c) Recipient understands that although each Contributor grants the licenses to its Contributions set forth herein, no assurances are provided by any Contributor that the Program does not infringe the patent or other intellectual property rights of any other entity. Each Contributor disclaims any liability to Recipient for claims brought by any other entity based on infringement of intellectual property rights or otherwise. As a condition to exercising the rights and licenses granted hereunder, each Recipient hereby assumes sole responsibility to secure any other intellectual property rights needed, if any. For example, if a third party patent license is required to allow Recipient to distribute the Program, it is Recipient's responsibility to acquire that license before distributing the Program. - - d) Each Contributor represents that to its knowledge it has sufficient copyright rights in its Contribution, if any, to grant the copyright license set forth in this Agreement. - -3. REQUIREMENTS - -A Contributor may choose to distribute the Program in object code form under its own license agreement, provided that: - - a) it complies with the terms and conditions of this Agreement; and - - b) its license agreement: - - i) effectively disclaims on behalf of all Contributors all warranties and conditions, express and implied, including warranties or conditions of title and non-infringement, and implied warranties or conditions of merchantability and fitness for a particular purpose; - - ii) effectively excludes on behalf of all Contributors all liability for damages, including direct, indirect, special, incidental and consequential damages, such as lost profits; - - iii) states that any provisions which differ from this Agreement are offered by that Contributor alone and not by any other party; and - - iv) states that source code for the Program is available from such Contributor, and informs licensees how to obtain it in a reasonable manner on or through a medium 
customarily used for software exchange. - -When the Program is made available in source code form: - - a) it must be made available under this Agreement; and - - b) a copy of this Agreement must be included with each copy of the Program. - -Contributors may not remove or alter any copyright notices contained within the Program. - -Each Contributor must identify itself as the originator of its Contribution, if any, in a manner that reasonably allows subsequent Recipients to identify the originator of the Contribution. - -4. COMMERCIAL DISTRIBUTION - -Commercial distributors of software may accept certain responsibilities with respect to end users, business partners and the like. While this license is intended to facilitate the commercial use of the Program, the Contributor who includes the Program in a commercial product offering should do so in a manner which does not create potential liability for other Contributors. Therefore, if a Contributor includes the Program in a commercial product offering, such Contributor ("Commercial Contributor") hereby agrees to defend and indemnify every other Contributor ("Indemnified Contributor") against any losses, damages and costs (collectively "Losses") arising from claims, lawsuits and other legal actions brought by a third party against the Indemnified Contributor to the extent caused by the acts or omissions of such Commercial Contributor in connection with its distribution of the Program in a commercial product offering. The obligations in this section do not apply to any claims or Losses relating to any actual or alleged intellectual property infringement. In order to qualify, an Indemnified Contributor must: a) promptly notify the Commercial Contributor in writing of such claim, and b) allow the Commercial Contributor to control, and cooperate with the Commercial Contributor in, the defense and any related settlement negotiations. The Indemnified Contributor may participate in any such claim at its own expense. 
- -For example, a Contributor might include the Program in a commercial product offering, Product X. That Contributor is then a Commercial Contributor. If that Commercial Contributor then makes performance claims, or offers warranties related to Product X, those performance claims and warranties are such Commercial Contributor's responsibility alone. Under this section, the Commercial Contributor would have to defend claims against the other Contributors related to those performance claims and warranties, and if a court requires any other Contributor to pay any damages as a result, the Commercial Contributor must pay those damages. - -5. NO WARRANTY - -EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the appropriateness of using and distributing the Program and assumes all risks associated with its exercise of rights under this Agreement, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and unavailability or interruption of operations. - -6. DISCLAIMER OF LIABILITY - -EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - -7. 
GENERAL - -If any provision of this Agreement is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this Agreement, and without further action by the parties hereto, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. - -If Recipient institutes patent litigation against a Contributor with respect to a patent applicable to software (including a cross-claim or counterclaim in a lawsuit), then any patent licenses granted by that Contributor to such Recipient under this Agreement shall terminate as of the date such litigation is filed. In addition, if Recipient institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program itself (excluding combinations of the Program with other software or hardware) infringes such Recipient's patent(s), then such Recipient's rights granted under Section 2(b) shall terminate as of the date such litigation is filed. - -All Recipient's rights under this Agreement shall terminate if it fails to comply with any of the material terms or conditions of this Agreement and does not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If all Recipient's rights under this Agreement terminate, Recipient agrees to cease use and distribution of the Program as soon as reasonably practicable. However, Recipient's obligations under this Agreement and any licenses granted by Recipient relating to the Program shall continue and survive. - -Everyone is permitted to copy and distribute copies of this Agreement, but in order to avoid inconsistency the Agreement is copyrighted and may only be modified in the following manner. The Agreement Steward reserves the right to publish new versions (including revisions) of this Agreement from time to time. 
No one other than the Agreement Steward has the right to modify this Agreement. IBM is the initial Agreement Steward. IBM may assign the responsibility to serve as the Agreement Steward to a suitable separate entity. Each new version of the Agreement will be given a distinguishing version number. The Program (including Contributions) may always be distributed subject to the version of the Agreement under which it was received. In addition, after a new version of the Agreement is published, Contributor may elect to distribute the Program (including its Contributions) under the new version. Except as expressly stated in Sections 2(a) and 2(b) above, Recipient receives no rights or licenses to the intellectual property of any Contributor under this Agreement, whether expressly, by implication, estoppel or otherwise. All rights in the Program not expressly granted under this Agreement are reserved. - -This Agreement is governed by the laws of the State of New York and the intellectual property laws of the United States of America. No party to this Agreement will bring a legal action under this Agreement more than one year after the cause of action arose. Each party waives its rights to a jury trial in any resulting litigation. - - diff --git a/lib/thirdparty.LICENSE b/lib/thirdparty.LICENSE deleted file mode 100644 index 10c50fe..0000000 --- a/lib/thirdparty.LICENSE +++ /dev/null @@ -1,5 +0,0 @@ - -HPPC library integrates the following classes from other open-source projects: - -- fast BitSets from Apache Lucene (Apache license; same as HPPC). - diff --git a/licenses/commons-cli.LICENSE b/licenses/commons-cli.LICENSE new file mode 100644 index 0000000..57bc88a --- /dev/null +++ b/licenses/commons-cli.LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/licenses/commons-lang.LICENSE b/licenses/commons-lang.LICENSE new file mode 100644 index 0000000..57bc88a --- /dev/null +++ b/licenses/commons-lang.LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/licenses/hppc.LICENSE b/licenses/hppc.LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/licenses/hppc.LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/licenses/morfologik-polish.LICENSE b/licenses/morfologik-polish.LICENSE new file mode 100644 index 0000000..60b6e2e --- /dev/null +++ b/licenses/morfologik-polish.LICENSE @@ -0,0 +1,28 @@ +BSD-licensed dictionary of Polish (Morfologik) + +Copyright (c) 2013, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/morfologik-distribution/pom.xml b/morfologik-distribution/pom.xml new file mode 100644 index 0000000..d2c3035 --- /dev/null +++ b/morfologik-distribution/pom.xml @@ -0,0 +1,112 @@ + + + + + 4.0.0 + + + org.carrot2 + morfologik-parent + 1.9.0 + ../pom.xml + + + morfologik-distribution + pom + + Morfologik Distribution Package + Morfologik Distribution Package + + + true + + + + install + + + + + distribution + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/dependency + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + bin-assembly + package + + single + + + false + + src/main/assembly/bin.xml + + gnu + false + + + + + + + + + + + + org.carrot2 + morfologik-fsa + ${project.version} + + + + org.carrot2 + morfologik-stemming + ${project.version} + + + + org.carrot2 + morfologik-polish + ${project.version} + + + + org.carrot2 + morfologik-tools + ${project.version} + jar + + + + org.carrot2 + morfologik-tools + ${project.version} + jar + standalone + + + diff --git a/morfologik-distribution/src/main/assembly/bin.xml b/morfologik-distribution/src/main/assembly/bin.xml new file mode 100644 index 0000000..4adb8c5 --- /dev/null +++ b/morfologik-distribution/src/main/assembly/bin.xml @@ -0,0 +1,77 @@ + + + bin + + + dir + zip + + + + + + target/dependency + + *.jar + *.txt + + + *-standalone.jar + + lib + + + + + target/dependency + + 
*-standalone.jar + + + + + + + ../licenses + lib + + *.LICENSE + + + + + + ../ + + + morfologik.LICENSE + README + CHANGES + + 0644 + + + + + ../morfologik-fsa/target/apidocs + apidocs/morfologik-fsa + + + + ../morfologik-polish/target/apidocs + apidocs/morfologik-polish + + + + ../morfologik-stemming/target/apidocs + apidocs/morfologik-stemming + + + + ../morfologik-tools/target/apidocs + apidocs/morfologik-tools + + + diff --git a/morfologik-fsa/pom.xml b/morfologik-fsa/pom.xml new file mode 100644 index 0000000..80abf3b --- /dev/null +++ b/morfologik-fsa/pom.xml @@ -0,0 +1,38 @@ + + + + + 4.0.0 + + + org.carrot2 + morfologik-parent + 1.9.0 + ../pom.xml + + + morfologik-fsa + jar + + Morfologik FSA + Morfologik Finite State Automata (construction and traversal). + + + + com.carrotsearch + hppc + true + + + + junit + junit + test + + + + + install + + diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java new file mode 100644 index 0000000..695664d --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java @@ -0,0 +1,364 @@ +package morfologik.fsa; + +import static morfologik.fsa.FSAFlags.*; +import static morfologik.util.FileUtils.readFully; + +import java.io.*; +import java.util.*; + +/** + * CFSA (Compact Finite State Automaton) binary format implementation. This is a + * slightly reorganized version of {@link FSA5} offering smaller automata size + * at some (minor) performance penalty. + * + *

Note: Serialize to {@link CFSA2} for new code.

+ * + *

The encoding of automaton body is as follows.

+ * + *
+ * ---- FSA header (standard)
+ * Byte                            Description 
+ *       +-+-+-+-+-+-+-+-+\
+ *     0 | | | | | | | | | +------ '\'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     1 | | | | | | | | | +------ 'f'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     2 | | | | | | | | | +------ 's'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     3 | | | | | | | | | +------ 'a'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     4 | | | | | | | | | +------ version (fixed 0xc5)
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     5 | | | | | | | | | +------ filler character
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     6 | | | | | | | | | +------ annot character
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     7 |C|C|C|C|G|G|G|G| +------ C - node data size (ctl), G - address size (gotoLength)
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *  8-32 | | | | | | | | | +------ labels mapped for type (1) of arc encoding. 
+ *       : : : : : : : : : |
+ *       +-+-+-+-+-+-+-+-+/
+ * 
+ * ---- Start of a node; only if automaton was compiled with NUMBERS option.
+ * 
+ * Byte
+ *        +-+-+-+-+-+-+-+-+\
+ *      0 | | | | | | | | | \  LSB
+ *        +-+-+-+-+-+-+-+-+  +
+ *      1 | | | | | | | | |  |      number of strings recognized
+ *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
+ *        : : : : : : : : :  |      from this node.
+ *        +-+-+-+-+-+-+-+-+  +
+ *  ctl-1 | | | | | | | | | /  MSB
+ *        +-+-+-+-+-+-+-+-+/
+ *        
+ * ---- A vector of node's arcs. Conditional format, depending on flags.
+ * 
+ * 1) NEXT bit set, mapped arc label. 
+ * 
+ *                +--------------- arc's label mapped in M bits if M's field value > 0
+ *                | +------------- node pointed to is next
+ *                | | +----------- the last arc of the node
+ *         _______| | | +--------- the arc is final
+ *        /       | | | |
+ *       +-+-+-+-+-+-+-+-+\
+ *     0 |M|M|M|M|M|1|L|F| +------ flags + (M) index of the mapped label.
+ *       +-+-+-+-+-+-+-+-+/
+ * 
+ * 2) NEXT bit set, label separate.
+ * 
+ *                +--------------- arc's label stored separately (M's field is zero).
+ *                | +------------- node pointed to is next
+ *                | | +----------- the last arc of the node
+ *                | | | +--------- the arc is final
+ *                | | | |
+ *       +-+-+-+-+-+-+-+-+\
+ *     0 |0|0|0|0|0|1|L|F| +------ flags
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     1 | | | | | | | | | +------ label
+ *       +-+-+-+-+-+-+-+-+/
+ * 
+ * 3) NEXT bit not set. Full arc.
+ * 
+ *                  +------------- node pointed to is next
+ *                  | +----------- the last arc of the node
+ *                  | | +--------- the arc is final
+ *                  | | |
+ *       +-+-+-+-+-+-+-+-+\
+ *     0 |A|A|A|A|A|0|L|F| +------ flags + (A) address field, lower bits
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     1 | | | | | | | | | +------ label
+ *       +-+-+-+-+-+-+-+-+/
+ *       : : : : : : : : :       
+ *       +-+-+-+-+-+-+-+-+\
+ * gtl-1 |A|A|A|A|A|A|A|A| +------ address, continuation (MSB)
+ *       +-+-+-+-+-+-+-+-+/
+ * 
+ */ +public final class CFSA extends FSA { + /** + * Automaton header version value. + */ + public static final byte VERSION = (byte) 0xC5; + + /** + * Bitmask indicating that an arc corresponds to the last character of a + * sequence available when building the automaton. + */ + public static final int BIT_FINAL_ARC = 1 << 0; + + /** + * Bitmask indicating that an arc is the last one of the node's list and the + * following one belongs to another node. + */ + public static final int BIT_LAST_ARC = 1 << 1; + + /** + * Bitmask indicating that the target node of this arc follows it in the + * compressed automaton structure (no goto field). + */ + public static final int BIT_TARGET_NEXT = 1 << 2; + + /** + * An array of bytes with the internal representation of the automaton. + * Please see the documentation of this class for more information on how + * this structure is organized. + */ + public byte[] arcs; + + /** + * The length of the node header structure (if the automaton was compiled with + * NUMBERS option). Otherwise zero. + */ + public final int nodeDataLength; + + /** + * Flags for this automaton version. + */ + private final Set flags; + + /** + * Number of bytes each address takes in full, expanded form (goto length). + */ + public final int gtl; + + /** + * Label mapping for arcs of type (1) (see class documentation). The array + * is indexed by mapped label's value and contains the original label. + */ + public final byte[] labelMapping; + + /** + * Creates a new automaton, reading it from a file in FSA format, version 5. + */ + public CFSA(InputStream fsaStream) throws IOException { + // Read the header first. + final FSAHeader header = FSAHeader.read(fsaStream); + + // Ensure we have the correct version. + if (header.version != VERSION) { + throw new IOException("This class can read FSA version 5 only: " + header.version); + } + + /* + * Determine if the automaton was compiled with NUMBERS. If so, modify + * ctl and goto fields accordingly. 
+ */ + flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); + if ((header.gtl & 0xf0) != 0) { + this.nodeDataLength = (header.gtl >>> 4) & 0x0f; + this.gtl = header.gtl & 0x0f; + flags.add(NUMBERS); + } else { + this.nodeDataLength = 0; + this.gtl = header.gtl & 0x0f; + } + + /* + * Read mapping dictionary. + */ + labelMapping = new byte[1 << 5]; + readFully(fsaStream, labelMapping); + + /* + * Read arcs' data. + */ + arcs = readFully(fsaStream); + } + + /** + * Returns the start node of this automaton. May return 0 if + * the start node is also an end node. + */ + @Override + public int getRootNode() { + // Skip dummy node marking terminating state. + final int epsilonNode = skipArc(getFirstArc(0)); + + // And follow the epsilon node's first (and only) arc. + return getDestinationNodeOffset(getFirstArc(epsilonNode)); + } + + /** + * {@inheritDoc} + */ + @Override + public final int getFirstArc(int node) { + return nodeDataLength + node; + } + + /** + * {@inheritDoc} + */ + @Override + public final int getNextArc(int arc) { + if (isArcLast(arc)) + return 0; + else + return skipArc(arc); + } + + /** + * {@inheritDoc} + */ + @Override + public int getArc(int node, byte label) { + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + if (getArcLabel(arc) == label) + return arc; + } + + // An arc labeled with "label" not found. 
+ return 0; + } + + /** + * {@inheritDoc} + */ + @Override + public int getEndNode(int arc) { + final int nodeOffset = getDestinationNodeOffset(arc); + if (0 == nodeOffset) { + throw new RuntimeException("This is a terminal arc [" + arc + "]"); + } + return nodeOffset; + } + + /** + * {@inheritDoc} + */ + @Override + public byte getArcLabel(int arc) { + if (isNextSet(arc) && isLabelCompressed(arc)) { + return this.labelMapping[(arcs[arc] >>> 3) & 0x1f]; + } else { + return arcs[arc + 1]; + } + } + + /** + * {@inheritDoc} + */ + @Override + public int getRightLanguageCount(int node) { + assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS."; + return FSA5.decodeFromBytes(arcs, node, nodeDataLength); + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcFinal(int arc) { + return (arcs[arc] & BIT_FINAL_ARC) != 0; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcTerminal(int arc) { + return (0 == getDestinationNodeOffset(arc)); + } + + /** + * Returns true if this arc has NEXT bit set. + * + * @see #BIT_LAST_ARC + */ + public boolean isArcLast(int arc) { + return (arcs[arc] & BIT_LAST_ARC) != 0; + } + + /** + * @see #BIT_TARGET_NEXT + */ + public boolean isNextSet(int arc) { + return (arcs[arc] & BIT_TARGET_NEXT) != 0; + } + + /** + * Returns true if the label is compressed inside flags byte. + */ + public boolean isLabelCompressed(int arc) { + assert isNextSet(arc) : "Only applicable to arcs with NEXT bit."; + return (arcs[arc] & (-1 << 3)) != 0; + } + + /** + * {@inheritDoc} + * + *

For this automaton version, an additional {@link FSAFlags#NUMBERS} flag + * may be set to indicate the automaton contains extra fields for each node.

+ */ + public Set getFlags() { + return Collections.unmodifiableSet(flags); + } + + /** + * Returns the address of the node pointed to by this arc. + */ + final int getDestinationNodeOffset(int arc) { + if (isNextSet(arc)) { + /* The destination node follows this arc in the array. */ + return skipArc(arc); + } else { + /* + * The destination node address has to be extracted from the arc's + * goto field. + */ + int r = 0; + for (int i = gtl; --i >= 1;) { + r = r << 8 | (arcs[arc + 1 + i] & 0xff); + } + r = r << 8 | (arcs[arc] & 0xff); + return r >>> 3; + } + } + + /** + * Read the arc's layout and skip as many bytes, as needed, to skip it. + */ + private int skipArc(int offset) { + if (isNextSet(offset)) { + if (isLabelCompressed(offset)) { + offset++; + } else { + offset += 1 + 1; + } + } else { + offset += 1 + gtl; + } + return offset; + } +} \ No newline at end of file diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java new file mode 100644 index 0000000..6955da4 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java @@ -0,0 +1,404 @@ +package morfologik.fsa; + +import static morfologik.util.FileUtils.readFully; + +import java.io.IOException; +import java.io.InputStream; +import java.util.EnumSet; +import java.util.Set; + +import morfologik.util.FileUtils; + +/** + * CFSA (Compact Finite State Automaton) binary format implementation, version 2: + *
    + *
  • {@link #BIT_TARGET_NEXT} applicable on all arcs, not necessarily the last one.
  • + *
  • v-coded goto field
  • + *
  • v-coded perfect hashing numbers, if any
  • + *
  • 31 most frequent labels integrated with flags byte
  • + *
+ * + *

The encoding of automaton body is as follows.

+ * + *
+ * ---- CFSA header
+ * Byte                            Description 
+ *       +-+-+-+-+-+-+-+-+\
+ *     0 | | | | | | | | | +------ '\'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     1 | | | | | | | | | +------ 'f'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     2 | | | | | | | | | +------ 's'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     3 | | | | | | | | | +------ 'a'
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     4 | | | | | | | | | +------ version (fixed 0xc6)
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     5 | | | | | | | | | +----\
+ *       +-+-+-+-+-+-+-+-+/      \ flags [MSB first]
+ *       +-+-+-+-+-+-+-+-+\      /
+ *     6 | | | | | | | | | +----/
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     7 | | | | | | | | | +------ label lookup table size
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *  8-32 | | | | | | | | | +------ label value lookup table 
+ *       : : : : : : : : : |
+ *       +-+-+-+-+-+-+-+-+/
+ * 
+ * ---- Start of a node; only if automaton was compiled with NUMBERS option.
+ * 
+ * Byte
+ *        +-+-+-+-+-+-+-+-+\
+ *      0 | | | | | | | | | \  
+ *        +-+-+-+-+-+-+-+-+  +
+ *      1 | | | | | | | | |  |      number of strings recognized
+ *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
+ *        : : : : : : : : :  |      from this node. v-coding
+ *        +-+-+-+-+-+-+-+-+  +
+ *        | | | | | | | | | /  
+ *        +-+-+-+-+-+-+-+-+/
+ *
+ * ---- A vector of this node's arcs. An arc's layout depends on the combination of flags.
+ * 
+ * 1) NEXT bit set, mapped arc label. 
+ * 
+ *        +----------------------- node pointed to is next
+ *        | +--------------------- the last arc of the node
+ *        | | +------------------- this arc leads to a final state (acceptor)
+ *        | | |  _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows
+ *        | | | / | | | |
+ *       +-+-+-+-+-+-+-+-+\
+ *     0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label.
+ *       +-+-+-+-+-+-+-+-+/
+ *       +-+-+-+-+-+-+-+-+\
+ *     1 | | | | | | | | | +------ optional label if M == 0
+ *       +-+-+-+-+-+-+-+-+/
+ *       : : : : : : : : :
+ *       +-+-+-+-+-+-+-+-+\
+ *       |A|A|A|A|A|A|A|A| +------ v-coded goto address
+ *       +-+-+-+-+-+-+-+-+/
+ * 
+ */ +public final class CFSA2 extends FSA { + /** + * Automaton header version value. + */ + public static final byte VERSION = (byte) 0xc6; + + /** + * The target node of this arc follows the last arc of the current state + * (no goto field). + */ + public static final int BIT_TARGET_NEXT = 1 << 7; + + /** + * The arc is the last one from the current node's arcs list. + */ + public static final int BIT_LAST_ARC = 1 << 6; + + /** + * The arc corresponds to the last character of a sequence + * available when building the automaton (acceptor transition). + */ + public static final int BIT_FINAL_ARC = 1 << 5; + + /** + * The count of bits assigned to storing an indexed label. + */ + static final int LABEL_INDEX_BITS = 5; + + /** + * Masks only the M bits of a flag byte. + */ + static final int LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1; + + /** + * Maximum size of the labels index. + */ + static final int LABEL_INDEX_SIZE = (1 << LABEL_INDEX_BITS) - 1; + + /** + * An array of bytes with the internal representation of the automaton. + * Please see the documentation of this class for more information on how + * this structure is organized. + */ + public byte[] arcs; + + /** + * Flags for this automaton version. + */ + private final EnumSet flags; + + /** + * Label mapping for M-indexed labels. + */ + public final byte[] labelMapping; + + /** + * If true states are prepended with numbers. + */ + private final boolean hasNumbers; + + /** + * Epsilon node's offset. + */ + private final int epsilon = 0; + + /** + * Reads an automaton from a byte stream. + */ + public CFSA2(InputStream in) throws IOException { + // Read the header first. + if (FSAHeader.FSA_MAGIC != FileUtils.readInt(in)) + throw new IOException("Invalid file header magic bytes."); + + // Ensure we have the correct version. + final int version = FileUtils.readByte(in); + if (version != VERSION) { + throw new IOException("This class can only read FSA version: " + VERSION); + } + + // Read flags. 
+ short flagBits = FileUtils.readShort(in); + flags = EnumSet.noneOf(FSAFlags.class); + for (FSAFlags f : FSAFlags.values()) { + if (FSAFlags.isSet(flagBits, f)) + flags.add(f); + } + + if (flagBits != FSAFlags.asShort(flags)) + throw new IOException("Unrecognized flags remained: 0x" + Integer.toHexString(flagBits)); + + this.hasNumbers = flags.contains(FSAFlags.NUMBERS); + + /* + * Read mapping dictionary. + */ + int labelMappingSize = FileUtils.readByte(in) & 0xff; + labelMapping = new byte[labelMappingSize]; + readFully(in, labelMapping); + + /* + * Read arcs' data. + */ + arcs = readFully(in); + } + + /** + * {@inheritDoc} + */ + @Override + public int getRootNode() { + // Skip dummy node marking terminating state. + return getDestinationNodeOffset(getFirstArc(epsilon)); + } + + /** + * {@inheritDoc} + */ + @Override + public final int getFirstArc(int node) { + if (hasNumbers) { + return skipVInt(node); + } else { + return node; + } + } + + /** + * {@inheritDoc} + */ + @Override + public final int getNextArc(int arc) { + if (isArcLast(arc)) + return 0; + else + return skipArc(arc); + } + + /** + * {@inheritDoc} + */ + @Override + public int getArc(int node, byte label) { + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + if (getArcLabel(arc) == label) + return arc; + } + + // An arc labeled with "label" not found. 
+ return 0; + } + + /** + * {@inheritDoc} + */ + @Override + public int getEndNode(int arc) { + final int nodeOffset = getDestinationNodeOffset(arc); + assert nodeOffset != 0 : "Can't follow a terminal arc: " + arc; + assert nodeOffset < arcs.length : "Node out of bounds."; + return nodeOffset; + } + + /** + * {@inheritDoc} + */ + @Override + public byte getArcLabel(int arc) { + int index = arcs[arc] & LABEL_INDEX_MASK; + if (index > 0) { + return this.labelMapping[index]; + } else { + return arcs[arc + 1]; + } + } + + /** + * {@inheritDoc} + */ + @Override + public int getRightLanguageCount(int node) { + assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS."; + return readVInt(arcs, node); + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcFinal(int arc) { + return (arcs[arc] & BIT_FINAL_ARC) != 0; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcTerminal(int arc) { + return (0 == getDestinationNodeOffset(arc)); + } + + /** + * Returns true if this arc has NEXT bit set. + * + * @see #BIT_LAST_ARC + */ + public boolean isArcLast(int arc) { + return (arcs[arc] & BIT_LAST_ARC) != 0; + } + + /** + * @see #BIT_TARGET_NEXT + */ + public boolean isNextSet(int arc) { + return (arcs[arc] & BIT_TARGET_NEXT) != 0; + } + + /** + * {@inheritDoc} + */ + public Set getFlags() { + return flags; + } + + /** + * Returns the address of the node pointed to by this arc. + */ + final int getDestinationNodeOffset(int arc) { + if (isNextSet(arc)) { + /* Follow until the last arc of this state. */ + while (!isArcLast(arc)) { + arc = getNextArc(arc); + } + + /* And return the byte right after it. */ + return skipArc(arc); + } else { + /* + * The destination node address is v-coded. v-code starts either + * at the next byte (label indexed) or after the next byte (label explicit). + */ + return readVInt(arcs, arc + ((arcs[arc] & LABEL_INDEX_MASK) == 0 ? 
2 : 1)); + } + } + + /** + * Read the arc's layout and skip as many bytes, as needed, to skip it. + */ + private int skipArc(int offset) { + int flag = arcs[offset++]; + + // Explicit label? + if ((flag & LABEL_INDEX_MASK) == 0) { + offset++; + } + + // Explicit goto? + if ((flag & BIT_TARGET_NEXT) == 0) { + offset = skipVInt(offset); + } + + assert offset < this.arcs.length; + return offset; + } + + /** + * Read a v-int. + */ + static int readVInt(byte[] array, int offset) { + byte b = array[offset]; + int value = b & 0x7F; + + for (int shift = 7; b < 0; shift += 7) { + b = array[++offset]; + value |= (b & 0x7F) << shift; + } + + return value; + } + + /** + * Write a v-int to a byte array. + */ + static int writeVInt(byte[] array, int offset, int value) { + assert value >= 0 : "Can't v-code negative ints."; + + while (value > 0x7F) { + array[offset++] = (byte) (0x80 | (value & 0x7F)); + value >>= 7; + } + array[offset++] = (byte) value; + + return offset; + } + + /** + * Return the byte-length of a v-coded int. + */ + static int vIntLength(int value) { + assert value >= 0 : "Can't v-code negative ints."; + + int bytes; + for (bytes = 1; value >= 0x80; bytes++) { + value >>= 7; + } + + return bytes; + } + + /** + * Skip a v-int. 
+ */ + private int skipVInt(int offset) { + while (arcs[offset++] < 0); + return offset; + } +} \ No newline at end of file diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java new file mode 100644 index 0000000..8026f33 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/CFSA2Serializer.java @@ -0,0 +1,543 @@ +package morfologik.fsa; + +import static morfologik.fsa.CFSA2.BIT_FINAL_ARC; +import static morfologik.fsa.CFSA2.BIT_LAST_ARC; +import static morfologik.fsa.CFSA2.BIT_TARGET_NEXT; +import static morfologik.fsa.FSAFlags.FLEXIBLE; +import static morfologik.fsa.FSAFlags.NEXTBIT; +import static morfologik.fsa.FSAFlags.NUMBERS; +import static morfologik.fsa.FSAFlags.STOPBIT; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayDeque; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.TreeSet; + +import morfologik.fsa.FSAUtils.IntIntHolder; +import morfologik.util.FileUtils; + +import com.carrotsearch.hppc.BitSet; +import com.carrotsearch.hppc.BoundedProportionalArraySizingStrategy; +import com.carrotsearch.hppc.IntArrayList; +import com.carrotsearch.hppc.IntIntOpenHashMap; +import com.carrotsearch.hppc.IntStack; +import com.carrotsearch.hppc.cursors.IntCursor; +import com.carrotsearch.hppc.cursors.IntIntCursor; + +/** + * Serializes in-memory {@link FSA} graphs to {@link CFSA2}. + * + *

+ * It is possible to serialize the automaton with numbers required for perfect + * hashing. See {@link #withNumbers()} method. + *

+ * + * @see CFSA2 + * @see FSA#read(java.io.InputStream) + */ +public final class CFSA2Serializer implements FSASerializer { + /** + * Supported flags. + */ + private final static EnumSet flags = EnumSet.of(NUMBERS, FLEXIBLE, STOPBIT, NEXTBIT); + + /** + * No-state id. + */ + private final static int NO_STATE = -1; + + /** + * true if we should serialize with numbers. + * + * @see #withNumbers() + */ + private boolean withNumbers; + + /** + * A hash map of [state, offset] pairs. + */ + private IntIntOpenHashMap offsets = new IntIntOpenHashMap(); + + /** + * A hash map of [state, right-language-count] pairs. + */ + private IntIntOpenHashMap numbers = new IntIntOpenHashMap(); + + /** + * Scratch array for serializing vints. + */ + private final byte [] scratch = new byte [5]; + + /** + * The most frequent labels for integrating with the flags field. + */ + private byte [] labelsIndex; + + /** + * Inverted index of labels to be integrated with flags field. A label + * at index i has the index or zero (no integration). + */ + private int [] labelsInvIndex; + + /** + * Logger for progress. + */ + private IMessageLogger logger = new NullMessageLogger(); + + /** + * Serialize the automaton with the number of right-language sequences in + * each node. This is required to implement perfect hashing. The numbering + * also preserves the order of input sequences. + * + * @return Returns the same object for easier call chaining. + */ + public CFSA2Serializer withNumbers() { + withNumbers = true; + return this; + } + + /** + * Serializes any {@link FSA} to {@link CFSA2} stream. + * + * @see #withNumbers + * @return Returns os for chaining. + */ + @Override + public T serialize(final FSA fsa, T os) throws IOException { + /* + * Calculate the most frequent labels and build indexed labels dictionary. + */ + computeLabelsIndex(fsa); + + /* + * Calculate the number of bytes required for the node data, if + * serializing with numbers. 
+ */ + if (withNumbers) { + this.numbers = FSAUtils.rightLanguageForAllStates(fsa); + } + + /* + * Linearize all the states, optimizing their layout. + */ + IntArrayList linearized = linearize(fsa); + + /* + * Emit the header. + */ + FileUtils.writeInt(os, FSAHeader.FSA_MAGIC); + os.write(CFSA2.VERSION); + + EnumSet fsaFlags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); + if (withNumbers) fsaFlags.add(NUMBERS); + FileUtils.writeShort(os, FSAFlags.asShort(fsaFlags)); + + /* + * Emit labels index. + */ + os.write(labelsIndex.length); + os.write(labelsIndex); + + /* + * Emit the automaton. + */ + int size = emitNodes(fsa, os, linearized); + assert size == 0 : "Size changed in the final pass?"; + + return os; + } + + /** + * Compute a set of labels to be integrated with the flags field. + */ + private void computeLabelsIndex(final FSA fsa) { + // Compute labels count. + final int [] countByValue = new int [256]; + + fsa.visitAllStates(new StateVisitor() { + public boolean accept(int state) { + for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) + countByValue[fsa.getArcLabel(arc) & 0xff]++; + return true; + } + }); + + // Order by descending frequency of counts and increasing label value. 
+ Comparator comparator = new Comparator() { + public int compare(IntIntHolder o1, IntIntHolder o2) { + int countDiff = o2.b - o1.b; + if (countDiff == 0) { + countDiff = o1.a - o2.a; + } + return countDiff; + } + }; + + TreeSet labelAndCount = new TreeSet(comparator); + for (int label = 0; label < countByValue.length; label++) { + if (countByValue[label] > 0) { + labelAndCount.add(new IntIntHolder(label, countByValue[label])); + } + } + + this.logger.startPart("Label distribution"); + for (IntIntHolder c : labelAndCount) { + this.logger.log("0x" + Integer.toHexString(c.a), c.b); + } + this.logger.endPart(); + + labelsIndex = new byte [1 + Math.min(labelAndCount.size(), CFSA2.LABEL_INDEX_SIZE)]; + labelsInvIndex = new int [256]; + for (int i = labelsIndex.length - 1; i > 0 && !labelAndCount.isEmpty(); i--) { + IntIntHolder p = labelAndCount.first(); + labelAndCount.remove(p); + labelsInvIndex[p.a] = i; + labelsIndex[i] = (byte) p.a; + } + } + + /** + * Return supported flags. + */ + @Override + public Set getFlags() { + return flags; + } + + /** + * Linearization of states. + */ + private IntArrayList linearize(final FSA fsa) throws IOException { + /* + * Compute the states with most inlinks. These should be placed as close to the + * start of the automaton, as possible so that v-coded addresses are tiny. + */ + final IntIntOpenHashMap inlinkCount = computeInlinkCount(fsa); + + /* + * An array of ordered states for serialization. + */ + final IntArrayList linearized = new IntArrayList(0, + new BoundedProportionalArraySizingStrategy(1000, 10000, 1.5f)); + + /* + * Determine which states should be linearized first (at fixed positions) so as to + * minimize the place occupied by goto fields. 
+ */ + int maxStates = Integer.MAX_VALUE; + int minInlinkCount = 2; + ArrayDeque statesQueue = computeFirstStates(inlinkCount, maxStates, minInlinkCount); + IntArrayList states = new IntArrayList(); + while (!statesQueue.isEmpty()) + states.add(statesQueue.pop()); + + /* + * Compute initial addresses, without node rearrangements. + */ + int serializedSize = linearizeAndCalculateOffsets(fsa, new IntArrayList(), linearized, offsets); + + /* + * Probe for better node arrangements by selecting between [lower, upper] + * nodes from the potential candidate nodes list. + */ + IntArrayList sublist = new IntArrayList(); + sublist.buffer = states.buffer; + sublist.elementsCount = states.elementsCount; + + /* + * Probe the initial region a little bit, looking for optimal cut. It can't be binary search + * because the result isn't monotonic. + */ + logger.startPart("Compacting"); + logger.log("Initial output size", serializedSize); + int cutAt = 0; + for (int cut = Math.min(25, states.size()); cut <= Math.min(150, states.size()); cut += 25) { + sublist.elementsCount = cut; + int newSize = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets); + logger.log("Moved " + sublist.size() + " states, output size", newSize); + if (newSize >= serializedSize) { + break; + } + cutAt = cut; + } + + /* + * Cut at the calculated point and repeat linearization. + */ + sublist.elementsCount = cutAt; + int size = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets); + + logger.log("Will move " + sublist.size() + " states, final size", size); + logger.endPart(); + + return linearized; + } + + /** + * Linearize all states, putting states in front of the automaton and + * calculating stable state offsets. 
+ */ + private int linearizeAndCalculateOffsets(FSA fsa, IntArrayList states, + IntArrayList linearized, IntIntOpenHashMap offsets) throws IOException + { + final BitSet visited = new BitSet(); + final IntStack nodes = new IntStack(); + linearized.clear(); + + /* + * Linearize states with most inlinks first. + */ + for (int i = 0; i < states.size(); i++) { + linearizeState(fsa, nodes, linearized, visited, states.get(i)); + } + + /* + * Linearize the remaining states by chaining them one after another, in depth-order. + */ + nodes.push(fsa.getRootNode()); + while (!nodes.isEmpty()) { + final int node = nodes.pop(); + if (visited.get(node)) + continue; + + linearizeState(fsa, nodes, linearized, visited, node); + } + + /* + * Calculate new state offsets. This is iterative. We start with + * maximum potential offsets and recalculate until converged. + */ + int MAX_OFFSET = Integer.MAX_VALUE; + for (IntCursor c : linearized) { + offsets.put(c.value, MAX_OFFSET); + } + + int i, j = 0; + while ((i = emitNodes(fsa, null, linearized)) > 0) { + j = i; + } + return j; + } + + /** + * Add a state to linearized list. + */ + private void linearizeState(final FSA fsa, + IntStack nodes, + IntArrayList linearized, + BitSet visited, int node) + { + linearized.add(node); + visited.set(node); + for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { + if (!fsa.isArcTerminal(arc)) { + final int target = fsa.getEndNode(arc); + if (!visited.get(target)) + nodes.push(target); + } + } + } + + /** + * Compute the set of states that should be linearized first to minimize other + * states goto length. + */ + private ArrayDeque computeFirstStates(IntIntOpenHashMap inlinkCount, + int maxStates, + int minInlinkCount) + { + Comparator comparator = new Comparator() { + public int compare(IntIntHolder o1, IntIntHolder o2) { + int v = o1.a - o2.a; + return v == 0 ? 
(o1.b - o2.b) : v; + } + }; + + PriorityQueue stateInlink = new PriorityQueue(1, comparator); + IntIntHolder scratch = new IntIntHolder(); + for (IntIntCursor c : inlinkCount) { + if (c.value > minInlinkCount) { + scratch.a = c.value; + scratch.b = c.key; + + if (stateInlink.size() < maxStates || comparator.compare(scratch, stateInlink.peek()) > 0) { + stateInlink.add(new IntIntHolder(c.value, c.key)); + if (stateInlink.size() > maxStates) stateInlink.remove(); + } + } + } + + ArrayDeque states = new ArrayDeque(); + while (!stateInlink.isEmpty()) { + IntIntHolder i = stateInlink.remove(); + states.addFirst(i.b); + } + return states; + } + + /** + * Compute in-link count for each state. + */ + private IntIntOpenHashMap computeInlinkCount(final FSA fsa) { + IntIntOpenHashMap inlinkCount = new IntIntOpenHashMap(); + BitSet visited = new BitSet(); + IntStack nodes = new IntStack(); + nodes.push(fsa.getRootNode()); + + while (!nodes.isEmpty()) { + final int node = nodes.pop(); + if (visited.get(node)) + continue; + + visited.set(node); + + for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { + if (!fsa.isArcTerminal(arc)) { + final int target = fsa.getEndNode(arc); + inlinkCount.putOrAdd(target, 1, 1); + if (!visited.get(target)) + nodes.push(target); + } + } + } + + return inlinkCount; + } + + /** + * Update arc offsets assuming the given goto length. + */ + private int emitNodes(FSA fsa, OutputStream os, IntArrayList linearized) throws IOException { + int offset = 0; + + // Add epsilon state. + offset += emitNodeData(os, 0); + if (fsa.getRootNode() != 0) + offset += emitArc(os, BIT_LAST_ARC, (byte) '^', offsets.get(fsa.getRootNode())); + else + offset += emitArc(os, BIT_LAST_ARC, (byte) '^', 0); + + boolean offsetsChanged = false; + final int max = linearized.size(); + for (IntCursor c : linearized) { + final int state = c.value; + final int nextState = c.index + 1 < max ? 
linearized.get(c.index + 1) : NO_STATE; + + if (os == null) { + offsetsChanged |= (offsets.get(state) != offset); + offsets.put(state, offset); + } else { + assert offsets.get(state) == offset : state + " " + offsets.get(state) + " " + offset; + } + + offset += emitNodeData(os, withNumbers ? numbers.get(state) : 0); + offset += emitNodeArcs(fsa, os, state, nextState); + } + + return offsetsChanged ? offset : 0; + } + + /** + * Emit all arcs of a single node. + */ + private int emitNodeArcs(FSA fsa, OutputStream os, + final int state, final int nextState) throws IOException { + int offset = 0; + for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { + int targetOffset; + final int target; + + if (fsa.isArcTerminal(arc)) { + target = 0; + targetOffset = 0; + } else { + target = fsa.getEndNode(arc); + targetOffset = offsets.get(target); + } + + int flags = 0; + + if (fsa.isArcFinal(arc)) { + flags |= BIT_FINAL_ARC; + } + + if (fsa.getNextArc(arc) == 0) { + flags |= BIT_LAST_ARC; + } + + if (targetOffset != 0 && target == nextState) { + flags |= BIT_TARGET_NEXT; + targetOffset = 0; + } + + offset += emitArc(os, flags, fsa.getArcLabel(arc), targetOffset); + } + + return offset; + } + + /** */ + private int emitArc(OutputStream os, int flags, byte label, int targetOffset) + throws IOException + { + int length = 0; + + int labelIndex = labelsInvIndex[label & 0xff]; + if (labelIndex > 0) { + if (os != null) os.write(flags | labelIndex); + length++; + } else { + if (os != null) { + os.write(flags); + os.write(label); + } + length += 2; + } + + if ((flags & BIT_TARGET_NEXT) == 0) { + int len = CFSA2.writeVInt(scratch, 0, targetOffset); + if (os != null) { + os.write(scratch, 0, len); + } + length += len; + } + + return length; + } + + /** */ + private int emitNodeData(OutputStream os, int number) throws IOException { + int size = 0; + + if (withNumbers) { + size = CFSA2.writeVInt(scratch, 0, number); + if (os != null) { + os.write(scratch, 0, size); + 
} + } + + return size; + } + + /** */ + @Override + public CFSA2Serializer withFiller(byte filler) { + throw new UnsupportedOperationException("CFSA2 does not support filler. Use .info file."); + } + + /** */ + @Override + public CFSA2Serializer withAnnotationSeparator(byte annotationSeparator) { + throw new UnsupportedOperationException("CFSA2 does not support separator. Use .info file."); + } + + @Override + public CFSA2Serializer withLogger(IMessageLogger logger) { + this.logger = logger; + return this; + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java b/morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java new file mode 100644 index 0000000..2f6d412 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java @@ -0,0 +1,134 @@ +package morfologik.fsa; + +import java.util.Collections; +import java.util.Set; + +/** + * An FSA with constant-size arc representation produced directly + * by {@link FSABuilder}. + * + * @see FSABuilder + */ +public final class ConstantArcSizeFSA extends FSA { + /** Size of the target address field (constant for the builder). */ + public final static int TARGET_ADDRESS_SIZE = 4; + + /** Size of the flags field (constant for the builder). */ + public final static int FLAGS_SIZE = 1; + + /** Size of the label field (constant for the builder). */ + public final static int LABEL_SIZE = 1; + + /** + * Size of a single arc structure. + */ + public final static int ARC_SIZE = FLAGS_SIZE + LABEL_SIZE + TARGET_ADDRESS_SIZE; + + /** Offset of the flags field inside an arc. */ + public final static int FLAGS_OFFSET = 0; + + /** Offset of the label field inside an arc. */ + public final static int LABEL_OFFSET = FLAGS_SIZE; + + /** Offset of the address field inside an arc. */ + public final static int ADDRESS_OFFSET = LABEL_OFFSET + LABEL_SIZE; + + /** A dummy address of the terminal state. 
*/ + final static int TERMINAL_STATE = 0; + + /** + * An arc flag indicating the target node of an arc corresponds to a final + * state. + */ + public final static int BIT_ARC_FINAL = 1 << 1; + + /** An arc flag indicating the arc is last within its state. */ + public final static int BIT_ARC_LAST = 1 << 0; + + /** + * An epsilon state. The first and only arc of this state points either + * to the root or to the terminal state, indicating an empty automaton. + */ + private final int epsilon; + + /** + * FSA data, serialized as a byte array. + */ + private final byte[] data; + + /** + * @param data FSA data. There must be no trailing bytes after the last state. + */ + ConstantArcSizeFSA(byte[] data, int epsilon) { + assert epsilon == 0 : "Epsilon is not zero?"; + + this.epsilon = epsilon; + this.data = data; + } + + @Override + public int getRootNode() { + return getEndNode(getFirstArc(epsilon)); + } + + @Override + public int getFirstArc(int node) { + return node; + } + + @Override + public int getArc(int node, byte label) { + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + if (getArcLabel(arc) == label) + return arc; + } + return 0; + } + + @Override + public int getNextArc(int arc) { + if (isArcLast(arc)) + return 0; + return arc + ARC_SIZE; + } + + @Override + public byte getArcLabel(int arc) { + return data[arc + LABEL_OFFSET]; + } + + /** + * Fills the target state address of an arc. 
+ */ + private int getArcTarget(int arc) { + arc += ADDRESS_OFFSET; + return (data[arc]) << 24 | + (data[arc + 1] & 0xff) << 16 | + (data[arc + 2] & 0xff) << 8 | + (data[arc + 3] & 0xff); + } + + @Override + public boolean isArcFinal(int arc) { + return (data[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0; + } + + @Override + public boolean isArcTerminal(int arc) { + return getArcTarget(arc) == 0; + } + + private boolean isArcLast(int arc) { + return (data[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0; + } + + @Override + public int getEndNode(int arc) { + return getArcTarget(arc); + } + + @Override + public Set getFlags() { + return Collections.emptySet(); + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSA.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSA.java new file mode 100644 index 0000000..d734b95 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSA.java @@ -0,0 +1,286 @@ +package morfologik.fsa; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.util.BitSet; +import java.util.Collections; +import java.util.Iterator; +import java.util.Set; + +/** + * This is a top abstract class for handling finite state automata. These + * automata are arc-based, a design described in Jan Daciuk's Incremental + * Construction of Finite-State Automata and Transducers, and Their Use in the + * Natural Language Processing (PhD thesis, Technical University of Gdansk). + * + *

+ * Concrete subclasses (implementations) provide varying tradeoffs and features: + * traversal speed vs. memory size, for example. + *

+ * + * @see FSABuilder + */ +public abstract class FSA implements Iterable { + /** + * @return Returns the identifier of the root node of this automaton. + * Returns 0 if the start node is also the end node (the automaton + * is empty). + */ + public abstract int getRootNode(); + + /** + * @return Returns the identifier of the first arc leaving node + * or 0 if the node has no outgoing arcs. + */ + public abstract int getFirstArc(int node); + + /** + * @return Returns the identifier of the next arc after arc and + * leaving node. Zero is returned if no more arcs are + * available for the node. + */ + public abstract int getNextArc(int arc); + + /** + * @return Returns the identifier of an arc leaving node and + * labeled with label. An identifier equal to 0 means + * the node has no outgoing arc labeled label. + */ + public abstract int getArc(int node, byte label); + + /** + * Return the label associated with a given arc. + */ + public abstract byte getArcLabel(int arc); + + /** + * Returns true if the destination node at the end of this + * arc corresponds to an input sequence created when building + * this automaton. + */ + public abstract boolean isArcFinal(int arc); + + /** + * Returns true if this arc does not have a + * terminating node (@link {@link #getEndNode(int)} will throw an + * exception). Implies {@link #isArcFinal(int)}. + */ + public abstract boolean isArcTerminal(int arc); + + /** + * Return the end node pointed to by a given arc. Terminal arcs + * (those that point to a terminal state) have no end node representation + * and throw a runtime exception. + */ + public abstract int getEndNode(int arc); + + /** + * Returns a set of flags for this FSA instance. + */ + public abstract Set getFlags(); + + /** + * Calculates the number of arcs of a given node. Unless really required, + * use the following idiom for looping through all arcs: + *
+     * for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
+     * }
+     * 
+ */ + public int getArcCount(int node) { + int count = 0; + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + count++; + } + return count; + } + + /** + * @return Returns the number of sequences reachable from the given state if + * the automaton was compiled with {@link FSAFlags#NUMBERS}. The size of + * the right language of the state, in other words. + * + * @throws UnsupportedOperationException If the automaton was not compiled with + * {@link FSAFlags#NUMBERS}. The value can then be computed by manual count + * of {@link #getSequences(int)}. + */ + public int getRightLanguageCount(int node) { + throw new UnsupportedOperationException("Automaton not compiled with " + FSAFlags.NUMBERS); + } + + /** + * Returns an iterator over all binary sequences starting at the given FSA + * state (node) and ending in final nodes. This corresponds to a set of + * suffixes of a given prefix from all sequences stored in the automaton. + * + *

+ * The returned iterator is a {@link ByteBuffer} whose contents changes on
+ * each call to {@link Iterator#next()}. To keep the contents between calls
+ * to {@link Iterator#next()}, one must copy the buffer to some other
+ * location.
+

+ * + *

+ * Important. It is guaranteed that the returned byte buffer is + * backed by a byte array and that the content of the byte buffer starts at + * the array's index 0. + *

+ * + * @see Iterable + */ + public Iterable getSequences(final int node) { + if (node == 0) { + return Collections. emptyList(); + } + + return new Iterable() { + public Iterator iterator() { + return new FSAFinalStatesIterator(FSA.this, node); + } + }; + } + + /** + * An alias of calling {@link #iterator} directly ({@link FSA} is also + * {@link Iterable}). + */ + public final Iterable getSequences() { + return getSequences(getRootNode()); + } + + /** + * Returns an iterator over all binary sequences starting from the initial + * FSA state (node) and ending in final nodes. The returned iterator is a + * {@link ByteBuffer} whose contents changes on each call to + * {@link Iterator#next()}. The keep the contents between calls to + * {@link Iterator#next()}, one must copy the buffer to some other location. + * + *

+ * Important. It is guaranteed that the returned byte buffer is + * backed by a byte array and that the content of the byte buffer starts at + * the array's index 0. + *

+ * + * @see Iterable + */ + public final Iterator iterator() { + return getSequences().iterator(); + } + + /** + * Visit all states. The order of visiting is undefined. This method may be faster + * than traversing the automaton in post or preorder since it can scan states + * linearly. Returning false from {@link StateVisitor#accept(int)} + * immediately terminates the traversal. + */ + public T visitAllStates(T v) { + return visitInPostOrder(v); + } + + /** + * Same as {@link #visitInPostOrder(StateVisitor, int)}, + * starting from root automaton node. + */ + public T visitInPostOrder(T v) { + return visitInPostOrder(v, getRootNode()); + } + + /** + * Visits all states reachable from node in postorder. + * Returning false from {@link StateVisitor#accept(int)} + * immediately terminates the traversal. + */ + public T visitInPostOrder(T v, int node) { + visitInPostOrder(v, node, new BitSet()); + return v; + } + + /** Private recursion. */ + private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) { + if (visited.get(node)) + return true; + visited.set(node); + + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + if (!isArcTerminal(arc)) { + if (!visitInPostOrder(v, getEndNode(arc), visited)) + return false; + } + } + + return v.accept(node); + } + + /** + * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from root automaton node. + */ + public T visitInPreOrder(T v) { + return visitInPreOrder(v, getRootNode()); + } + + /** + * Visits all states in preorder. Returning false from {@link StateVisitor#accept(int)} + * skips traversal of all sub-states of a given state. + */ + public T visitInPreOrder(T v, int node) { + visitInPreOrder(v, node, new BitSet()); + return v; + } + + /** Private recursion. 
*/ + private void visitInPreOrder(StateVisitor v, int node, BitSet visited) { + if (visited.get(node)) + return; + visited.set(node); + + if (v.accept(node)) { + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + if (!isArcTerminal(arc)) { + visitInPreOrder(v, getEndNode(arc), visited); + } + } + } + } + + /** + * A factory for reading automata in any of the supported versions. If + * possible, explicit constructors should be used. + * + * @see FSA5#FSA5(InputStream) + */ + @SuppressWarnings("unchecked") + public static T read(InputStream in) throws IOException { + if (!in.markSupported()) { + in = new BufferedInputStream(in, Math.max(FSAHeader.MAX_HEADER_LENGTH + 1, 1024)); + } + + in.mark(FSAHeader.MAX_HEADER_LENGTH); + FSAHeader header = FSAHeader.read(in); + in.reset(); + + if (header.version == FSA5.VERSION) + return (T) new FSA5(in); + + if (header.version == CFSA.VERSION) + return (T) new CFSA(in); + + if (header.version == CFSA2.VERSION) + return (T) new CFSA2(in); + + throw new IOException("Unsupported automaton version: " + + header.version); + } + + public static FSA read(File fsa) throws IOException { + InputStream is = new BufferedInputStream(new FileInputStream(fsa)); + try { + return read(is); + } finally { + is.close(); + } + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java new file mode 100644 index 0000000..d43f4d8 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java @@ -0,0 +1,323 @@ +package morfologik.fsa; + +import static morfologik.fsa.FSAFlags.FLEXIBLE; +import static morfologik.fsa.FSAFlags.NEXTBIT; +import static morfologik.fsa.FSAFlags.NUMBERS; +import static morfologik.fsa.FSAFlags.STOPBIT; +import static morfologik.util.FileUtils.readFully; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.EnumSet; +import java.util.Set; + +/** + * FSA binary format 
implementation for version 5. + * + *

+ * Version 5 indicates the dictionary was built with these flags: + * {@link FSAFlags#FLEXIBLE}, {@link FSAFlags#STOPBIT} and + * {@link FSAFlags#NEXTBIT}. The internal representation of the FSA must + * therefore follow this description (please note this format describes only a + * single transition (arc), not the entire dictionary file). + * + *

+ * ---- this node header present only if automaton was compiled with NUMBERS option.
+ * Byte
+ *        +-+-+-+-+-+-+-+-+\
+ *      0 | | | | | | | | | \  LSB
+ *        +-+-+-+-+-+-+-+-+  +
+ *      1 | | | | | | | | |  |      number of strings recognized
+ *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
+ *        : : : : : : : : :  |      from this node.
+ *        +-+-+-+-+-+-+-+-+  +
+ *  ctl-1 | | | | | | | | | /  MSB
+ *        +-+-+-+-+-+-+-+-+/
+ *        
+ * ---- remaining part of the node
+ * 
+ * Byte
+ *       +-+-+-+-+-+-+-+-+\
+ *     0 | | | | | | | | | +------ label
+ *       +-+-+-+-+-+-+-+-+/
+ * 
+ *                  +------------- node pointed to is next
+ *                  | +----------- the last arc of the node
+ *                  | | +--------- the arc is final
+ *                  | | |
+ *             +-----------+
+ *             |    | | |  |
+ *         ___+___  | | |  |
+ *        /       \ | | |  |
+ *       MSB           LSB |
+ *        7 6 5 4 3 2 1 0  |
+ *       +-+-+-+-+-+-+-+-+ |
+ *     1 | | | | | | | | | \ \
+ *       +-+-+-+-+-+-+-+-+  \ \  LSB
+ *       +-+-+-+-+-+-+-+-+     +
+ *     2 | | | | | | | | |     |
+ *       +-+-+-+-+-+-+-+-+     |
+ *     3 | | | | | | | | |     +----- target node address (in bytes)
+ *       +-+-+-+-+-+-+-+-+     |      (not present except for the byte
+ *       : : : : : : : : :     |       with flags if the node pointed to
+ *       +-+-+-+-+-+-+-+-+     +       is next)
+ *   gtl | | | | | | | | |    /  MSB
+ *       +-+-+-+-+-+-+-+-+   /
+ * gtl+1                           (gtl = gotoLength)
+ * 
+ */ +public final class FSA5 extends FSA { + /** + * Default filler byte. + */ + public final static byte DEFAULT_FILLER = '_'; + + /** + * Default annotation byte. + */ + public final static byte DEFAULT_ANNOTATION = '+'; + + /** + * Automaton version as in the file header. + */ + public static final byte VERSION = 5; + + /** + * Bit indicating that an arc corresponds to the last character of a + * sequence available when building the automaton. + */ + public static final int BIT_FINAL_ARC = 1 << 0; + + /** + * Bit indicating that an arc is the last one of the node's list and the + * following one belongs to another node. + */ + public static final int BIT_LAST_ARC = 1 << 1; + + /** + * Bit indicating that the target node of this arc follows it in the + * compressed automaton structure (no goto field). + */ + public static final int BIT_TARGET_NEXT = 1 << 2; + + /** + * An offset in the arc structure, where the address and flags field begins. + * In version 5 of FSA automata, this value is constant (1, skip label). + */ + public final static int ADDRESS_OFFSET = 1; + + /** + * An array of bytes with the internal representation of the automaton. + * Please see the documentation of this class for more information on how + * this structure is organized. + */ + public final byte[] arcs; + + /** + * The length of the node header structure (if the automaton was compiled with + * NUMBERS option). Otherwise zero. + */ + public final int nodeDataLength; + + /** + * Flags for this automaton version. + */ + private final Set flags; + + /** + * Number of bytes each address takes in full, expanded form (goto length). + */ + public final int gtl; + + /** Filler character. */ + public final byte filler; + + /** Annotation character. */ + public final byte annotation; + + /** + * Read and wrap a binary automaton in FSA version 5. + */ + public FSA5(InputStream fsaStream) throws IOException { + // Read the header first. 
+ final FSAHeader header = FSAHeader.read(fsaStream); + + // Ensure we have version 5. + if (header.version != VERSION) { + throw new IOException("This class can read FSA version 5 only: " + header.version); + } + + /* + * Determine if the automaton was compiled with NUMBERS. If so, modify + * ctl and goto fields accordingly. + */ + flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); + if ((header.gtl & 0xf0) != 0) { + flags.add(NUMBERS); + } + + this.nodeDataLength = (header.gtl >>> 4) & 0x0f; + this.gtl = header.gtl & 0x0f; + + this.filler = header.filler; + this.annotation = header.annotation; + + arcs = readFully(fsaStream); + } + + /** + * Returns the start node of this automaton. + */ + @Override + public int getRootNode() { + // Skip dummy node marking terminating state. + final int epsilonNode = skipArc(getFirstArc(0)); + + // And follow the epsilon node's first (and only) arc. + return getDestinationNodeOffset(getFirstArc(epsilonNode)); + } + + /** + * {@inheritDoc} + */ + @Override + public final int getFirstArc(int node) { + return nodeDataLength + node; + } + + /** + * {@inheritDoc} + */ + @Override + public final int getNextArc(int arc) { + if (isArcLast(arc)) + return 0; + else + return skipArc(arc); + } + + /** + * {@inheritDoc} + */ + @Override + public int getArc(int node, byte label) { + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + if (getArcLabel(arc) == label) + return arc; + } + + // An arc labeled with "label" not found. 
+ return 0; + } + + /** + * {@inheritDoc} + */ + @Override + public int getEndNode(int arc) { + final int nodeOffset = getDestinationNodeOffset(arc); + assert nodeOffset != 0 : "No target node for terminal arcs."; + return nodeOffset; + } + + /** + * {@inheritDoc} + */ + @Override + public byte getArcLabel(int arc) { + return arcs[arc]; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcFinal(int arc) { + return (arcs[arc + ADDRESS_OFFSET] & BIT_FINAL_ARC) != 0; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcTerminal(int arc) { + return (0 == getDestinationNodeOffset(arc)); + } + + /** + * Returns the number encoded at the given node. The number equals the count + * of the set of suffixes reachable from node (called its right + * language). + */ + @Override + public int getRightLanguageCount(int node) { + assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS."; + return decodeFromBytes(arcs, node, nodeDataLength); + } + + /** + * {@inheritDoc} + * + *

For this automaton version, an additional {@link FSAFlags#NUMBERS} flag + * may be set to indicate the automaton contains extra fields for each node.

+ */ + @Override + public Set getFlags() { + return Collections.unmodifiableSet(flags); + } + + /** + * Returns true if this arc has LAST bit set. + * + * @see #BIT_LAST_ARC + */ + public boolean isArcLast(int arc) { + return (arcs[arc + ADDRESS_OFFSET] & BIT_LAST_ARC) != 0; + } + + /** + * @see #BIT_TARGET_NEXT + */ + public boolean isNextSet(int arc) { + return (arcs[arc + ADDRESS_OFFSET] & BIT_TARGET_NEXT) != 0; + } + + /** + * Returns an n-byte integer encoded in byte-packed representation. + */ + static final int decodeFromBytes( + final byte[] arcs, final int start, final int n) + { + int r = 0; + for (int i = n; --i >= 0;) { + r = r << 8 | (arcs[start + i] & 0xff); + } + return r; + } + + /** + * Returns the address of the node pointed to by this arc. + */ + final int getDestinationNodeOffset(int arc) { + if (isNextSet(arc)) { + /* The destination node follows this arc in the array. */ + return skipArc(arc); + } else { + /* + * The destination node address has to be extracted from the arc's + * goto field. + */ + return decodeFromBytes(arcs, arc + ADDRESS_OFFSET, gtl) >>> 3; + } + } + + /** + * Read the arc's layout and skip as many bytes, as needed. + */ + private int skipArc(int offset) { + return offset + (isNextSet(offset) + ? 
1 + 1 /* label + flags */ + : 1 + gtl /* label + flags/address */); + } +} \ No newline at end of file diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java new file mode 100644 index 0000000..21627a9 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java @@ -0,0 +1,332 @@ +package morfologik.fsa; + +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.*; + +import com.carrotsearch.hppc.*; +import com.carrotsearch.hppc.BitSet; + +import static morfologik.fsa.FSAFlags.*; + +/** + * Serializes in-memory {@link FSA} graphs to a binary format compatible with + * Jan Daciuk's fsa's package FSA5 format. + * + *

+ * It is possible to serialize the automaton with numbers required for perfect + * hashing. See {@link #withNumbers()} method. + *

+ * + * @see FSA5 + * @see FSA#read(java.io.InputStream) + */ +public final class FSA5Serializer implements FSASerializer { + /** + * Maximum number of bytes for a serialized arc. + */ + private final static int MAX_ARC_SIZE = 1 + 5; + + /** + * Maximum number of bytes for per-node data. + */ + private final static int MAX_NODE_DATA_SIZE = 16; + + /** + * Number of bytes for the arc's flags header (arc representation without + * the goto address). + */ + private final static int SIZEOF_FLAGS = 1; + + /** + * Supported flags. + */ + private final static EnumSet flags = EnumSet.of(NUMBERS, SEPARATORS, FLEXIBLE, STOPBIT, NEXTBIT); + + /** + * @see FSA5#filler + */ + public byte fillerByte = FSA5.DEFAULT_FILLER; + + /** + * @see FSA5#annotation + */ + public byte annotationByte = FSA5.DEFAULT_ANNOTATION; + + /** + * true if we should serialize with numbers. + * + * @see #withNumbers() + */ + private boolean withNumbers; + + /** + * A hash map of [state, offset] pairs. + */ + private IntIntOpenHashMap offsets = new IntIntOpenHashMap(); + + /** + * A hash map of [state, right-language-count] pairs. + */ + private IntIntOpenHashMap numbers = new IntIntOpenHashMap(); + + /** + * Serialize the automaton with the number of right-language sequences in + * each node. This is required to implement perfect hashing. The numbering + * also preserves the order of input sequences. + * + * @return Returns the same object for easier call chaining. 
+ */ + public FSA5Serializer withNumbers() { + withNumbers = true; + return this; + } + + /** + * {@inheritDoc} + */ + @Override + public FSA5Serializer withFiller(byte filler) { + this.fillerByte = filler; + return this; + } + + /** + * {@inheritDoc} + */ + @Override + public FSA5Serializer withAnnotationSeparator(byte annotationSeparator) { + this.annotationByte = annotationSeparator; + return this; + } + + /** + * {@inheritDoc} + */ + @Override + public FSASerializer withLogger(IMessageLogger logger) { + return this; + } + + /** + * Serialize root state s to an output stream in + * FSA5 format. + * + * @see #withNumbers + * @return Returns os for chaining. + */ + @Override + public T serialize(final FSA fsa, T os) + throws IOException { + + // Prepare space for arc offsets and linearize all the states. + int[] linearized = linearize(fsa); + + /* + * Calculate the number of bytes required for the node data, if + * serializing with numbers. + */ + int nodeDataLength = 0; + if (withNumbers) { + this.numbers = FSAUtils.rightLanguageForAllStates(fsa); + int maxNumber = numbers.get(fsa.getRootNode()); + while (maxNumber > 0) { + nodeDataLength++; + maxNumber >>>= 8; + } + } + + // Calculate minimal goto length. + int gtl = 1; + while (true) { + // First pass: calculate offsets of states. + if (!emitArcs(fsa, null, linearized, gtl, nodeDataLength)) { + gtl++; + continue; + } + + // Second pass: check if goto overflows anywhere. + if (emitArcs(fsa, null, linearized, gtl, nodeDataLength)) + break; + + gtl++; + } + + /* + * Emit the header. + */ + os.write(new byte[] { '\\', 'f', 's', 'a' }); + os.write(FSA5.VERSION); + os.write(fillerByte); + os.write(annotationByte); + os.write((nodeDataLength << 4) | gtl); + + /* + * Emit the automaton. + */ + boolean gtlUnchanged = emitArcs(fsa, os, linearized, gtl, nodeDataLength); + assert gtlUnchanged : "gtl changed in the final pass."; + + return os; + } + + /** + * Return supported flags. 
+ */ + @Override + public Set getFlags() { + return flags; + } + + /** + * Linearization of states. + */ + private int[] linearize(final FSA fsa) { + int[] linearized = new int[0]; + int last = 0; + + BitSet visited = new BitSet(); + IntStack nodes = new IntStack(); + nodes.push(fsa.getRootNode()); + + while (!nodes.isEmpty()) { + final int node = nodes.pop(); + if (visited.get(node)) + continue; + + if (last >= linearized.length) { + linearized = Arrays.copyOf(linearized, linearized.length + 100000); + } + + visited.set(node); + linearized[last++] = node; + + for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { + if (!fsa.isArcTerminal(arc)) { + int target = fsa.getEndNode(arc); + if (!visited.get(target)) + nodes.push(target); + } + } + } + + return Arrays.copyOf(linearized, last); + } + + /** + * Update arc offsets assuming the given goto length. + */ + private boolean emitArcs(FSA fsa, OutputStream os, int[] linearized, + int gtl, int nodeDataLength) throws IOException { + final ByteBuffer bb = ByteBuffer.allocate(Math.max(MAX_NODE_DATA_SIZE, + MAX_ARC_SIZE)); + + int offset = 0; + + // Add dummy terminal state. + offset += emitNodeData(bb, os, nodeDataLength, 0); + offset += emitArc(bb, os, gtl, 0, (byte) 0, 0); + + // Add epsilon state. + offset += emitNodeData(bb, os, nodeDataLength, 0); + if (fsa.getRootNode() != 0) + offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC | FSA5.BIT_TARGET_NEXT, (byte) '^', 0); + else + offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC , (byte) '^', 0); + + int maxStates = linearized.length; + for (int j = 0; j < maxStates; j++) { + final int s = linearized[j]; + + if (os == null) { + offsets.put(s, offset); + } else { + assert offsets.get(s) == offset : s + " " + offsets.get(s) + " " + offset; + } + + offset += emitNodeData(bb, os, nodeDataLength, withNumbers ? 
numbers.get(s) : 0); + + for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { + int targetOffset; + final int target; + if (fsa.isArcTerminal(arc)) { + targetOffset = 0; + target = 0; + } else { + target = fsa.getEndNode(arc); + targetOffset = offsets.get(target); + } + + int flags = 0; + if (fsa.isArcFinal(arc)) { + flags |= FSA5.BIT_FINAL_ARC; + } + + if (fsa.getNextArc(arc) == 0) { + flags |= FSA5.BIT_LAST_ARC; + + if (j + 1 < maxStates && target == linearized[j + 1] + && targetOffset != 0) { + flags |= FSA5.BIT_TARGET_NEXT; + targetOffset = 0; + } + } + + int bytes = emitArc(bb, os, gtl, flags, fsa.getArcLabel(arc), targetOffset); + if (bytes < 0) + // gtl too small. interrupt eagerly. + return false; + + offset += bytes; + } + } + + return true; + } + + /** */ + private int emitArc(ByteBuffer bb, OutputStream os, int gtl, int flags, byte label, int targetOffset) + throws IOException + { + int arcBytes = (flags & FSA5.BIT_TARGET_NEXT) != 0 ? SIZEOF_FLAGS : gtl; + + flags |= (targetOffset << 3); + bb.put(label); + for (int b = 0; b < arcBytes; b++) { + bb.put((byte) flags); + flags >>>= 8; + } + + if (flags != 0) { + // gtl too small. interrupt eagerly. 
+ return -1; + } + + bb.flip(); + int bytes = bb.remaining(); + if (os != null) { + os.write(bb.array(), bb.position(), bb.remaining()); + } + bb.clear(); + + return bytes; + } + + /** */ + private int emitNodeData(ByteBuffer bb, OutputStream os, + int nodeDataLength, int number) throws IOException { + if (nodeDataLength > 0 && os != null) { + for (int i = 0; i < nodeDataLength; i++) { + bb.put((byte) number); + number >>>= 8; + } + + bb.flip(); + os.write(bb.array(), bb.position(), bb.remaining()); + bb.clear(); + } + + return nodeDataLength; + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java new file mode 100644 index 0000000..0cf7cc0 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java @@ -0,0 +1,486 @@ +package morfologik.fsa; + +import java.util.*; + +import morfologik.util.Arrays; + +import static morfologik.fsa.ConstantArcSizeFSA.*; + +/** + * Fast, memory-conservative finite state automaton builder, returning a + * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction + * speed and memory consumption). + */ +public final class FSABuilder { + /** + * Debug and information constants. + * + * @see FSABuilder#getInfo() + */ + public enum InfoEntry { + SERIALIZATION_BUFFER_SIZE("Serialization buffer size"), + SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"), + CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"), + MAX_ACTIVE_PATH_LENGTH("Max active path"), + STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"), + STATE_REGISTRY_SIZE("Registry hash entries"), + ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)"); + + private final String stringified; + + InfoEntry(String stringified) { + this.stringified = stringified; + } + + @Override + public String toString() { + return stringified; + } + } + + /** A megabyte. 
*/ + private final static int MB = 1024 * 1024; + + /** + * Internal serialized FSA buffer expand ratio. + */ + private final static int BUFFER_GROWTH_SIZE = 5 * MB; + + /** + * Maximum number of labels from a single state. + */ + private final static int MAX_LABELS = 256; + + /** + * Comparator comparing full byte arrays consistently with + * {@link #compare(byte[], int, int, byte[], int, int)}. + */ + public static final Comparator LEXICAL_ORDERING = new Comparator() { + public int compare(byte[] o1, byte[] o2) { + return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length); + } + }; + + /** + * Internal serialized FSA buffer expand ratio. + */ + private final int bufferGrowthSize; + + /** + * Holds serialized and mutable states. Each state is a sequential list of + * arcs, the last arc is marked with {@link #BIT_ARC_LAST}. + */ + private byte[] serialized = new byte[0]; + + /** + * Number of bytes already taken in {@link #serialized}. Start from 1 to + * keep 0 a sentinel value (for the hash set and final state). + */ + private int size; + + /** + * States on the "active path" (still mutable). Values are addresses of each + * state's first arc. + */ + private int[] activePath = new int[0]; + + /** + * Current length of the active path. + */ + private int activePathLen; + + /** + * The next offset at which an arc will be added to the given state on + * {@link #activePath}. + */ + private int[] nextArcOffset = new int[0]; + + /** + * Root state. If negative, the automaton has been built already and cannot be extended. + */ + private int root; + + /** + * An epsilon state. The first and only arc of this state points either + * to the root or to the terminal state, indicating an empty automaton. + */ + private int epsilon; + + /** + * Hash set of state addresses in {@link #serialized}, hashed by + * {@link #hash(int, int)}. Zero reserved for an unoccupied slot. 
+ */ + private int[] hashSet = new int[2]; + + /** + * Number of entries currently stored in {@link #hashSet}. + */ + private int hashSize = 0; + + /** + * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions only. + */ + private byte [] previous; + + /** + * Information about the automaton and its compilation. + */ + private TreeMap info; + + /** + * {@link #previous} sequence's length, used in assertions only. + */ + private int previousLength; + + /** */ + public FSABuilder() { + this(BUFFER_GROWTH_SIZE); + } + + /** */ + public FSABuilder(int bufferGrowthSize) { + this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS); + + // Allocate epsilon state. + epsilon = allocateState(1); + serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST; + + // Allocate root, with an initial empty set of output arcs. + expandActivePath(1); + root = activePath[0]; + } + + /** + * Add a single sequence of bytes to the FSA. The input must be lexicographically greater + * than any previously added sequence. + */ + public void add(byte[] sequence, int start, int len) { + assert serialized != null : "Automaton already built."; + assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 : + "Input must be sorted: " + + Arrays.toString(previous, 0, previousLength) + " >= " + + Arrays.toString(sequence, start, len); + assert setPrevious(sequence, start, len); + + // Determine common prefix length. + final int commonPrefix = commonPrefix(sequence, start, len); + + // Make room for extra states on active path, if needed. + expandActivePath(len); + + // Freeze all the states after the common prefix. + for (int i = activePathLen - 1; i > commonPrefix; i--) { + final int frozenState = freezeState(i); + setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState); + nextArcOffset[i] = activePath[i]; + } + + // Create arcs to new suffix states. 
+ for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) { + final int p = nextArcOffset[i - 1]; + + serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0); + serialized[p + LABEL_OFFSET] = sequence[j++]; + setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]); + + nextArcOffset[i - 1] = p + ARC_SIZE; + } + + // Save last sequence's length so that we don't need to calculate it again. + this.activePathLen = len; + } + + /** Number of serialization buffer reallocations. */ + private int serializationBufferReallocations; + + /** + * Complete the automaton. + */ + public FSA complete() { + add(new byte[0], 0, 0); + + if (nextArcOffset[0] - activePath[0] == 0) { + // An empty FSA. + setArcTarget(epsilon, TERMINAL_STATE); + } else { + // An automaton with at least a single arc from root. + root = freezeState(0); + setArcTarget(epsilon, root); + } + + info = new TreeMap(); + info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length); + info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations); + info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size); + info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length); + info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length); + info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize); + info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB, + (this.serialized.length + this.hashSet.length * 4) / (double) MB); + + final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon); + this.serialized = null; + this.hashSet = null; + return fsa; + } + + /** + * Build a minimal, deterministic automaton from a sorted list of byte sequences. + */ + public static FSA build(byte[][] input) { + final FSABuilder builder = new FSABuilder(); + + for (byte [] chs : input) + builder.add(chs, 0, chs.length); + + return builder.complete(); + } + + /** + * Build a minimal, deterministic automaton from an iterable list of byte sequences. 
+ */ + public static FSA build(Iterable input) { + final FSABuilder builder = new FSABuilder(); + + for (byte [] chs : input) + builder.add(chs, 0, chs.length); + + return builder.complete(); + } + + /** + * Return various statistics concerning the FSA and its compilation. + */ + public Map getInfo() { + return info; + } + + /** Is this arc the state's last? */ + private boolean isArcLast(int arc) { + return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0; + } + + /** Is this arc final? */ + private boolean isArcFinal(int arc) { + return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0; + } + + /** Get label's arc. */ + private byte getArcLabel(int arc) { + return serialized[arc + LABEL_OFFSET]; + } + + /** + * Fills the target state address of an arc. + */ + private void setArcTarget(int arc, int state) { + arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE; + for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) { + serialized[--arc] = (byte) state; + state >>>= 8; + } + } + + /** + * Returns the address of an arc. + */ + private int getArcTarget(int arc) { + arc += ADDRESS_OFFSET; + return (serialized[arc]) << 24 | + (serialized[arc + 1] & 0xff) << 16 | + (serialized[arc + 2] & 0xff) << 8 | + (serialized[arc + 3] & 0xff); + } + + /** + * @return The number of common prefix characters with the previous + * sequence. + */ + private int commonPrefix(byte[] sequence, int start, int len) { + // Empty root state case. + final int max = Math.min(len, activePathLen); + int i; + for (i = 0; i < max; i++) { + final int lastArc = nextArcOffset[i] - ARC_SIZE; + if (sequence[start++] != getArcLabel(lastArc)) { + break; + } + } + + return i; + } + + /** + * Freeze a state: try to find an equivalent state in the interned states + * dictionary first, if found, return it, otherwise, serialize the mutable + * state at activePathIndex and return it. 
+ */ + private int freezeState(final int activePathIndex) { + final int start = activePath[activePathIndex]; + final int end = nextArcOffset[activePathIndex]; + final int len = end - start; + + // Set the last arc flag on the current active path's state. + serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST; + + // Try to locate a state with an identical content in the hash set. + final int bucketMask = (hashSet.length - 1); + int slot = hash(start, len) & bucketMask; + for (int i = 0;;) { + int state = hashSet[slot]; + if (state == 0) { + state = hashSet[slot] = serialize(activePathIndex); + if (++hashSize > hashSet.length / 2) + expandAndRehash(); + return state; + } else if (equivalent(state, start, len)) { + return state; + } + + slot = (slot + (++i)) & bucketMask; + } + } + + /** + * Reallocate and rehash the hash set. + */ + private void expandAndRehash() { + final int[] newHashSet = new int[hashSet.length * 2]; + final int bucketMask = (newHashSet.length - 1); + + for (int j = 0; j < hashSet.length; j++) { + final int state = hashSet[j]; + if (state > 0) { + int slot = hash(state, stateLength(state)) & bucketMask; + for (int i = 0; newHashSet[slot] > 0;) { + slot = (slot + (++i)) & bucketMask; + } + newHashSet[slot] = state; + } + } + this.hashSet = newHashSet; + } + + /** + * The total length of the serialized state data (all arcs). + */ + private int stateLength(int state) { + int arc = state; + while (!isArcLast(arc)) { + arc += ARC_SIZE; + } + return arc - state + ARC_SIZE; + } + + /** Return true if two regions in {@link #serialized} are identical. */ + private boolean equivalent(int start1, int start2, int len) { + if (start1 + len > size || start2 + len > size) + return false; + + while (len-- > 0) + if (serialized[start1++] != serialized[start2++]) + return false; + + return true; + } + + /** + * Serialize a given state on the active path. 
+ */ + private int serialize(final int activePathIndex) { + expandBuffers(); + + final int newState = size; + final int start = activePath[activePathIndex]; + final int len = nextArcOffset[activePathIndex] - start; + System.arraycopy(serialized, start, serialized, newState, len); + + size += len; + return newState; + } + + /** + * Hash code of a fragment of {@link #serialized} array. + */ + private int hash(int start, int byteCount) { + assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?"; + + int h = 0; + for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) { + h = 17 * h + getArcLabel(start); + h = 17 * h + getArcTarget(start); + if (isArcFinal(start)) h += 17; + } + + return h; + } + + /** + * Append a new mutable state to the active path. + */ + private void expandActivePath(int size) { + if (activePath.length < size) { + final int p = activePath.length; + activePath = java.util.Arrays.copyOf(activePath, size); + nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size); + + for (int i = p; i < size; i++) { + nextArcOffset[i] = activePath[i] = + allocateState(/* assume max labels count */ MAX_LABELS); + } + } + } + + /** + * Expand internal buffers for the next state. + */ + private void expandBuffers() { + if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) { + serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize); + serializationBufferReallocations++; + } + } + + /** + * Allocate space for a state with the given number of outgoing labels. + * + * @return state offset + */ + private int allocateState(int labels) { + expandBuffers(); + final int state = size; + size += labels * ARC_SIZE; + return state; + } + + /** + * Copy current into an internal buffer. 
+ */ + private boolean setPrevious(byte [] sequence, int start, int length) { + if (previous == null || previous.length < length) { + previous = new byte [length]; + } + + System.arraycopy(sequence, start, previous, 0, length); + previousLength = length; + return true; + } + + /** + * Lexicographic order of input sequences. By default, consistent with the "C" sort + * (absolute value of bytes, 0-255). + */ + public static int compare(byte [] s1, int start1, int lens1, + byte [] s2, int start2, int lens2) { + final int max = Math.min(lens1, lens2); + + for (int i = 0; i < max; i++) { + final byte c1 = s1[start1++]; + final byte c2 = s2[start2++]; + if (c1 != c2) + return (c1 & 0xff) - (c2 & 0xff); + } + + return lens1 - lens2; + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java new file mode 100644 index 0000000..9e381f4 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAFinalStatesIterator.java @@ -0,0 +1,154 @@ +package morfologik.fsa; + +import java.nio.ByteBuffer; +import java.util.*; + +/** + * An iterator that traverses the right language of a given node (all sequences + * reachable from a given node). + */ +public final class FSAFinalStatesIterator implements Iterator { + /** + * Default expected depth of the recursion stack (estimated longest sequence + * in the automaton). Buffers expand by the same value if exceeded. + */ + private final static int EXPECTED_MAX_STATES = 15; + + /** The FSA to which this iterator belongs. */ + private final FSA fsa; + + /** An internal cache for the next element in the FSA */ + private ByteBuffer nextElement; + + /** + * A buffer for the current sequence of bytes from the current node to the + * root. + */ + private byte[] buffer = new byte[EXPECTED_MAX_STATES]; + + /** Reusable byte buffer wrapper around {@link #buffer}. 
*/ + private ByteBuffer bufferWrapper = ByteBuffer.wrap(buffer); + + /** An arc stack for DFS when processing the automaton. */ + private int[] arcs = new int[EXPECTED_MAX_STATES]; + + /** Current processing depth in {@link #arcs}. */ + private int position; + + /** + * Create an instance of the iterator for a given node. + */ + public FSAFinalStatesIterator(FSA fsa, int node) { + this.fsa = fsa; + + if (fsa.getFirstArc(node) != 0) { + restartFrom(node); + } + } + + /** + * Restart walking from node. Allows iterator reuse. + */ + public void restartFrom(int node) { + position = 0; + bufferWrapper.clear(); + nextElement = null; + + pushNode(node); + } + + /** Returns true if there are still elements in this iterator. */ + @Override + public boolean hasNext() { + if (nextElement == null) { + nextElement = advance(); + } + + return nextElement != null; + } + + /** + * @return Returns a {@link ByteBuffer} with the sequence corresponding to + * the next final state in the automaton. + */ + @Override + public ByteBuffer next() { + if (nextElement != null) { + final ByteBuffer cache = nextElement; + nextElement = null; + return cache; + } else { + final ByteBuffer cache = advance(); + if (cache == null) { + throw new NoSuchElementException(); + } + return cache; + } + } + + /** + * Advances to the next available final state. + */ + private final ByteBuffer advance() { + if (position == 0) { + return null; + } + + while (position > 0) { + final int lastIndex = position - 1; + final int arc = arcs[lastIndex]; + + if (arc == 0) { + // Remove the current node from the queue. + position--; + continue; + } + + // Go to the next arc, but leave it on the stack + // so that we keep the recursion depth level accurate. + arcs[lastIndex] = fsa.getNextArc(arc); + + // Expand buffer if needed. 
/**
 * FSA automaton flags. Where applicable, flags follow Daciuk's fsa package.
 *
 * <p>Each constant carries a distinct bit in {@link #bits}; sets of flags are
 * combined into a single integer bit mask.
 */
public enum FSAFlags {
    /** Daciuk: flexible FSA encoding. */
    FLEXIBLE(1 << 0),

    /** Daciuk: stop bit in use. */
    STOPBIT(1 << 1),

    /** Daciuk: next bit in use. */
    NEXTBIT(1 << 2),

    /** Daciuk: tails compression. */
    TAILS(1 << 3),

    /*
     * These flags are outside of byte range (never occur in Daciuk's FSA).
     */

    /**
     * The FSA contains right-language count numbers on states.
     *
     * @see FSA#getRightLanguageCount(int)
     */
    NUMBERS(1 << 8),

    /**
     * The FSA supports legacy built-in separator and filler characters
     * (Daciuk's FSA package compatibility).
     */
    SEPARATORS(1 << 9);

    /**
     * Bit mask for the corresponding flag.
     */
    public final int bits;

    private FSAFlags(int bits) {
        this.bits = bits;
    }

    /**
     * Returns <code>true</code> if the corresponding flag's bit is set in
     * <code>flags</code>.
     */
    public static boolean isSet(int flags, FSAFlags flag) {
        return (flags & flag.bits) != 0;
    }

    /**
     * Returns the set of flags encoded in a single <code>short</code>.
     *
     * <p>Note: the parameter is declared as {@code Set<FSAFlags>}; the raw
     * {@code Set} visible in the imported source is an artifact of generics
     * being stripped during extraction (a raw set would not compile with the
     * enhanced-for loop below).
     */
    public static short asShort(Set<FSAFlags> flags) {
        short value = 0;
        for (FSAFlags f : flags) {
            value |= f.bits;
        }
        return value;
    }
}
+ */ + public static FSAHeader read(InputStream in) throws IOException { + if (FSA_MAGIC != FileUtils.readInt(in)) + throw new IOException("Invalid file header magic bytes."); + + final FSAHeader h = new FSAHeader(); + h.version = readByte(in); + h.filler = readByte(in); + h.annotation = readByte(in); + h.gtl = readByte(in); + + return h; + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java new file mode 100644 index 0000000..4015dce --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java @@ -0,0 +1,156 @@ +package morfologik.fsa; + +import java.util.BitSet; + +import com.carrotsearch.hppc.IntIntOpenHashMap; + +/** + * Compute additional information about an FSA: number of arcs, nodes, etc. + */ +public final class FSAInfo { + /** + * Computes the exact number of states and nodes by recursively traversing + * the FSA. + */ + private static class NodeVisitor { + final BitSet visitedArcs = new BitSet(); + final BitSet visitedNodes = new BitSet(); + + int nodes; + int arcs; + int totalArcs; + + private final FSA fsa; + + NodeVisitor(FSA fsa) { + this.fsa = fsa; + } + + public void visitNode(final int node) { + if (visitedNodes.get(node)) { + return; + } + visitedNodes.set(node); + + nodes++; + for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa + .getNextArc(arc)) { + if (!visitedArcs.get(arc)) { + arcs++; + } + totalArcs++; + visitedArcs.set(arc); + + if (!fsa.isArcTerminal(arc)) { + visitNode(fsa.getEndNode(arc)); + } + } + } + } + + /** + * Computes the exact number of final states. 
+ */ + private static class FinalStateVisitor { + final IntIntOpenHashMap visitedNodes = new IntIntOpenHashMap(); + + private final FSA fsa; + + FinalStateVisitor(FSA fsa) { + this.fsa = fsa; + } + + public int visitNode(int node) { + if (visitedNodes.containsKey(node)) + return visitedNodes.lget(); + + int fromHere = 0; + for (int arc = fsa.getFirstArc(node); + arc != 0; arc = fsa.getNextArc(arc)) + { + if (fsa.isArcFinal(arc)) + fromHere++; + + if (!fsa.isArcTerminal(arc)) { + fromHere += visitNode(fsa.getEndNode(arc)); + } + } + visitedNodes.put(node, fromHere); + return fromHere; + } + } + + /** + * Number of nodes in the automaton. + */ + public final int nodeCount; + + /** + * Number of arcs in the automaton, excluding an arcs from the zero node + * (initial) and an arc from the start node to the root node. + */ + public final int arcsCount; + + /** + * Total number of arcs, counting arcs that physically overlap due to + * merging. + */ + public final int arcsCountTotal; + + /** + * Number of final states (number of input sequences stored in the automaton). + */ + public final int finalStatesCount; + + /** + * Arcs size (in serialized form). 
+ */ + public final int size; + + /* + * + */ + public FSAInfo(FSA fsa) { + final NodeVisitor w = new NodeVisitor(fsa); + int root = fsa.getRootNode(); + if (root > 0) { + w.visitNode(root); + } + + this.nodeCount = 1 + w.nodes; + this.arcsCount = 1 + w.arcs; + this.arcsCountTotal = 1 + w.totalArcs; + + final FinalStateVisitor fsv = new FinalStateVisitor(fsa); + this.finalStatesCount = fsv.visitNode(fsa.getRootNode()); + + if (fsa instanceof FSA5) { + this.size = ((FSA5) fsa).arcs.length; + } else { + this.size = 0; + } + } + + /* + * + */ + public FSAInfo(int nodeCount, int arcsCount, int arcsCountTotal, int finalStatesCount) { + this.nodeCount = nodeCount; + this.arcsCount = arcsCount; + this.arcsCountTotal = arcsCountTotal; + this.finalStatesCount = finalStatesCount; + this.size = 0; + } + + /* + * + */ + @Override + public String toString() { + return "Nodes: " + nodeCount + + ", arcs visited: " + arcsCount + + ", arcs total: " + arcsCountTotal + + ", final states: " + finalStatesCount + + ", size: " + size; + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java new file mode 100644 index 0000000..fc52eeb --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java @@ -0,0 +1,43 @@ +package morfologik.fsa; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Set; + +/** + * All FSA serializers to binary formats will implement this interface. + */ +public interface FSASerializer { + /** + * Serialize a finite state automaton to an output stream. + */ + public T serialize(FSA fsa, T os) throws IOException; + + /** + * Returns the set of flags supported by the serializer (and the output automaton). + */ + public Set getFlags(); + + /** + * Log extra messages during construction. + */ + public FSASerializer withLogger(IMessageLogger logger); + + /** + * Supports built-in filler separator. 
Only if {@link #getFlags()} returns + * {@link FSAFlags#SEPARATORS}. + */ + public FSASerializer withFiller(byte filler); + + /** + * Supports built-in annotation separator. Only if {@link #getFlags()} returns + * {@link FSAFlags#SEPARATORS}. + */ + public FSASerializer withAnnotationSeparator(byte annotationSeparator); + + /** + * Supports built-in right language count on nodes, speeding up perfect hash counts. + * Only if {@link #getFlags()} returns {@link FSAFlags#NUMBERS}. + */ + public FSASerializer withNumbers(); +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java new file mode 100644 index 0000000..9e59003 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java @@ -0,0 +1,169 @@ +package morfologik.fsa; + +import static morfologik.fsa.MatchResult.*; + +/** + * This class implements some common matching and scanning operations on a + * generic FSA. + */ +public final class FSATraversal { + /** + * Target automaton. + */ + private final FSA fsa; + + /** + * Traversals of the given FSA. + */ + public FSATraversal(FSA fsa) { + this.fsa = fsa; + } + + /** + * Calculate perfect hash for a given input sequence of bytes. The perfect hash requires + * that {@link FSA} is built with {@link FSAFlags#NUMBERS} and corresponds to the sequential + * order of input sequences used at automaton construction time. + * + * @param start Start index in the sequence array. + * @param length Length of the byte sequence, must be at least 1. + * + * @return Returns a unique integer assigned to the input sequence in the automaton (reflecting + * the number of that sequence in the input used to build the automaton). Returns a negative + * integer if the input sequence was not part of the input from which the automaton was created. + * The type of mismatch is a constant defined in {@link MatchResult}. 
+ */ + public int perfectHash(byte[] sequence, int start, int length, int node) { + assert fsa.getFlags().contains(FSAFlags.NUMBERS) : "FSA not built with NUMBERS option."; + assert length > 0 : "Must be a non-empty sequence."; + + int hash = 0; + final int end = start + length - 1; + + int seqIndex = start; + byte label = sequence[seqIndex]; + + // Seek through the current node's labels, looking for 'label', update hash. + for (int arc = fsa.getFirstArc(node); arc != 0;) { + if (fsa.getArcLabel(arc) == label) { + if (fsa.isArcFinal(arc)) { + if (seqIndex == end) + return hash; + + hash++; + } + + if (fsa.isArcTerminal(arc)) { + /* The automaton contains a prefix of the input sequence. */ + return AUTOMATON_HAS_PREFIX; + } + + // The sequence is a prefix of one of the sequences stored in the automaton. + if (seqIndex == end) { + return SEQUENCE_IS_A_PREFIX; + } + + // Make a transition along the arc, go the target node's first arc. + arc = fsa.getFirstArc(fsa.getEndNode(arc)); + label = sequence[++seqIndex]; + continue; + } else { + if (fsa.isArcFinal(arc)) + hash++; + if (!fsa.isArcTerminal(arc)) + hash += fsa.getRightLanguageCount(fsa.getEndNode(arc)); + } + + arc = fsa.getNextArc(arc); + } + + // Labels of this node ended without a match on the sequence. + // Perfect hash does not exist. + return NO_MATCH; + } + + /** + * @see #perfectHash(byte[], int, int, int) + */ + public int perfectHash(byte[] sequence) { + return perfectHash(sequence, 0, sequence.length, fsa.getRootNode()); + } + + /** + * Same as {@link #match(byte[], int, int, int)}, but allows passing + * a reusable {@link MatchResult} object so that no intermediate garbage is + * produced. + * + * @return The same object as result, but with reset internal + * type and other fields. 
+ */ + public MatchResult match(MatchResult result, + byte[] sequence, int start, int length, int node) + { + if (node == 0) { + result.reset(NO_MATCH, start, node); + return result; + } + + final FSA fsa = this.fsa; + final int end = start + length; + for (int i = start; i < end; i++) { + final int arc = fsa.getArc(node, sequence[i]); + if (arc != 0) { + if (fsa.isArcFinal(arc) && i + 1 == end) { + /* The automaton has an exact match of the input sequence. */ + result.reset(EXACT_MATCH, i, node); + return result; + } + + if (fsa.isArcTerminal(arc)) { + /* The automaton contains a prefix of the input sequence. */ + result.reset(AUTOMATON_HAS_PREFIX, i + 1, 0); + return result; + } + + // Make a transition along the arc. + node = fsa.getEndNode(arc); + } else { + result.reset(NO_MATCH, i, node); + return result; + } + } + + /* The sequence is a prefix of at least one sequence in the automaton. */ + result.reset(SEQUENCE_IS_A_PREFIX, 0, node); + return result; + } + + /** + * Finds a matching path in the dictionary for a given sequence of labels + * from sequence and starting at node node. + * + * @param sequence + * An array of labels to follow in the FSA. + * @param start + * Starting index in sequence. + * @param length + * How many symbols to consider from sequence? + * @param node + * Start node identifier in the FSA. 
+ * + * @see #match(byte [], int) + */ + public MatchResult match(byte[] sequence, int start, int length, int node) { + return match(new MatchResult(), sequence, start, length, node); + } + + /** + * @see #match(byte[], int, int, int) + */ + public MatchResult match(byte[] sequence, int node) { + return match(sequence, 0, sequence.length, node); + } + + /** + * @see #match(byte[], int, int, int) + */ + public MatchResult match(byte[] sequence) { + return match(sequence, fsa.getRootNode()); + } +} \ No newline at end of file diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java b/morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java new file mode 100644 index 0000000..cad611e --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java @@ -0,0 +1,202 @@ +package morfologik.fsa; + +import java.io.IOException; +import java.io.StringWriter; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.TreeMap; + +import com.carrotsearch.hppc.IntIntOpenHashMap; + +/** + * Other FSA-related utilities not directly associated with the class hierarchy. + */ +public final class FSAUtils { + public final static class IntIntHolder { + public int a; + public int b; + + public IntIntHolder(int a, int b) { + this.a = a; + this.b = b; + } + + public IntIntHolder() { + } + } + + /** + * Returns the right-language reachable from a given FSA node, formatted + * as an input for the graphviz package (expressed in the dot + * language). + */ + public static String toDot(FSA fsa, int node) { + try { + StringWriter w = new StringWriter(); + toDot(w, fsa, node); + return w.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Saves the right-language reachable from a given FSA node, formatted + * as an input for the graphviz package (expressed in the dot + * language), to the given writer. 
+ */ + public static void toDot(Writer w, FSA fsa, int node) throws IOException { + w.write("digraph Automaton {\n"); + w.write(" rankdir = LR;\n"); + + final BitSet visited = new BitSet(); + + w.write(" stop [shape=doublecircle,label=\"\"];\n"); + w.write(" initial [shape=plaintext,label=\"\"];\n"); + w.write(" initial -> " + node + "\n\n"); + + visitNode(w, 0, fsa, node, visited); + w.write("}\n"); + } + + private static void visitNode(Writer w, int d, FSA fsa, int s, BitSet visited) throws IOException { + visited.set(s); + w.write(" "); w.write(Integer.toString(s)); + + if (fsa.getFlags().contains(FSAFlags.NUMBERS)) { + int nodeNumber = fsa.getRightLanguageCount(s); + w.write(" [shape=circle,label=\"" + nodeNumber + "\"];\n"); + } else { + w.write(" [shape=circle,label=\"\"];\n"); + } + + for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { + w.write(" "); + w.write(Integer.toString(s)); + w.write(" -> "); + if (fsa.isArcTerminal(arc)) { + w.write("stop"); + } else { + w.write(Integer.toString(fsa.getEndNode(arc))); + } + + final byte label = fsa.getArcLabel(arc); + w.write(" [label=\""); + if (Character.isLetterOrDigit(label)) + w.write((char) label); + else { + w.write("0x"); + w.write(Integer.toHexString(label & 0xFF)); + } + w.write("\""); + if (fsa.isArcFinal(arc)) w.write(" arrowhead=\"tee\""); + if (fsa instanceof FSA5) { + if (((FSA5) fsa).isNextSet(arc)) { + w.write(" color=\"blue\""); + } + } + + w.write("]\n"); + } + + for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { + if (!fsa.isArcTerminal(arc)) { + int endNode = fsa.getEndNode(arc); + if (!visited.get(endNode)) { + visitNode(w, d + 1, fsa, endNode, visited); + } + } + } + } + + /** + * All byte sequences generated as the right language of state. 
+ */ + public static ArrayList rightLanguage(FSA fsa, int state) { + final ArrayList rl = new ArrayList(); + final byte [] buffer = new byte [0]; + + descend(fsa, state, buffer, 0, rl); + + return rl; + } + + /** + * Recursive descend and collection of the right language. + */ + private static byte [] descend(FSA fsa, int state, byte [] b, int position, ArrayList rl) { + + if (b.length <= position) { + b = Arrays.copyOf(b, position + 1); + } + + for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { + b[position] = fsa.getArcLabel(arc); + + if (fsa.isArcFinal(arc)) { + rl.add(Arrays.copyOf(b, position + 1)); + } + + if (!fsa.isArcTerminal(arc)) + b = descend(fsa, fsa.getEndNode(arc), b, position + 1, rl); + } + + return b; + } + + /** + * Calculate fan-out ratio. + * @return The returned array: result[outgoing-arcs] + */ + public static TreeMap calculateFanOuts(final FSA fsa, int root) { + final int [] result = new int [256]; + fsa.visitInPreOrder(new StateVisitor() { + public boolean accept(int state) { + int count = 0; + for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) + count++; + result[count]++; + return true; + } + }); + + TreeMap output = new TreeMap(); + + int low = 1; // Omit #0, there is always a single node like that (dummy). + while (low < result.length && result[low] == 0) low++; + + int high = result.length - 1; + while (high >= 0 && result[high] == 0) high--; + + for (int i = low; i <= high; i++) { + output.put(i, result[i]); + } + + return output; + } + + /** + * Calculate the size of right language for each state in an FSA. + */ + public static IntIntOpenHashMap rightLanguageForAllStates(final FSA fsa) { + final IntIntOpenHashMap numbers = new IntIntOpenHashMap(); + + fsa.visitInPostOrder(new StateVisitor() { + public boolean accept(int state) { + int thisNodeNumber = 0; + for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { + thisNodeNumber += + (fsa.isArcFinal(arc) ? 
1 : 0) + + (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc))); + } + numbers.put(state, thisNodeNumber); + + return true; + } + }); + + return numbers; + } +} diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java b/morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java new file mode 100644 index 0000000..4d86c1b --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java @@ -0,0 +1,25 @@ +package morfologik.fsa; + +public interface IMessageLogger { + + /** + * Log progress to the console. + */ + public void log(String msg); + + /** + * Log message header and save current time. + */ + public void startPart(String header); + + /** + * + */ + public void endPart(); + + /** + * Log a two-part message. + */ + public void log(String header, Object v); + +} \ No newline at end of file diff --git a/morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java b/morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java new file mode 100644 index 0000000..2f5cbd7 --- /dev/null +++ b/morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java @@ -0,0 +1,86 @@ +package morfologik.fsa; + +/** + * A matching result returned from {@link FSATraversal}. + * + * @see FSATraversal + */ +public final class MatchResult { + /** + * The automaton has exactly one match for the input sequence. + */ + public static final int EXACT_MATCH = 0; + + /** + * The automaton has no match for the input sequence. + */ + public static final int NO_MATCH = -1; + + /** + * The automaton contains a prefix of the input sequence. That is: + * one of the input sequences used to build the automaton is a + * prefix of the input sequence that is shorter than the sequence. + * + *

/**
 * A reusable result object describing the outcome of a match operation
 * performed by {@link FSATraversal}.
 *
 * @see FSATraversal
 */
public final class MatchResult {
    /** The automaton has exactly one match for the input sequence. */
    public static final int EXACT_MATCH = 0;

    /** The automaton has no match for the input sequence. */
    public static final int NO_MATCH = -1;

    /**
     * The automaton contains a sequence that is a proper prefix of the input
     * (one of the sequences used to build the automaton is shorter than the
     * input). {@link MatchResult#index} holds the index of the first input
     * character not present in the dictionary.
     */
    public static final int AUTOMATON_HAS_PREFIX = -3;

    /**
     * The input is a prefix of at least one sequence in the automaton.
     * {@link MatchResult#node} is the state from which all sequences with
     * that prefix continue in the automaton.
     */
    public static final int SEQUENCE_IS_A_PREFIX = -4;

    /**
     * One of the match kind constants defined in this class.
     *
     * @see #NO_MATCH
     * @see #EXACT_MATCH
     * @see #AUTOMATON_HAS_PREFIX
     * @see #SEQUENCE_IS_A_PREFIX
     */
    public int kind;

    /** Input sequence's index; interpretation depends on {@link #kind}. */
    public int index;

    /** Automaton node; interpretation depends on {@link #kind}. */
    public int node;

    /** Fully-specified package-private constructor. */
    MatchResult(int kind, int index, int node) {
        reset(kind, index, node);
    }

    /** Package-private constructor with index and node zeroed. */
    MatchResult(int kind) {
        this(kind, 0, 0);
    }

    /** Creates a result initialized to {@link #NO_MATCH}. */
    public MatchResult() {
        this(NO_MATCH, 0, 0);
    }

    /** Reinitializes all fields in place (avoids reallocation). */
    final void reset(int kind, int index, int node) {
        this.kind = kind;
        this.index = index;
        this.node = node;
    }
}
/**
 * Compatibility layer for JVM 1.5: small array range helpers used by the
 * FSA code.
 */
public final class Arrays {
    private Arrays() {
        // No instances.
    }

    /**
     * Compare two object ranges for reference-equality, element by element
     * (using <code>==</code>, not <code>equals</code>).
     */
    public static boolean referenceEquals(Object[] a1, int a1s, Object[] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Compare two byte ranges for equality.
     */
    public static boolean equals(byte[] a1, int a1s, byte [] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Compare two boolean ranges for equality.
     */
    public static boolean equals(boolean[] a1, int a1s, boolean[] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Compare two int ranges for equality.
     */
    public static boolean equals(int[] a1, int a1s, int[] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Render the byte subsequence <code>[start, start + length)</code> the
     * same way {@link java.util.Arrays#toString(byte[])} would.
     */
    public static String toString(byte [] bytes, int start, int length)
    {
        // Copy whenever the requested window is not the whole array. The
        // previous check compared lengths only, which is fragile when
        // start != 0.
        if (start != 0 || bytes.length != length)
        {
            final byte [] sub = new byte [length];
            System.arraycopy(bytes, start, sub, 0, length);
            bytes = sub;
        }
        return java.util.Arrays.toString(bytes);
    }
}
/**
 * File and stream utility functions.
 */
public final class FileUtils {

    /**
     * No instances.
     */
    private FileUtils() {
        // empty
    }

    /**
     * Asserts that the given file exists, throwing {@link IOException}
     * otherwise. Optionally also requires it to be a readable plain file,
     * or a directory.
     */
    public static void assertExists(File fsaFile, boolean requireFile,
            boolean requireDirectory) throws IOException {
        if (!fsaFile.exists()) {
            throw new IOException("File does not exist: "
                    + fsaFile.getAbsolutePath());
        }

        if (requireFile) {
            if (!fsaFile.isFile() || !fsaFile.canRead()) {
                throw new IOException("File cannot be read: "
                        + fsaFile.getAbsolutePath());
            }
        }

        if (requireDirectory) {
            if (!fsaFile.isDirectory()) {
                throw new IOException("Not a directory: "
                        + fsaFile.getAbsolutePath());
            }
        }
    }

    /**
     * Closes all non-null closeables, ignoring I/O errors raised while
     * closing (best-effort cleanup).
     */
    public static void close(Closeable... closeables) {
        for (Closeable c : closeables) {
            if (c != null) {
                try {
                    c.close();
                } catch (IOException e) {
                    // Best-effort close; intentionally ignored.
                }
            }
        }
    }

    /**
     * Reads all bytes from an input stream (until EOF). The stream is not
     * closed by this method.
     */
    public static byte[] readFully(InputStream stream) throws IOException {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 16);
        final byte[] buffer = new byte[1024 * 8];
        int bytesCount;
        while ((bytesCount = stream.read(buffer)) > 0) {
            baos.write(buffer, 0, bytesCount);
        }
        return baos.toByteArray();
    }

    /**
     * Reads exactly <code>array.length</code> bytes into <code>array</code>.
     *
     * @throws EOFException If the stream ends before the array is filled.
     */
    public static void readFully(InputStream in, byte[] array)
            throws IOException {
        int offset = 0;
        int cnt;
        while ((cnt = in.read(array, offset, array.length - offset)) > 0) {
            offset += cnt;

            if (offset == array.length)
                break;
        }

        if (cnt < 0)
            throw new EOFException();
    }

    /**
     * Reads exactly 4 bytes (big-endian) from the input stream.
     */
    public static int readInt(InputStream in) throws IOException {
        int v = 0;
        for (int i = 0; i < 4; i++) {
            v = (v << 8) | (readByte(in) & 0xff);
        }
        return v;
    }

    /**
     * Writes exactly 4 bytes (big-endian) to the output stream.
     */
    public static void writeInt(OutputStream os, int v) throws IOException {
        os.write( v >>> 24);
        os.write((v >>> 16) & 0xff);
        os.write((v >>> 8) & 0xff);
        os.write( v & 0xff);
    }

    /**
     * Reads exactly 2 bytes (big-endian) from the input stream.
     */
    public static short readShort(InputStream in) throws IOException {
        return (short) (readByte(in) << 8 |
                readByte(in) & 0xff);
    }

    /**
     * Reads exactly one byte from the input stream.
     *
     * @throws EOFException if EOF is reached.
     */
    public static byte readByte(InputStream in) throws IOException {
        int b = in.read();
        if (b == -1)
            throw new EOFException();
        return (byte) b;
    }

    /**
     * Writes exactly 2 bytes (big-endian) to the output stream.
     */
    public static void writeShort(OutputStream os, short v) throws IOException {
        os.write((v >>> 8) & 0xff);
        os.write( v & 0xff);
    }
}
+ final ClassLoader ldr = Thread.currentThread().getContextClassLoader(); + + InputStream is; + if (ldr != null && (is = ldr.getResourceAsStream(resource)) != null) { + return is; + } else if ((is = ResourceUtils.class.getResourceAsStream(resource)) != null) { + return is; + } else if ((is = ClassLoader.getSystemResourceAsStream(resource)) != null) { + return is; + } + + // Try file path + final File f = new File(resource); + if (f.exists() && f.isFile() && f.canRead()) { + return new FileInputStream(f); + } + + throw new IOException("Could not locate resource: " + resource); + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java new file mode 100644 index 0000000..332bbcc --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java @@ -0,0 +1,27 @@ +package morfologik.fsa; + +import static org.junit.Assert.*; + +import org.junit.Test; + +/** + * + */ +public class CFSA2SerializerTest extends SerializerTestBase { + protected CFSA2Serializer createSerializer() { + return new CFSA2Serializer(); + } + + @Test + public void testVIntCoding() { + byte [] scratch = new byte [5]; + + int [] values = {0, 1, 128, 256, 0x1000, Integer.MAX_VALUE }; + + for (int v : values) { + int len = CFSA2.writeVInt(scratch, 0, v); + assertEquals(v, CFSA2.readVInt(scratch, 0)); + assertEquals(len, CFSA2.vIntLength(v)); + } + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java new file mode 100644 index 0000000..1d05cfc --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java @@ -0,0 +1,10 @@ +package morfologik.fsa; + +/** + * + */ +public class FSA5SerializerTest extends SerializerTestBase { + protected FSA5Serializer createSerializer() { + return new FSA5Serializer(); + } +} diff --git 
a/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java new file mode 100644 index 0000000..97b2f05 --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java @@ -0,0 +1,105 @@ +package morfologik.fsa; + +import static morfologik.fsa.FSAFlags.NEXTBIT; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.junit.Test; + +/** + * Additional tests for {@link FSA5}. + */ +public final class FSA5Test { + public ArrayList expected = new ArrayList(Arrays.asList( + "a", "aba", "ac", "b", "ba", "c")); + + @Test + public void testVersion5() throws IOException { + final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa")); + assertFalse(fsa.getFlags().contains(FSAFlags.NUMBERS)); + verifyContent(expected, fsa); + } + + @Test + public void testVersion5WithNumbers() throws IOException { + final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa")); + + verifyContent(expected, fsa); + assertTrue(fsa.getFlags().contains(FSAFlags.NUMBERS)); + } + + @Test + public void testArcsAndNodes() throws IOException { + final FSA fsa1 = FSA.read(this.getClass().getResourceAsStream( + "abc.fsa")); + final FSA fsa2 = FSA.read(this.getClass().getResourceAsStream( + "abc-numbers.fsa")); + + FSAInfo info1 = new FSAInfo(fsa1); + FSAInfo info2 = new FSAInfo(fsa2); + + assertEquals(info1.arcsCount, info2.arcsCount); + assertEquals(info1.nodeCount, info2.nodeCount); + + assertEquals(4, info2.nodeCount); + assertEquals(7, info2.arcsCount); + } + + @Test + public void testNumbers() throws IOException { + final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa")); + + 
assertTrue(fsa.getFlags().contains(NEXTBIT)); + + // Get all numbers for nodes. + byte[] buffer = new byte[128]; + final ArrayList result = new ArrayList(); + walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result); + + Collections.sort(result); + assertEquals(Arrays + .asList("0 c", "1 b", "2 ba", "3 a", "4 ac", "5 aba"), result); + } + + public static void walkNode(byte[] buffer, int depth, FSA fsa, int node, + int cnt, List result) throws IOException { + for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { + buffer[depth] = fsa.getArcLabel(arc); + + if (fsa.isArcFinal(arc) || fsa.isArcTerminal(arc)) { + result.add(cnt + " " + new String(buffer, 0, depth + 1, "UTF-8")); + } + + if (fsa.isArcFinal(arc)) + cnt++; + + if (!fsa.isArcTerminal(arc)) { + walkNode(buffer, depth + 1, fsa, fsa.getEndNode(arc), cnt, result); + cnt += fsa.getRightLanguageCount(fsa.getEndNode(arc)); + } + } + } + + private static void verifyContent(List expected, FSA fsa) throws IOException { + final ArrayList actual = new ArrayList(); + + int count = 0; + for (ByteBuffer bb : fsa.getSequences()) { + assertEquals(0, bb.arrayOffset()); + assertEquals(0, bb.position()); + actual.add(new String(bb.array(), 0, bb.remaining(), "UTF-8")); + count++; + } + assertEquals(expected.size(), count); + Collections.sort(actual); + assertEquals(expected, actual); + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java new file mode 100644 index 0000000..d2e1bad --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java @@ -0,0 +1,112 @@ +package morfologik.fsa; + +import static morfologik.fsa.FSATestUtils.*; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.Arrays; + +import morfologik.fsa.FSA; +import morfologik.fsa.FSABuilder; +import morfologik.util.MinMax; + +import org.junit.BeforeClass; +import org.junit.Test; + 
+public class FSABuilderTest { + private static byte[][] input; + private static byte[][] input2; + + @BeforeClass + public static void prepareByteInput() { + input = generateRandom(25000, new MinMax(1, 20), new MinMax(0, 255)); + input2 = generateRandom(40, new MinMax(1, 20), new MinMax(0, 3)); + } + + /** + * + */ + @Test + public void testEmptyInput() { + byte[][] input = {}; + checkCorrect(input, FSABuilder.build(input)); + } + + /** + * + */ + @Test + public void testHashResizeBug() throws Exception { + byte[][] input = { + {0, 1 }, + {0, 2 }, + {1, 1 }, + {2, 1 }, + }; + + FSA fsa = FSABuilder.build(input); + checkCorrect(input, FSABuilder.build(input)); + checkMinimal(fsa); + } + + /** + * + */ + @Test + public void testSmallInput() throws Exception { + byte[][] input = { + "abc".getBytes("UTF-8"), + "bbc".getBytes("UTF-8"), + "d".getBytes("UTF-8"), + }; + checkCorrect(input, FSABuilder.build(input)); + } + + /** + * Verify absolute byte-value ordering in the comparators and serialized automaton. + */ + @Test + public void testLexicographicOrder() throws IOException { + byte[][] input = { + {0}, + {1}, + {(byte) 0xff}, + }; + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + + // Check if lexical ordering is consistent with absolute byte value. 
+ assertEquals(0, input[0][0]); + assertEquals(1, input[1][0]); + assertEquals((byte) 0xff, input[2][0]); + + final FSA fsa; + checkCorrect(input, fsa = FSABuilder.build(input)); + + int arc = fsa.getFirstArc(fsa.getRootNode()); + assertEquals(0, fsa.getArcLabel(arc)); + arc = fsa.getNextArc(arc); + assertEquals(1, fsa.getArcLabel(arc)); + arc = fsa.getNextArc(arc); + assertEquals((byte) 0xff, fsa.getArcLabel(arc)); + } + + /** + * + */ + @Test + public void testRandom25000_largerAlphabet() { + FSA fsa = FSABuilder.build(input); + checkCorrect(input, fsa); + checkMinimal(fsa); + } + + /** + * + */ + @Test + public void testRandom25000_smallAlphabet() throws IOException { + FSA fsa = FSABuilder.build(input2); + checkCorrect(input2, fsa); + checkMinimal(fsa); + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java new file mode 100644 index 0000000..d6cfeee --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java @@ -0,0 +1,179 @@ +package morfologik.fsa; + +import java.nio.ByteBuffer; +import java.util.*; + +import morfologik.util.BufferUtils; +import morfologik.util.MinMax; + +import org.junit.Assert; + +public class FSATestUtils { + /** + * Generate a sorted list of random sequences. + */ + public static byte[][] generateRandom(int count, MinMax length, + MinMax alphabet) { + final byte[][] input = new byte[count][]; + final Random rnd = new Random(0x11223344); + for (int i = 0; i < count; i++) { + input[i] = randomByteSequence(rnd, length, alphabet); + } + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + return input; + } + + /** + * Generate a random string. 
+ */ + private static byte[] randomByteSequence(Random rnd, MinMax length, + MinMax alphabet) { + byte[] bytes = new byte[length.min + rnd.nextInt(length.range())]; + for (int i = 0; i < bytes.length; i++) { + bytes[i] = (byte) (alphabet.min + rnd.nextInt(alphabet.range())); + } + return bytes; + } + + /** + * Check if the DFSA is correct with respect to the given input. + */ + public static void checkCorrect(byte[][] input, FSA fsa) { + // (1) All input sequences are in the right language. + HashSet rl = new HashSet(); + for (ByteBuffer bb : fsa) { + rl.add(ByteBuffer.wrap(Arrays.copyOf(bb.array(), bb.remaining()))); + } + + HashSet uniqueInput = new HashSet(); + for (byte[] sequence : input) { + uniqueInput.add(ByteBuffer.wrap(sequence)); + } + + for (ByteBuffer sequence : uniqueInput) { + Assert.assertTrue("Not present in the right language: " + + BufferUtils.toString(sequence), rl.remove(sequence)); + } + + // (2) No other sequence _other_ than the input is in the right language. + Assert.assertEquals(0, rl.size()); + } + + /** + * Check if the DFSA reachable from a given state is minimal. This means no + * two states have the same right language. 
+ */ + public static void checkMinimal(final FSA fsa) { + final HashMap stateLanguages = new HashMap(); + + fsa.visitInPostOrder(new StateVisitor() { + private StringBuilder b = new StringBuilder(); + + public boolean accept(int state) { + List rightLanguage = allSequences(fsa, state); + Collections.sort(rightLanguage, FSABuilder.LEXICAL_ORDERING); + + b.setLength(0); + for (byte[] seq : rightLanguage) { + b.append(Arrays.toString(seq)); + b.append(','); + } + + String full = b.toString(); + Assert.assertFalse("State exists: " + state + " " + + full + " " + stateLanguages.get(full), stateLanguages.containsKey(full)); + stateLanguages.put(full, state); + + return true; + } + }); + } + + static List allSequences(FSA fsa, int state) { + ArrayList seq = new ArrayList(); + for (ByteBuffer bb : fsa.getSequences(state)) { + seq.add(Arrays.copyOf(bb.array(), bb.remaining())); + } + return seq; + } + + /** + * Check if two FSAs are identical. + */ + public static void checkIdentical(FSA fsa1, FSA fsa2) { + ArrayDeque fromRoot = new ArrayDeque(); + checkIdentical(fromRoot, + fsa1, fsa1.getRootNode(), new BitSet(), + fsa2, fsa2.getRootNode(), new BitSet()); + } + + /* + * + */ + static void checkIdentical(ArrayDeque fromRoot, + FSA fsa1, int node1, BitSet visited1, + FSA fsa2, int node2, BitSet visited2) { + int arc1 = fsa1.getFirstArc(node1); + int arc2 = fsa2.getFirstArc(node2); + + if (visited1.get(node1) != visited2.get(node2)) { + throw new RuntimeException("Two nodes should either be visited or not visited: " + + Arrays.toString(fromRoot.toArray()) + " " + + " node1: " + node1 + " " + + " node2: " + node2); + } + visited1.set(node1); + visited2.set(node2); + + TreeSet labels1 = new TreeSet(); + TreeSet labels2 = new TreeSet(); + while (true) { + labels1.add((char) fsa1.getArcLabel(arc1)); + labels2.add((char) fsa2.getArcLabel(arc2)); + + arc1 = fsa1.getNextArc(arc1); + arc2 = fsa2.getNextArc(arc2); + + if (arc1 == 0 || arc2 == 0) { + if (arc1 != arc2) { + throw new 
RuntimeException("Different number of labels at path: " + + Arrays.toString(fromRoot.toArray())); + } + break; + } + } + + if (!labels1.equals(labels2)) { + throw new RuntimeException("Different sets of labels at path: " + + Arrays.toString(fromRoot.toArray()) + ":\n" + + labels1 + "\n" + labels2); + } + + // recurse. + for (char chr : labels1) { + byte label = (byte) chr; + fromRoot.push(Character.isLetterOrDigit(chr) ? Character.toString(chr) : Integer.toString(chr)); + + arc1 = fsa1.getArc(node1, label); + arc2 = fsa2.getArc(node2, label); + + if (fsa1.isArcFinal(arc1) != fsa2.isArcFinal(arc2)) { + throw new RuntimeException("Different final flag on arcs at: " + + Arrays.toString(fromRoot.toArray()) + ", label: " + label); + } + + if (fsa1.isArcTerminal(arc1) != fsa2.isArcTerminal(arc2)) { + throw new RuntimeException("Different terminal flag on arcs at: " + + Arrays.toString(fromRoot.toArray()) + ", label: " + label); + } + + if (!fsa1.isArcTerminal(arc1)) { + checkIdentical(fromRoot, + fsa1, fsa1.getEndNode(arc1), visited1, + fsa2, fsa2.getEndNode(arc2), visited2); + } + + fromRoot.pop(); + } + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java b/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java new file mode 100644 index 0000000..ddafb6d --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java @@ -0,0 +1,160 @@ +package morfologik.fsa; + +import static org.junit.Assert.*; +import static morfologik.fsa.MatchResult.*; + +import java.io.*; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.HashSet; + + +import org.junit.Before; +import org.junit.Test; + +/** + * Tests {@link FSATraversal}. 
+ */ +public final class FSATraversalTest { + private FSA fsa; + + /** + * + */ + @Before + public void setUp() throws Exception { + fsa = FSA.read(this.getClass().getResourceAsStream("en_tst.dict")); + } + + /** + * + */ + @Test + public void testTraversalWithIterable() { + int count = 0; + for (ByteBuffer bb : fsa.getSequences()) { + assertEquals(0, bb.arrayOffset()); + assertEquals(0, bb.position()); + count++; + } + assertEquals(346773, count); + } + + /** + * + */ + @Test + public void testPerfectHash() throws IOException { + byte[][] input = new byte[][] { + { 'a' }, + { 'a', 'b', 'a' }, + { 'a', 'c' }, + { 'b' }, + { 'b', 'a' }, + { 'c' }, + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + final byte[] fsaData = + new FSA5Serializer() + .withNumbers() + .serialize(s, new ByteArrayOutputStream()) + .toByteArray(); + + final FSA5 fsa = (FSA5) FSA.read(new ByteArrayInputStream(fsaData)); + final FSATraversal traversal = new FSATraversal(fsa); + + int i = 0; + for (byte [] seq : input) + { + assertEquals(new String(seq), i++, traversal.perfectHash(seq)); + } + + // Check if the total number of sequences is encoded at the root node. + assertEquals(6, fsa.getRightLanguageCount(fsa.getRootNode())); + + // Check sub/super sequence scenarios. 
+ assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abax".getBytes("UTF-8"))); + assertEquals(SEQUENCE_IS_A_PREFIX, traversal.perfectHash("ab".getBytes("UTF-8"))); + assertEquals(NO_MATCH, traversal.perfectHash("d".getBytes("UTF-8"))); + assertEquals(NO_MATCH, traversal.perfectHash(new byte [] {0})); + + assertTrue(AUTOMATON_HAS_PREFIX < 0); + assertTrue(SEQUENCE_IS_A_PREFIX < 0); + assertTrue(NO_MATCH < 0); + } + + /** + * + */ + @Test + public void testRecursiveTraversal() { + final int[] counter = new int[] { 0 }; + + class Recursion { + public void dumpNode(final int node) { + int arc = fsa.getFirstArc(node); + do { + if (fsa.isArcFinal(arc)) { + counter[0]++; + } + + if (!fsa.isArcTerminal(arc)) { + dumpNode(fsa.getEndNode(arc)); + } + + arc = fsa.getNextArc(arc); + } while (arc != 0); + } + } + + new Recursion().dumpNode(fsa.getRootNode()); + + assertEquals(346773, counter[0]); + } + + /** + * Test {@link FSATraversal} and matching results. + */ + @Test + public void testMatch() throws IOException { + final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa")); + final FSATraversal traversalHelper = new FSATraversal(fsa); + + MatchResult m = traversalHelper.match("ax".getBytes()); + assertEquals(NO_MATCH, m.kind); + assertEquals(1, m.index); + assertEquals(new HashSet(Arrays.asList("ba", "c")), + suffixes(fsa, m.node)); + + assertEquals(EXACT_MATCH, + traversalHelper.match("aba".getBytes()).kind); + + m = traversalHelper.match("abalonger".getBytes()); + assertEquals(AUTOMATON_HAS_PREFIX, m.kind); + assertEquals("longer", "abalonger".substring(m.index)); + + m = traversalHelper.match("ab".getBytes()); + assertEquals(SEQUENCE_IS_A_PREFIX, m.kind); + assertEquals(new HashSet(Arrays.asList("a")), + suffixes(fsa, m.node)); + } + + /** + * Return all sequences reachable from a given node, as strings. 
+ */ + private HashSet suffixes(FSA fsa, int node) { + HashSet result = new HashSet(); + for (ByteBuffer bb : fsa.getSequences(node)) + { + try { + result.add(new String(bb.array(), bb.position(), bb.remaining(), "UTF-8")); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + return result; + } +} diff --git a/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java b/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java new file mode 100644 index 0000000..ce373ba --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java @@ -0,0 +1,256 @@ +package morfologik.fsa; + +import static morfologik.fsa.FSAFlags.NUMBERS; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.*; +import java.nio.ByteBuffer; +import java.util.*; + +import morfologik.util.BufferUtils; + +import org.junit.*; + +public abstract class SerializerTestBase { + @Test + public void testA() throws IOException { + byte[][] input = new byte[][] { + { 'a' }, + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + @Test + public void testArcsSharing() throws IOException { + byte[][] input = new byte[][] { + { 'a', 'c', 'f' }, + { 'a', 'd', 'g' }, + { 'a', 'e', 'h' }, + { 'b', 'd', 'g' }, + { 'b', 'e', 'h' }, + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + @Test + public void testFSA5SerializerSimple() throws IOException { + byte[][] input = new byte[][] { + { 'a' }, + { 'a', 'b', 'a' }, + { 'a', 'c' }, + { 'b' }, + { 'b', 'a' }, + { 'c' }, + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + @Test + public void testNotMinimal() throws IOException { + byte[][] input = new byte[][] { + { 'a', 'b', 'a' }, + { 'b' }, + { 'b', 
'a' } + }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + /** + * + */ + @Test + public void testFSA5Bug0() throws IOException { + checkCorrect(new String[] { + "3-D+A+JJ", + "3-D+A+NN", + "4-F+A+NN", + "z+A+NN", }); + } + + /** + * + */ + @Test + public void testFSA5Bug1() throws IOException { + checkCorrect(new String[] { "+NP", "n+N", "n+NP", }); + } + + private void checkCorrect(String[] strings) throws IOException { + byte[][] input = new byte[strings.length][]; + for (int i = 0; i < strings.length; i++) { + input[i] = strings[i].getBytes("ISO8859-1"); + } + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + /** + * + */ + @Test + public void testEmptyInput() throws IOException { + byte[][] input = new byte[][] {}; + FSA s = FSABuilder.build(input); + + checkSerialization(input, s); + } + + /** + * + */ + @Test + public void test_abc() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("abc.fsa"))); + } + + /** + * + */ + @Test + public void test_minimal() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal.fsa"))); + } + + /** + * + */ + @Test + public void test_minimal2() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal2.fsa"))); + } + + /** + * + */ + @Test + public void test_en_tst() throws IOException { + testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("en_tst.dict"))); + } + + private void testBuiltIn(FSA fsa) throws IOException { + final ArrayList sequences = new ArrayList(); + + sequences.clear(); + for (ByteBuffer bb : fsa) { + sequences.add(Arrays.copyOf(bb.array(), bb.remaining())); + } + + Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING); + + final byte[][] in = sequences.toArray(new byte[sequences.size()][]); + FSA root = FSABuilder.build(in); + + // Check if the DFSA is 
correct first. + FSATestUtils.checkCorrect(in, root); + + // Check serialization. + checkSerialization(in, root); + } + + /** */ + private void checkSerialization(byte[][] input, FSA root) + throws IOException { + checkSerialization0(createSerializer(), input, root); + if (createSerializer().getFlags().contains(FSAFlags.NUMBERS)) { + checkSerialization0(createSerializer().withNumbers(), input, root); + } + } + + /** */ + private void checkSerialization0(FSASerializer serializer, + final byte[][] in, FSA root) throws IOException { + final byte[] fsaData = serializer.serialize(root, + new ByteArrayOutputStream()).toByteArray(); + + FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); + checkCorrect(in, fsa); + } + + /** + * Check if the FSA is correct with respect to the given input. + */ + protected void checkCorrect(byte[][] input, FSA fsa) { + // (1) All input sequences are in the right language. + HashSet rl = new HashSet(); + for (ByteBuffer bb : fsa) { + byte[] array = bb.array(); + int length = bb.remaining(); + rl.add(ByteBuffer.wrap(Arrays.copyOf(array, length))); + } + + HashSet uniqueInput = new HashSet(); + for (byte[] sequence : input) { + uniqueInput.add(ByteBuffer.wrap(sequence)); + } + + for (ByteBuffer sequence : uniqueInput) { + Assert.assertTrue("Not present in the right language: " + + BufferUtils.toString(sequence), rl.remove(sequence)); + } + + // (2) No other sequence _other_ than the input is in the right + // language. 
+ Assert.assertEquals(0, rl.size()); + } + + @Test + public void testAutomatonWithNodeNumbers() throws IOException { + Assume.assumeTrue(createSerializer().getFlags().contains(FSAFlags.NUMBERS)); + + byte[][] input = new byte[][] { + { 'a' }, + { 'a', 'b', 'a' }, + { 'a', 'c' }, + { 'b' }, + { 'b', 'a' }, + { 'c' }, }; + + Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); + FSA s = FSABuilder.build(input); + + final byte[] fsaData = + createSerializer() + .withNumbers() + .serialize(s, new ByteArrayOutputStream()).toByteArray(); + + FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); + + // Ensure we have the NUMBERS flag set. + assertTrue(fsa.getFlags().contains(NUMBERS)); + + // Get all numbers from nodes. + byte[] buffer = new byte[128]; + final ArrayList result = new ArrayList(); + FSA5Test.walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result); + + Collections.sort(result); + assertEquals( + Arrays.asList("0 a", "1 aba", "2 ac", "3 b", "4 ba", "5 c"), + result); + } + + /** + * + */ + protected abstract FSASerializer createSerializer(); +} diff --git a/morfologik-fsa/src/test/java/morfologik/util/MinMax.java b/morfologik-fsa/src/test/java/morfologik/util/MinMax.java new file mode 100644 index 0000000..4af6118 --- /dev/null +++ b/morfologik-fsa/src/test/java/morfologik/util/MinMax.java @@ -0,0 +1,21 @@ +package morfologik.util; + +/** + * Minimum/maximum and range. 
+ */ +public final class MinMax +{ + public final int min; + public final int max; + + public MinMax(int min, int max) + { + this.min = Math.min(min, max); + this.max = Math.max(min, max); + } + + public int range() + { + return max - min; + } +} \ No newline at end of file diff --git a/morfologik-fsa/src/test/resources/morfologik/fsa/abc-numbers.fsa b/morfologik-fsa/src/test/resources/morfologik/fsa/abc-numbers.fsa new file mode 100644 index 0000000..d97091d Binary files /dev/null and b/morfologik-fsa/src/test/resources/morfologik/fsa/abc-numbers.fsa differ diff --git a/morfologik-fsa/src/test/resources/morfologik/fsa/abc.fsa b/morfologik-fsa/src/test/resources/morfologik/fsa/abc.fsa new file mode 100644 index 0000000..68c0b96 Binary files /dev/null and b/morfologik-fsa/src/test/resources/morfologik/fsa/abc.fsa differ diff --git a/morfologik-fsa/src/test/resources/morfologik/fsa/abc.in b/morfologik-fsa/src/test/resources/morfologik/fsa/abc.in new file mode 100644 index 0000000..7bb8744 --- /dev/null +++ b/morfologik-fsa/src/test/resources/morfologik/fsa/abc.in @@ -0,0 +1,6 @@ +a +aba +ac +b +ba +c diff --git a/morfologik-fsa/src/test/resources/morfologik/fsa/en_tst.dict b/morfologik-fsa/src/test/resources/morfologik/fsa/en_tst.dict new file mode 100644 index 0000000..09cc22b Binary files /dev/null and b/morfologik-fsa/src/test/resources/morfologik/fsa/en_tst.dict differ diff --git a/morfologik-fsa/src/test/resources/morfologik/fsa/minimal.fsa b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal.fsa new file mode 100644 index 0000000..9d667b7 Binary files /dev/null and b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal.fsa differ diff --git a/morfologik-fsa/src/test/resources/morfologik/fsa/minimal.in b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal.in new file mode 100644 index 0000000..7ae8d81 --- /dev/null +++ b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal.in @@ -0,0 +1,3 @@ ++NP +n+N +n+NP diff --git 
a/morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.fsa b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.fsa new file mode 100644 index 0000000..e81f6d0 Binary files /dev/null and b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.fsa differ diff --git a/morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.in b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.in new file mode 100644 index 0000000..d28708d --- /dev/null +++ b/morfologik-fsa/src/test/resources/morfologik/fsa/minimal2.in @@ -0,0 +1,24 @@ +3-D+A+JJ +3-D+A+NN +4-F+A+NN +4-H+A+JJ +z+A+NN +z-axis+A+NN +zB+A+NN +zZt+A+NNP +za-zen+A+NN +zabaglione+A+NN +zabagliones+B+NNS +zabajone+A+NN +zabajones+B+NNS +zabaione+A+NN +zabaiones+B+NNS +zabra+A+NN +zabras+B+NNS +zack+A+NN +zacaton+A+NN +zacatons+B+NNS +zacatun+A+NN +zaddik+A+NN +zaddiks+B+NNS +zaffar+A+NN \ No newline at end of file diff --git a/morfologik-polish/pom.xml b/morfologik-polish/pom.xml new file mode 100644 index 0000000..57f9456 --- /dev/null +++ b/morfologik-polish/pom.xml @@ -0,0 +1,58 @@ + + + + + 4.0.0 + + + org.carrot2 + morfologik-parent + 1.9.0 + ../pom.xml + + + morfologik-polish + jar + + Morfologik Stemming Dictionary for Polish + Morfologik Stemming Dictionary for Polish. + + + + org.carrot2 + morfologik-stemming + ${project.version} + + + + com.carrotsearch + junit-benchmarks + test + + + + junit + junit + test + + + + + install + + + + org.apache.maven.plugins + maven-jar-plugin + + + + true + + + + + + + diff --git a/morfologik-polish/src/main/java/morfologik/stemming/PolishStemmer.java b/morfologik-polish/src/main/java/morfologik/stemming/PolishStemmer.java new file mode 100644 index 0000000..ac346e9 --- /dev/null +++ b/morfologik-polish/src/main/java/morfologik/stemming/PolishStemmer.java @@ -0,0 +1,54 @@ +package morfologik.stemming; + +import java.util.*; + +/** + * A dictionary-based stemmer for the Polish language. 
This stemmer requires an + * FSA-compiled dictionary to be present in classpath resources. + * + * Objects of this class are not thread safe. + * + * @see morfologik.stemming.DictionaryLookup + */ +public final class PolishStemmer implements IStemmer, Iterable { + /** + * Dictionary lookup delegate. + */ + private final List delegate = new ArrayList(); + + /* + * + */ + public PolishStemmer() { + delegate.add(new DictionaryLookup(Dictionary.getForLanguage("pl"))); + } + + /** + * {@inheritDoc} + */ + public List lookup(CharSequence word) { + if (delegate.size() == 1) { + return delegate.get(0).lookup(word); + } else { + List forms = null; + for (DictionaryLookup lookup : delegate) { + forms = lookup.lookup(word); + if (forms.size() > 0) + break; + } + return forms; + } + } + + /** + * Iterates over all dictionary forms stored in this stemmer. + */ + public Iterator iterator() { + if (delegate.size() == 1) { + return delegate.get(0).iterator(); + } else { + throw new RuntimeException("No iteration over compound stemmer forms: " + + Arrays.toString(delegate.toArray())); + } + } +} diff --git a/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-en b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-en new file mode 100644 index 0000000..292c1ca --- /dev/null +++ b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-en @@ -0,0 +1,67 @@ +README + +Morfologik is a project aiming at generating Polish morphosyntactic +dictionaries (hence the name) used for part-of-speech tagging and +part-of-speech synthesis. + +VERSION: 2.0 PoliMorf + +BUILD: 8 mar 2013 15:53:45 + +LICENCE + +Copyright (c) 2013, Marcin Mikowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +1. morfologik.txt is a tab-separated file, containing the following format: + +inflected-formHTbase-formHTtags + +where HT means a horizontal tab. + +2. polish.dict is a binary dictionary file for morphological analysis in +fsa_morph program by Jan Daciuk +(see http://www.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/fsa.html), +usable also in LanguageTool grammar checker. + +3. polish_synth.dict is a binary file for grammatical synthesis, usable +by morfologik-stemming library. To get an inflected word, use the +following syntax in fsa_morph: + +| + +For example: + +niemiecki|adjp + +gives "niemiecku+". + +4. polish.info and polish_synth.info are required for using the binary +dictionaries in morfologik-stemming Java library. + +TAGSET + +The tagset used is roughly similar to IPI/NKJP corpus tagset, and described in more detail in the readme_pl.txt file. +See also www.nkjp.pl. + +Morfologik, (c) 2007-2013 Marcin Mikowski. 
\ No newline at end of file diff --git a/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-pl b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-pl new file mode 100644 index 0000000..01ce8c7 --- /dev/null +++ b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.README-pl @@ -0,0 +1,141 @@ +INFORMACJA + +Morfologik to projekt tworzenia polskich słowników morfosyntaktycznych (stąd nazwa) służących do znakowania +morfosyntaktycznego i syntezy gramatycznej. + +LICENCJA + +Copyright © 2013 Marcin Miłkowski +Wszelkie prawa zastrzeżone +Redystrybucja i używanie, czy to w formie kodu źródłowego, czy w formie kodu wykonawczego, są dozwolone pod warunkiem spełnienia poniższych warunków: +1. Redystrybucja kodu źródłowego musi zawierać powyższą notę copyrightową, niniejszą listę warunków oraz poniższe oświadczenie o wyłączeniu odpowiedzialności. +2. Redystrybucja kodu wykonawczego musi zawierać powyższą notę copyrightową, niniejszą listę warunków oraz poniższe oświadczenie o wyłączeniu odpowiedzialności w dokumentacji i/lub w innych materiałach dostarczanych wraz z kopią oprogramowania. +TO OPROGRAMOWANIE JEST DOSTARCZONE PRZEZ „TAKIM, JAKIE JEST”. KAŻDA, DOROZUMIANA LUB BEZPOŚREDNIO WYRAŻONA GWARANCJA, NIE WYŁĄCZAJĄC DOROZUMIANEJ GWARANCJI PRZYDATNOŚCI HANDLOWEJ I PRZYDATNOŚCI DO OKREŚLONEGO ZASTOSOWANIA, JEST WYŁĄCZONA. 
W ŻADNYM WYPADKU NIE MOGĄ BYĆ ODPOWIEDZIALNI ZA JAKIEKOLWIEK BEZPOŚREDNIE, POŚREDNIE, INCYDENTALNE, SPECJALNE, UBOCZNE I WTÓRNE SZKODY (NIE WYŁĄCZAJĄC OBOWIĄZKU DOSTARCZENIA PRODUKTU ZASTĘPCZEGO LUB SERWISU, ODPOWIEDZIALNOŚCI Z TYTUŁU UTRATY WALORÓW UŻYTKOWYCH, UTRATY DANYCH LUB KORZYŚCI, A TAKŻE PRZERW W PRACY PRZEDSIĘBIORSTWA) SPOWODOWANE W JAKIKOLWIEK SPOSÓB I NA PODSTAWIE ISTNIEJĄCEJ W TEORII ODPOWIEDZIALNOŚCI KONTRAKTOWEJ, CAŁKOWITEJ LUB DELIKTOWEJ (WYNIKŁEJ ZARÓWNO Z NIEDBALSTWA JAK INNYCH POSTACI WINY), POWSTAŁE W JAKIKOLWIEK SPOSÓB W WYNIKU UŻYWANIA LUB MAJĄCE ZWIĄZEK Z UŻYWANIEM OPROGRAMOWANIA, NAWET JEŚLI O MOŻLIWOŚCI POWSTANIA TAKICH SZKÓD OSTRZEŻONO. + +ŹRÓDŁO + +Dane pochodzą ze słownika sjp.pl oraz słownika PoliMorf i są licencjonowane na powyższej licencji. Dane źródłowe pochodzą z polskiego słownika ispell, następnie redagowanego na stronach kurnik.pl/slownik i sjp.pl, a także Słownika Gramatycznego Języka Polskiego. Autorzy: (1) ispell: Mirosław Prywata, Piotr Gackiewicz, Włodzimierz Macewicz, Łukasz Szałkiewicz, Marek Futrega. +(2) SGJP: Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin Woliński, Robert Wołosz. + +PLIKI + +1. morfologik.txt to plik tekstowy z polami rozdzielanymi tabulatorem, +o następującym formacie: + +forma-odmienionaHTforma-podstawowaHTznaczniki + +gdzie HT oznacza tabulator poziomy. + +Kodowanie: UTF-8 + +2. polish.dict to binarny plik słownika dla programu fsa_morph Jana Daciuka +(zob. http://www.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/fsa.html), +wykorzystywany również bezpośrednio przez korektor gramatyczny LanguageTool. + +3. polish_synth.dict to binarny plik słownika syntezy gramatycznej, +używany w fsa_morph i LanguageTool. Aby uzyskać formę odmienioną, +należy używać następującej składni w zapytaniu programu fsa_morph: + +| + +Na przykład: + +niemiecki|adjp + +daje "niemiecku+". + +4. polish.info i polish_synth.info - pliki wymagane do użycia plików +binarnych w bibliotece morfologik-stemming. 
+ +Do wykonania skryptów źródłowych są potrzebne następujące pliki: + +Plik: odm.txt - aktualny słownik z witryny www.kurnik.pl/slownik (słownik odmian) +Strona: http://www.kurnik.pl/slownik/odmiany/ +Pobieranie: http://www.kurnik.org/dictionary/odm/alt-odm-.tar.gz + +Plik: polish.all - ze słownika alternatywnego +Strona: http://www.kurnik.pl/slownik/ort/ +Pobieranie: http://www.kurnik.org/dictionary/alt-ispell-pl-src.tar.bz2 + +Plik : pl_PL.aff (plik afiksów) +Strona: http://www.kurnik.pl/slownik/ort/ +Pobieranie: http://www.kurnik.org/dictionary/alt-myspell-pl.tar.bz2 + +Należy też ze strony http://www.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/fsa_polski.html +Pobrać ftp://ftp.pg.gda.pl/pub/software/xtras-PG/fsa/fsa_current.tar.gz i zbudować *fsa_build +Z tego pakietu wykorzystany bedzie skrypt *morph_infix.awk*. +W pliku Makefile należy sprawdzić, czy ścieżka do pliku jest prawidłowa. + + +ZNACZNIKI MORFOSYNTAKTYCZNE + +Zestaw znaczników jest zbliżony do zestawu korpusu IPI (www.korpus.pl). + + * adj - przymiotnik (np. "niemiecki") + * adjp - przymiotnik poprzyimkowy (np. "niemiecku") + * adv - przysłówek (np. "głupio") + * depr - forma deprecjatywna + * ger - rzeczownik odsłowny + * conj - spójnik + * num - liczebnik + * pact - imiesłów przymiotnikowy czynny + * pant - imiesłów przysłówkowy uprzedni + * pcon - imiesłów przysłówkowy współczesny + * ppas - imiesłów przymiotnikowy bierny + * ppron12 - zaimek nietrzecioosobowy + * ppron3 - zaimek trzecioosobowy + * pred - predykatyw (np. "trzeba") + * prep - przyimek + * siebie - zaimek "siebie" + * subst - rzeczownik + * verb - czasownik + * brev - skrót + * interj - wykrzyknienie + * xxx - jednostka obca + +Atrybuty podstawowych form: + + * sg - liczba pojedyncza + * pl - liczba mnoga + * irreg - forma nieregularna (nierozpoznana dokładniej pod względem wartości atrybutów, np. 
subst:irreg) + * nom - mianownik + * gen - dopełniacz + * acc - biernik + * dat - celownik + * inst - narzędnik + * loc - miejscownik + * voc - wołacz + * pos - stopień równy + * comp - stopień wyższy + * sup - stopień najwyższy + * m1, m2, m3 - rodzaje męskie + * n1, n2 - rodzaje nijakie + * f - rodzaj żeński + * pri - pierwsza osoba + * sec - druga osoba + * tri - trzecia osoba + * aff - forma niezanegowana + * neg - forma zanegowana + * refl - forma zwrotna czasownika [nie występuje w znacznikach IPI] + * perf - czasownik dokonany + * imperf - czasownik niedokonany + * imperf.perf - czasownik, który może występować zarówno jako dokonany, jak i jako niedokonany + * nakc - forma nieakcentowana zaimka + * akc - forma akcentowana zaimka + * praep - forma poprzyimkowa + * npraep - forma niepoprzyimkowa + * ger - rzeczownik odsłowny + * imps - forma bezosobowa + * impt - tryb rozkazujący + * inf - bezokolicznik + * fin - forma nieprzeszła + * bedzie - forma przyszła "być" + * praet - forma przeszła czasownika (pseudoimiesłów) + * pot - tryb przypuszczający [nie występuje w znacznikach IPI] + * nstd - forma niestandardowa, np. archaiczna [nie występuje w znacznikach IPI] + * pun - skrót z kropką [za NKJP] + * npun - bez kropki [za NKJP] + +W znacznikach Morfologika nie występuje i nie będzie występować znacznik aglt, a to ze względu na inną zasadę segmentacji wyrazów. + +Morfologik, (c) 2007-2013 Marcin Miłkowski. 
\ No newline at end of file diff --git a/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.dict b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.dict new file mode 100644 index 0000000..459a733 Binary files /dev/null and b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.dict differ diff --git a/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.info b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.info new file mode 100644 index 0000000..c0a0a42 --- /dev/null +++ b/morfologik-polish/src/main/resources/morfologik/dictionaries/pl.info @@ -0,0 +1,36 @@ +# +# Morfologik Polish dictionary. +# Version: 2.0 PoliMorf +# Copyright (c) 2013, Marcin Mikowski +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +fsa.dict.author=morfologik.blogspot.com +fsa.dict.created=24 oct 2013 18:18:00 +fsa.dict.license=BSD. http://morfologik.blogspot.com + +fsa.dict.separator=+ +fsa.dict.encoding=UTF-8 + +fsa.dict.uses-prefixes=true +fsa.dict.uses-infixes=false diff --git a/morfologik-polish/src/test/java/morfologik/stemming/PerformanceTest.java b/morfologik-polish/src/test/java/morfologik/stemming/PerformanceTest.java new file mode 100644 index 0000000..d93216d --- /dev/null +++ b/morfologik-polish/src/test/java/morfologik/stemming/PerformanceTest.java @@ -0,0 +1,73 @@ +package morfologik.stemming; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; + +import org.junit.BeforeClass; +import org.junit.Test; + +import com.carrotsearch.junitbenchmarks.AbstractBenchmark; +import com.carrotsearch.junitbenchmarks.BenchmarkOptions; + +/** + * Simple performance micro-benchmarks. + */ +@BenchmarkOptions(callgc = false, warmupRounds = 5, benchmarkRounds = 10) +public class PerformanceTest extends AbstractBenchmark { + /* Guard against escape analysis and HotSpot opts. */ + public volatile int guard; + + /* Test data. */ + static final int sequences = 100000; + static final String[] testWords = new String[sequences]; + static final PolishStemmer stemmer = new PolishStemmer(); + + /** + * Prepare test data. 
+ */ + @BeforeClass + public static void prepare() throws UnsupportedEncodingException + { + final Dictionary dict = Dictionary.getForLanguage("pl"); + int i = 0; + for (ByteBuffer sequence : dict.fsa) { + testWords[i] = new String(sequence.array(), 0, + sequence.remaining(), dict.metadata.getEncoding()); + testWords[i] = testWords[i].substring(0, testWords[i] + .indexOf(dict.metadata.getSeparator())); + i++; + + if (i == testWords.length) + break; + } + } + + @Test + public void traversal_100000() throws IOException { + final Dictionary dict = Dictionary.getForLanguage("pl"); + + int max = sequences; + int guard = 0; + for (ByteBuffer sequence : dict.fsa) { + guard += sequence.remaining(); + if (--max == 0) + break; + } + + this.guard = guard; + } + + @Test + public void stemming_100000() throws IOException { + int guard = 0; + for (String word : testWords) { + for (WordData dta : stemmer.lookup(word)) + { + guard += dta.getStem().length(); + guard += dta.getTag().length(); + } + } + this.guard = guard; + } +} diff --git a/morfologik-polish/src/test/java/morfologik/stemming/PolishMorfologikStemmerTest.java b/morfologik-polish/src/test/java/morfologik/stemming/PolishMorfologikStemmerTest.java new file mode 100644 index 0000000..2b1805b --- /dev/null +++ b/morfologik-polish/src/test/java/morfologik/stemming/PolishMorfologikStemmerTest.java @@ -0,0 +1,141 @@ +package morfologik.stemming; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.*; + +import org.junit.Ignore; +import org.junit.Test; + +/* + * + */ +public class PolishMorfologikStemmerTest { + /* */ + @Test + public void testLexemes() throws IOException { + PolishStemmer s = new PolishStemmer(); + + assertEquals("żywotopisarstwo", stem(s, "żywotopisarstwie")[0]); + assertEquals("abradować", stem(s, "abradowałoby")[0]); + + assertArrayEquals(new String[] { "żywotopisarstwo", "subst:sg:loc:n2" }, + stem(s, "żywotopisarstwie")); + 
assertArrayEquals(new String[] { "bazia", "subst:pl:inst:f" }, stem(s, + "baziami")); + + // This word is not in the dictionary. + assertNoStemFor(s, "martygalski"); + } + + /* */ + @Test + @Ignore + public void listUniqueTags() throws IOException { + HashSet forms = new HashSet(); + for (WordData wd : new PolishStemmer()) { + final CharSequence chs = wd.getTag(); + if (chs == null) { + System.err.println("Missing tag for: " + wd.getWord()); + continue; + } + forms.add(chs.toString()); + } + + for (String s : new TreeSet(forms)) { + System.out.println(s); + } + } + + /* */ + @Test + public void testWordDataFields() throws IOException { + final IStemmer s = new PolishStemmer(); + + final String word = "liga"; + final List response = s.lookup(word); + assertEquals(2, response.size()); + + final HashSet stems = new HashSet(); + final HashSet tags = new HashSet(); + for (WordData wd : response) { + stems.add(wd.getStem().toString()); + tags.add(wd.getTag().toString()); + assertSame(word, wd.getWord()); + } + assertTrue(stems.contains("ligać")); + assertTrue(stems.contains("liga")); + assertTrue(tags.contains("subst:sg:nom:f")); + assertTrue(tags.contains("verb:fin:sg:ter:imperf:nonrefl+verb:fin:sg:ter:imperf:refl.nonrefl")); + + // Repeat to make sure we get the same values consistently. + for (WordData wd : response) { + stems.contains(wd.getStem().toString()); + tags.contains(wd.getTag().toString()); + } + + final String ENCODING = "UTF-8"; + + // Run the same consistency check for the returned buffers. + final ByteBuffer temp = ByteBuffer.allocate(100); + for (WordData wd : response) { + // Buffer should be copied. + final ByteBuffer copy = wd.getStemBytes(null); + final String stem = new String(copy.array(), copy.arrayOffset() + + copy.position(), copy.remaining(), ENCODING); + // The buffer should be present in stems set. + assertTrue(stem, stems.contains(stem)); + // Buffer large enough to hold the contents. 
+ temp.clear(); + assertSame(temp, wd.getStemBytes(temp)); + // The copy and the clone should be identical. + assertEquals(0, copy.compareTo(temp)); + } + + for (WordData wd : response) { + // Buffer should be copied. + final ByteBuffer copy = wd.getTagBytes(null); + final String tag = new String(copy.array(), copy.arrayOffset() + + copy.position(), copy.remaining(), ENCODING); + // The buffer should be present in tags set. + assertTrue(tag, tags.contains(tag)); + // Buffer large enough to hold the contents. + temp.clear(); + assertSame(temp, wd.getTagBytes(temp)); + // The copy and the clone should be identical. + assertEquals(0, copy.compareTo(temp)); + } + + for (WordData wd : response) { + // Buffer should be copied. + final ByteBuffer copy = wd.getWordBytes(null); + assertNotNull(copy); + assertEquals(0, copy.compareTo(ByteBuffer.wrap(word + .getBytes(ENCODING)))); + } + } + + /* */ + public static String asString(CharSequence s) { + if (s == null) + return null; + return s.toString(); + } + + /* */ + public static String[] stem(IStemmer s, String word) { + ArrayList result = new ArrayList(); + for (WordData wd : s.lookup(word)) { + result.add(asString(wd.getStem())); + result.add(asString(wd.getTag())); + } + return result.toArray(new String[result.size()]); + } + + /* */ + public static void assertNoStemFor(IStemmer s, String word) { + assertArrayEquals(new String[] {}, stem(s, word)); + } +} diff --git a/morfologik-speller/pom.xml b/morfologik-speller/pom.xml new file mode 100644 index 0000000..ac6ecf2 --- /dev/null +++ b/morfologik-speller/pom.xml @@ -0,0 +1,58 @@ + + + + + 4.0.0 + + + org.carrot2 + morfologik-parent + 1.9.0 + ../pom.xml + + + morfologik-speller + jar + + Morfologik Speller + Morfologik Speller + + + + org.carrot2 + morfologik-stemming + ${project.version} + + + + junit + junit + test + + + + org.easytesting + fest-assert-core + test + + + + + install + + + + org.apache.maven.plugins + maven-jar-plugin + + + + true + + + + + + + diff --git 
a/morfologik-speller/src/main/java/morfologik/speller/HMatrix.java b/morfologik-speller/src/main/java/morfologik/speller/HMatrix.java new file mode 100644 index 0000000..848f24e --- /dev/null +++ b/morfologik-speller/src/main/java/morfologik/speller/HMatrix.java @@ -0,0 +1,100 @@ +package morfologik.speller; + +/** + * Keeps track of already computed values of edit distance.
+ * Remarks: To save space, the matrix is kept in a vector. + */ +public class HMatrix { + private int[] p; /* the vector */ + private int rowLength; /* row length of matrix */ + int columnHeight; /* column height of matrix */ + int editDistance; /* edit distance */ + + /** + * Allocates memory and initializes matrix (constructor). + * + * @param distance (int) max edit distance allowed for + * candidates; + * @param maxLength (int) max length of words. + * + * Remarks: See Oflazer. To save space, the matrix is + * stored as a vector. To save time, additional rows and + * columns are added. They are initialized to their distance in + * the matrix, so that no bound checking is necessary during + * access. + */ + public HMatrix(final int distance, final int maxLength) { + rowLength = maxLength + 2; + columnHeight = 2 * distance + 3; + editDistance = distance; + final int size = rowLength * columnHeight; + p = new int[size]; + // Initialize edges of the diagonal band to distance + 1 (i.e. + // distance too big) + for (int i = 0; i < rowLength - distance - 1; i++) { + p[i] = distance + 1; // H(distance + j, j) = distance + 1 + p[size - i - 1] = distance + 1; // H(i, distance + i) = distance + // + 1 + } + // Initialize items H(i,j) with at least one index equal to zero to + // |i - j| + for (int j = 0; j < 2 * distance + 1; j++) { + p[j * rowLength] = distance + 1 - j; // H(i=0..distance+1,0)=i + // FIXME: fordistance == 2 we exceed the array size here. + // there's a bug in spell.cc, Jan Daciuk has been notified about it. + p[Math.min(p.length - 1, (j + distance + 1) * rowLength + j)] = j; // H(0,j=0..distance+1)=j + } + } + + /** + * Provide an item of hMatrix indexed by indices. + * + * @param i + * - (int) row number; + * @param j + * - (int) column number. + * @return Item H[i][j]
+ * Remarks: H matrix is really simulated. What is needed is only + * 2 * edit_distance + 1 wide band around the diagonal. In fact + * this diagonal has been pushed up to the upper border of the + * matrix. + * + * The matrix in the vector looks likes this: + * + *
+	 * 	    +---------------------+
+	 * 	0   |#####################| j=i-e-1
+	 * 	1   |                     | j=i-e
+	 * 	    :                     :
+	 * 	e+1 |                     | j=i-1
+	 * 	    +---------------------+
+	 * 	e+2 |                     | j=i
+	 * 	    +---------------------+
+	 * 	e+3 |                     | j=i+1
+	 * 	    :                     :
+	 * 	2e+2|                     | j=i+e
+	 * 	2e+3|#####################| j=i+e+1
+	 * 	    +---------------------+
+	 * 
+ */ + public int get(final int i, final int j) { + return p[(j - i + editDistance + 1) * rowLength + j]; + } + + /** + * Set an item in hMatrix. + * + * @param i + * - (int) row number; + * @param j + * - (int) column number; + * @param val + * - (int) value to put there. + * + * No checking for i & j is done. They must be correct. + */ + public void set(final int i, final int j, final int val) { + p[(j - i + editDistance + 1) * rowLength + j] = val; + } + +} \ No newline at end of file diff --git a/morfologik-speller/src/main/java/morfologik/speller/Speller.java b/morfologik-speller/src/main/java/morfologik/speller/Speller.java new file mode 100644 index 0000000..f4ee083 --- /dev/null +++ b/morfologik-speller/src/main/java/morfologik/speller/Speller.java @@ -0,0 +1,920 @@ +package morfologik.speller; + +import static morfologik.fsa.MatchResult.EXACT_MATCH; +import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.text.Normalizer; +import java.text.Normalizer.Form; +import java.util.*; + +import morfologik.fsa.FSA; +import morfologik.fsa.FSAFinalStatesIterator; +import morfologik.fsa.FSATraversal; +import morfologik.fsa.MatchResult; +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryMetadata; +import morfologik.util.BufferUtils; + +/** + * Finds spelling suggestions. Implements + * K. Oflazer's algorithm. + * See Jan Daciuk's s_fsa package. + */ +public class Speller { + + /** + * Maximum length of the word to be checked. 
+ */ + public static final int MAX_WORD_LENGTH = 120; + static final int FREQ_RANGES = 'Z' - 'A' + 1; + static final int FIRST_RANGE_CODE = 'A'; // less frequent words + + //FIXME: this is an upper limit for replacement searches, we need + //proper tree traversal instead of generation of all possible candidates + static final int UPPER_SEARCH_LIMIT = 15; + private static final int MIN_WORD_LENGTH = 4; + private static final int MAX_RECURSION_LEVEL = 6; + + private final int editDistance; + private int effectEditDistance; // effective edit distance + + private final HMatrix hMatrix; + + private char[] candidate; /* current replacement */ + private int candLen; + private int wordLen; /* length of word being processed */ + private char[] wordProcessed; /* word being processed */ + + private Map> replacementsAnyToOne = new HashMap>(); + private Map> replacementsAnyToTwo = new HashMap>(); + private Map> replacementsTheRest = new HashMap>(); + + /** + * List of candidate strings, including same additional data such as + * edit distance from the original word. + */ + private final List candidates = new ArrayList(); + + private boolean containsSeparators = true; + + /** + * Internal reusable buffer for encoding words into byte arrays using + * {@link #encoder}. + */ + private ByteBuffer byteBuffer = ByteBuffer.allocate(MAX_WORD_LENGTH); + + /** + * Internal reusable buffer for encoding words into byte arrays using + * {@link #encoder}. + */ + private CharBuffer charBuffer = CharBuffer.allocate(MAX_WORD_LENGTH); + + /** + * Reusable match result. + */ + private final MatchResult matchResult = new MatchResult(); + + /** + * Features of the compiled dictionary. + * + * @see DictionaryMetadata + */ + private final DictionaryMetadata dictionaryMetadata; + + /** + * Charset encoder for the FSA. + */ + private final CharsetEncoder encoder; + + /** + * Charset decoder for the FSA. + */ + private final CharsetDecoder decoder; + + /** An FSA used for lookups. 
*/ + private final FSATraversal matcher; + + /** FSA's root node. */ + private final int rootNode; + + /** + * The FSA we are using. + */ + private final FSA fsa; + + /** An iterator for walking along the final states of {@link #fsa}. */ + private final FSAFinalStatesIterator finalStatesIterator; + + public Speller(final Dictionary dictionary) { + this(dictionary, 1); + } + + public Speller(final Dictionary dictionary, final int editDistance) { + this.editDistance = editDistance; + hMatrix = new HMatrix(editDistance, MAX_WORD_LENGTH); + + this.dictionaryMetadata = dictionary.metadata; + this.rootNode = dictionary.fsa.getRootNode(); + this.fsa = dictionary.fsa; + this.matcher = new FSATraversal(fsa); + this.finalStatesIterator = new FSAFinalStatesIterator(fsa, rootNode); + + if (rootNode == 0) { + throw new IllegalArgumentException( + "Dictionary must have at least the root node."); + } + + if (dictionaryMetadata == null) { + throw new IllegalArgumentException( + "Dictionary metadata must not be null."); + } + + encoder = dictionaryMetadata.getEncoder(); + decoder = dictionaryMetadata.getDecoder(); + + // Multibyte separator will result in an exception here. 
+ dictionaryMetadata.getSeparatorAsChar(); + + this.createReplacementsMaps(); + } + + private void createReplacementsMaps() { + for (Map.Entry> entry : dictionaryMetadata + .getReplacementPairs().entrySet()) { + for (String s : entry.getValue()) { + // replacements any to one + // the new key is the target of the replacement pair + if (s.length() == 1) { + if (!replacementsAnyToOne.containsKey(s.charAt(0))) { + List charList = new ArrayList(); + charList.add(entry.getKey().toCharArray()); + replacementsAnyToOne.put(s.charAt(0), charList); + } else { + replacementsAnyToOne.get(s.charAt(0)).add( + entry.getKey().toCharArray()); + } + } + // replacements any to two + // the new key is the target of the replacement pair + else if (s.length() == 2) { + if (!replacementsAnyToTwo.containsKey(s)) { + List charList = new ArrayList(); + charList.add(entry.getKey().toCharArray()); + replacementsAnyToTwo.put(s, charList); + } else { + replacementsAnyToTwo.get(s).add(entry.getKey().toCharArray()); + } + } else { + if (!replacementsTheRest.containsKey(entry.getKey())) { + List charList = new ArrayList(); + charList.add(s); + replacementsTheRest.put(entry.getKey(), charList); + } else { + replacementsTheRest.get(entry.getKey()).add(s); + } + } + } + } + } + + + /** + * Encode a character sequence into a byte buffer, optionally expanding + * buffer. + */ + private ByteBuffer charsToBytes(final CharBuffer chars, ByteBuffer bytes) { + bytes.clear(); + final int maxCapacity = (int) (chars.remaining() * encoder.maxBytesPerChar()); + if (bytes.capacity() <= maxCapacity) { + bytes = ByteBuffer.allocate(maxCapacity); + } + chars.mark(); + encoder.reset(); + if (encoder.encode(chars, bytes, true).isError()) { + // in the case of encoding errors, clear the buffer + bytes.clear(); + } + bytes.flip(); + chars.reset(); + return bytes; + } + + private ByteBuffer charSequenceToBytes(final CharSequence word) { + // Encode word characters into bytes in the same encoding as the FSA's. 
+ charBuffer.clear(); + charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length()); + for (int i = 0; i < word.length(); i++) { + final char chr = word.charAt(i); + charBuffer.put(chr); + } + charBuffer.flip(); + byteBuffer = charsToBytes(charBuffer, byteBuffer); + return byteBuffer; + } + + /** + * Checks whether the word is misspelled, by performing a series of checks according to + * properties of the dictionary. + * + * If the flag fsa.dict.speller.ignore-punctuation is set, then all non-alphabetic + * characters are considered to be correctly spelled. + * + * If the flag fsa.dict.speller.ignore-numbers is set, then all words containing decimal + * digits are considered to be correctly spelled. + * + * If the flag fsa.dict.speller.ignore-camel-case is set, then all CamelCase words are + * considered to be correctly spelled. + * + * If the flag fsa.dict.speller.ignore-all-uppercase is set, then all alphabetic words composed + * of only uppercase characters are considered to be correctly spelled. + * + * Otherwise, the word is checked in the dictionary. If the test fails, and the dictionary does not + * perform any case conversions (as set by fsa.dict.speller.convert-case flag), then the method + * returns false. In case of case conversions, it is checked whether a non-mixed case word is found in its + * lowercase version in the dictionary, and for all-uppercase words, whether the word is found in the dictionary + * with the initial uppercase letter. 
+ * + * @param word - the word to be checked + * @return true if the word is misspelled + **/ + public boolean isMisspelled(final String word) { + // dictionaries usually do not contain punctuation + String wordToCheck = word; + if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { + wordToCheck = Dictionary.convertText(word, + dictionaryMetadata.getInputConversionPairs()).toString(); + } + boolean isAlphabetic = wordToCheck.length() != 1 || isAlphabetic(wordToCheck.charAt(0)); + return wordToCheck.length() > 0 + && (!dictionaryMetadata.isIgnoringPunctuation() || isAlphabetic) + && (!dictionaryMetadata.isIgnoringNumbers() || containsNoDigit(wordToCheck)) + && !(dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(wordToCheck)) + && !(dictionaryMetadata.isIgnoringAllUppercase() && isAlphabetic && isAllUppercase(wordToCheck)) + && !isInDictionary(wordToCheck) + && (!dictionaryMetadata.isConvertingCase() || + !(!isMixedCase(wordToCheck) && + (isInDictionary(wordToCheck.toLowerCase(dictionaryMetadata.getLocale())) + || isAllUppercase(wordToCheck) && isInDictionary(initialUppercase(wordToCheck))))); + } + + private CharSequence initialUppercase(final String wordToCheck) { + return wordToCheck.substring(0, 1) + + wordToCheck.substring(1). + toLowerCase(dictionaryMetadata.getLocale()); + } + + /** + * Test whether the word is found in the dictionary. + * @param word the word to be tested + * @return True if it is found. + */ + public boolean isInDictionary(final CharSequence word) { + byteBuffer = charSequenceToBytes(word); + + // Try to find a partial match in the dictionary. 
+ final MatchResult match = matcher.match(matchResult, + byteBuffer.array(), 0, byteBuffer.remaining(), rootNode); + + if (match.kind == EXACT_MATCH) { + containsSeparators = false; + return true; + } + + return containsSeparators + && match.kind == SEQUENCE_IS_A_PREFIX + && byteBuffer.remaining() > 0 + && fsa.getArc(match.node, dictionaryMetadata.getSeparator()) != 0; + } + + /** + * Get the frequency value for a word form. + * It is taken from the first entry with this word form. + * @param word the word to be tested + * @return frequency value in range: 0..FREQ_RANGE-1 (0: less frequent). + */ + + public int getFrequency(final CharSequence word) { + if (!dictionaryMetadata.isFrequencyIncluded()) { + return 0; + } + final byte separator = dictionaryMetadata.getSeparator(); + byteBuffer = charSequenceToBytes(word); + final MatchResult match = matcher.match(matchResult, byteBuffer.array(), 0, + byteBuffer.remaining(), rootNode); + if (match.kind == SEQUENCE_IS_A_PREFIX) { + final int arc = fsa.getArc(match.node, separator); + if (arc != 0 && !fsa.isArcFinal(arc)) { + finalStatesIterator.restartFrom(fsa.getEndNode(arc)); + if (finalStatesIterator.hasNext()) { + final ByteBuffer bb = finalStatesIterator.next(); + final byte[] ba = bb.array(); + final int bbSize = bb.remaining(); + //the last byte contains the frequency after a separator + return ba[bbSize - 1] - FIRST_RANGE_CODE; + } + } + } + return 0; + } + + /** + * Propose suggestions for misspelled run-on words. This algorithm is inspired by + * spell.cc in s_fsa package by Jan Daciuk. + * + * @param original The original misspelled word. + * @return The list of suggested pairs, as space-concatenated strings. 
+ */ + public List replaceRunOnWords(final String original) { + final List candidates = new ArrayList(); + if (!isInDictionary(Dictionary.convertText(original, + dictionaryMetadata.getInputConversionPairs()).toString()) + && dictionaryMetadata.isSupportingRunOnWords()) { + for (int i = 2; i < original.length(); i++) { + // chop from left to right + final CharSequence firstCh = original.subSequence(0, i); + if (isInDictionary(firstCh) && + isInDictionary(original.subSequence(i, original.length()))) { + if (!dictionaryMetadata.getOutputConversionPairs().isEmpty()) { + candidates.add(firstCh + " " + original.subSequence(i, original.length())); + } else { + candidates.add( + Dictionary.convertText(firstCh + " " + original.subSequence(i, original.length()), + dictionaryMetadata.getOutputConversionPairs()).toString() + ); + } + } + } + } + return candidates; + } + + /** + * Find suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa + * package, spell.cc for further explanation. + * + * @param w + * The original misspelled word. + * @return A list of suggested replacements. 
+ * @throws CharacterCodingException + */ + public List findReplacements(final String w) + throws CharacterCodingException { + String word = w; + if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { + word = Dictionary.convertText(w, + dictionaryMetadata.getInputConversionPairs()).toString(); + } + candidates.clear(); + if (word.length() > 0 && word.length() < MAX_WORD_LENGTH && !isInDictionary(word)) { + List wordsToCheck = new ArrayList(); + if (replacementsTheRest != null + && word.length() > MIN_WORD_LENGTH) { + for (final String wordChecked : getAllReplacements(word, 0, 0)) { + boolean found = false; + if (isInDictionary(wordChecked)) { + candidates.add(new CandidateData(wordChecked, 0)); + found = true; + } else if (dictionaryMetadata.isConvertingCase()) { + String lowerWord = wordChecked.toLowerCase(dictionaryMetadata.getLocale()); + String upperWord = wordChecked.toUpperCase(dictionaryMetadata.getLocale()); + if (isInDictionary(lowerWord)) { + //add the word as it is in the dictionary, not mixed-case versions of it + candidates.add(new CandidateData(lowerWord, 0)); + found = true; + } + if (isInDictionary(upperWord)) { + candidates.add(new CandidateData(upperWord, 0)); + found = true; + } + if (lowerWord.length() > 1) { + String firstupperWord = Character.toUpperCase(lowerWord.charAt(0)) + + lowerWord.substring(1); + if (isInDictionary(firstupperWord)) { + candidates.add(new CandidateData(firstupperWord, 0)); + found = true; + } + } + } + if (!found) { + wordsToCheck.add(wordChecked); + } + } + } else { + wordsToCheck.add(word); + } + + //If at least one candidate was found with the replacement pairs (which are usual errors), + //probably there is no need for more candidates + if (candidates.isEmpty()) { + int i = 1; + for (final String wordChecked : wordsToCheck) { + i++; + if (i > UPPER_SEARCH_LIMIT) { // for performance reasons, do not search too deeply + break; + } + wordProcessed = wordChecked.toCharArray(); + wordLen = wordProcessed.length; 
+ if (wordLen < MIN_WORD_LENGTH && i > 2) { // three-letter replacements make little sense anyway + break; + } + candidate = new char[MAX_WORD_LENGTH]; + candLen = candidate.length; + effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance; + charBuffer = BufferUtils.ensureCapacity(charBuffer, MAX_WORD_LENGTH); + byteBuffer = BufferUtils.ensureCapacity(byteBuffer, MAX_WORD_LENGTH); + charBuffer.clear(); + byteBuffer.clear(); + final byte[] prevBytes = new byte[0]; + findRepl(0, fsa.getRootNode(), prevBytes, 0, 0); + } + } + } + + Collections.sort(candidates); + + // Use a linked set to avoid duplicates and preserve the ordering of candidates. + final Set candStringSet = new LinkedHashSet(); + for (final CandidateData cd : candidates) { + candStringSet.add(Dictionary.convertText(cd.getWord(), + dictionaryMetadata.getOutputConversionPairs()).toString()); + } + final List candStringList = new ArrayList(candStringSet.size()); + candStringList.addAll(candStringSet); + return candStringList; + } + + private void findRepl(final int depth, final int node, final byte[] prevBytes, + final int wordIndex, final int candIndex) { + // char separatorChar = dictionaryMetadata.getSeparatorAsChar(); + int dist = 0; + for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { + byteBuffer = BufferUtils.ensureCapacity(byteBuffer, prevBytes.length + 1); + byteBuffer.clear(); + byteBuffer.put(prevBytes); + byteBuffer.put(fsa.getArcLabel(arc)); + final int bufPos = byteBuffer.position(); + byteBuffer.flip(); + decoder.reset(); + final CoderResult c = decoder.decode(byteBuffer, charBuffer, true); + if (c.isMalformed()) { // assume that only valid + // encodings are there + final byte[] prev = new byte[bufPos]; + byteBuffer.position(0); + byteBuffer.get(prev); + if (!fsa.isArcTerminal(arc)) { + findRepl(depth, fsa.getEndNode(arc), prev, wordIndex, candIndex); // note: depth is not incremented + } + byteBuffer.clear(); + } else if (!c.isError()) { // 
unmappable characters are silently discarded + charBuffer.flip(); + candidate[candIndex] = charBuffer.get(); + charBuffer.clear(); + byteBuffer.clear(); + + int lengthReplacement; + // replacement "any to two" + if ((lengthReplacement = matchAnyToTwo(wordIndex, candIndex)) > 0) { + if (isEndOfCandidate(arc, wordIndex)) { //the replacement takes place at the end of the candidate + if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)) > 0) { // there is an extra letter in the word after the replacement + dist++; + } + addCandidate(candIndex, dist); + + } + if (isArcNotTerminal(arc, candIndex)) { + int x = hMatrix.get(depth, depth); + hMatrix.set(depth, depth, hMatrix.get(depth - 1, depth - 1)); + findRepl(Math.max(0, depth), fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement - 1, candIndex + 1); + hMatrix.set(depth, depth, x); + + } + } + //replacement "any to one" + if ((lengthReplacement = matchAnyToOne(wordIndex, candIndex)) > 0) { + if (isEndOfCandidate(arc, wordIndex)) { //the replacement takes place at the end of the candidate + if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)) > 0) { // there is an extra letter in the word after the replacement + dist++; + } + addCandidate(candIndex, dist); + } + if (isArcNotTerminal(arc,candIndex)) { + findRepl(depth, fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement, candIndex + 1); + } + } + //general + if (cuted(depth, wordIndex, candIndex) <= effectEditDistance) { + if ((isEndOfCandidate(arc, wordIndex)) + && (dist = ed(wordLen - 1 - (wordIndex - depth), depth, wordLen - 1, candIndex)) + <= effectEditDistance) { + addCandidate(candIndex, dist); + } + if (isArcNotTerminal(arc,candIndex)) { + findRepl(depth + 1, fsa.getEndNode(arc), new byte[0], wordIndex + 1, candIndex + 1); + } + } + + } + } + } + + private boolean isArcNotTerminal(final int arc, final int candIndex) { + return !fsa.isArcTerminal(arc) + && !(containsSeparators && candidate[candIndex] == 
dictionaryMetadata.getSeparatorAsChar()); + } + + private boolean isEndOfCandidate(final int arc, final int wordIndex) { + return (fsa.isArcFinal(arc) || isBeforeSeparator(arc)) + //candidate has proper length + && (Math.abs(wordLen - 1 - (wordIndex)) <= effectEditDistance); + } + + private boolean isBeforeSeparator(final int arc) { + if (containsSeparators) { + final int arc1 = fsa.getArc(fsa.getEndNode(arc), dictionaryMetadata.getSeparator()); + return arc1 != 0 && !fsa.isArcTerminal(arc1); + } + return false; + } + + private void addCandidate(final int depth, final int dist) { + candidates.add(new CandidateData(String.valueOf(candidate, 0, depth + 1), dist)); + } + + /** + * Calculates edit distance. + * + * @param i length of first word (here: misspelled) - 1; + * @param j length of second word (here: candidate) - 1. + * @return Edit distance between the two words. Remarks: See Oflazer. + */ + public int ed(final int i, final int j, + final int wordIndex, final int candIndex) { + int result; + int a, b, c; + + if (areEqual(wordProcessed[wordIndex], candidate[candIndex])) { + // last characters are the same + result = hMatrix.get(i, j); + } else if (wordIndex > 0 && candIndex > 0 && wordProcessed[wordIndex] == candidate[candIndex - 1] + && wordProcessed[wordIndex - 1] == candidate[candIndex]) { + // last two characters are transposed + a = hMatrix.get(i - 1, j - 1); // transposition, e.g. ababab, ababba + b = hMatrix.get(i + 1, j); // deletion, e.g. abab, aba + c = hMatrix.get(i, j + 1); // insertion e.g. aba, abab + result = 1 + min(a, b, c); + } else { + // otherwise + a = hMatrix.get(i, j); // replacement, e.g. ababa, ababb + b = hMatrix.get(i + 1, j); // deletion, e.g. ab, a + c = hMatrix.get(i, j + 1); // insertion e.g. 
a, ab + result = 1 + min(a, b, c); + } + + hMatrix.set(i + 1, j + 1, result); + return result; + } + + // by Jaume Ortola + private boolean areEqual(final char x, final char y) { + if (x == y) { + return true; + } + if (dictionaryMetadata.getEquivalentChars() != null && + dictionaryMetadata.getEquivalentChars().containsKey(x) + && dictionaryMetadata.getEquivalentChars().get(x).contains(y)) { + return true; + } + if (dictionaryMetadata.isIgnoringDiacritics()) { + String xn = Normalizer.normalize(Character.toString(x), Form.NFD); + String yn = Normalizer.normalize(Character.toString(y), Form.NFD); + if (xn.charAt(0) == yn.charAt(0)) { // avoid case conversion, if possible + return true; + } + if (dictionaryMetadata.isConvertingCase()) { + //again case conversion only when needed -- we + // do not need String.lowercase because we only check + // single characters, so a cheaper method is enough + if (Character.isLetter(xn.charAt(0))){ + boolean testNeeded = Character.isLowerCase(xn.charAt(0)) + != Character.isLowerCase(yn.charAt(0)); + if (testNeeded) { + return Character.toLowerCase(xn.charAt(0)) == + Character.toLowerCase(yn.charAt(0)); + } + } + } + return xn.charAt(0) == yn.charAt(0); + } + return false; + } + + /** + * Calculates cut-off edit distance. + * + * @param depth current length of candidates. + * @return Cut-off edit distance. Remarks: See Oflazer. 
+ */ + + public int cuted(final int depth, final int wordIndex, final int candIndex) { + final int l = Math.max(0, depth - effectEditDistance); // min chars from word to consider - 1 + final int u = Math.min(wordLen - 1 - (wordIndex - depth), depth + effectEditDistance); // max chars from word to + // consider - 1 + int minEd = effectEditDistance + 1; // what is to be computed + int wi = wordIndex + l - depth; + int d; + + for (int i = l; i <= u; i++, wi++) { + if ((d = ed(i, depth, wi, candIndex)) < minEd) { + minEd = d; + } + } + return minEd; + } + + // Match the last letter of the candidate against two or more letters of the word. + private int matchAnyToOne(final int wordIndex, final int candIndex) { + if (replacementsAnyToOne.containsKey(candidate[candIndex])) { + for (final char[] rep : replacementsAnyToOne.get(candidate[candIndex])) { + int i = 0; + while (i < rep.length && (wordIndex + i) < wordLen + && rep[i] == wordProcessed[wordIndex + i]) { + i++; + } + if (i==rep.length) { + return i; + } + } + } + return 0; + } + + private int matchAnyToTwo(final int wordIndex, final int candIndex) { + if (candIndex > 0 && candIndex < candidate.length + && wordIndex > 0) { + char[] twoChar = {candidate[candIndex - 1],candidate[candIndex]}; + String sTwoChar= new String(twoChar); + if (replacementsAnyToTwo.containsKey(sTwoChar)) { + for (final char[] rep : replacementsAnyToTwo.get(sTwoChar)) { + if (rep.length == 2 && wordIndex < wordLen + && candidate[candIndex - 1] == wordProcessed[wordIndex - 1] + && candidate[candIndex] == wordProcessed[wordIndex]) { + return 0; //unnecessary replacements + } + int i = 0; + while (i < rep.length && (wordIndex - 1 + i) < wordLen + && rep[i] == wordProcessed[wordIndex - 1 + i] ) { + i++; + } + if (i==rep.length) { + return i; + } + } + } + } + return 0; + } + + + private static int min(final int a, final int b, final int c) { + return Math.min(a, Math.min(b, c)); + } + + /** + * Copy-paste of Character.isAlphabetic() (needed as we 
require only 1.6) + * + * @param codePoint The input character. + * @return True if the character is a Unicode alphabetic character. + */ + static boolean isAlphabetic(final int codePoint) { + return ((1 << Character.UPPERCASE_LETTER + | 1 << Character.LOWERCASE_LETTER + | 1 << Character.TITLECASE_LETTER + | 1 << Character.MODIFIER_LETTER + | 1 << Character.OTHER_LETTER + | 1 << Character.LETTER_NUMBER) >> Character.getType(codePoint) & 1) != 0; + } + + /** + * Checks whether a string contains a digit. Used for ignoring words with + * numbers + * @param s Word to be checked. + * @return True if there is a digit inside the word. + */ + static boolean containsNoDigit(final String s) { + for (int k = 0; k < s.length(); k++) { + if (Character.isDigit(s.charAt(k))) { + return false; + } + } + return true; + } + + /** + * Returns true if str is made up of all-uppercase characters + * (ignoring characters for which no upper-/lowercase distinction exists). + */ + boolean isAllUppercase(final String str) { + for(int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if(Character.isLetter(c) && Character.isLowerCase(c)) { + return false; + } + } + return true; + } + + /** + * Returns true if str is made up of all-lowercase characters + * (ignoring characters for which no upper-/lowercase distinction exists). + */ + boolean isNotAllLowercase(final String str) { + for(int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if(Character.isLetter(c) && !Character.isLowerCase(c)) { + return true; + } + } + return false; + } + + /** + * @param str input string + */ + boolean isNotCapitalizedWord(final String str) { + if (isNotEmpty(str) && Character.isUpperCase(str.charAt(0))) { + for (int i = 1; i < str.length(); i++) { + char c = str.charAt(i); + if (Character.isLetter(c) && !Character.isLowerCase(c)) { + return true; + } + } + return false; + } + return true; + } + + /** + * Helper method to replace calls to "".equals(). 
+ * + * @param str + * String to check + * @return true if string is empty OR null + */ + static boolean isNotEmpty(final String str) { + return str != null && str.length() != 0; + } + + /** + * @param str input str + * @return Returns true if str is MixedCase. + */ + boolean isMixedCase(final String str) { + return !isAllUppercase(str) + && isNotCapitalizedWord(str) + && isNotAllLowercase(str); + } + + /** + * @return Returns true if str is CamelCase. + */ + public boolean isCamelCase(final String str) { + return isNotEmpty(str) + && !isAllUppercase(str) + && isNotCapitalizedWord(str) + && Character.isUpperCase(str.charAt(0)) + && (!(str.length() > 1) || Character.isLowerCase(str.charAt(1))) + && isNotAllLowercase(str); + } + + + /** + * Used to determine whether the dictionary supports case conversions. + * @return boolean value that answers this question in a deep and meaningful way. + * + * @since 1.9 + * + */ + public boolean convertsCase() { + return dictionaryMetadata.isConvertingCase(); + } + + /** + * @param str The string to find the replacements for. + * @param fromIndex The index from which replacements are found. + * @param level The recursion level. The search stops if level is > MAX_RECURSION_LEVEL. 
+ * @return A list of all possible replacements of a {#link str} given string + */ + public List getAllReplacements(final String str, final int fromIndex, final int level) { + List replaced = new ArrayList(); + if (level > MAX_RECURSION_LEVEL) { // Stop searching at some point + replaced.add(str); + return replaced; + } + StringBuilder sb = new StringBuilder(); + sb.append(str); + int index = MAX_WORD_LENGTH; + String key = ""; + int keyLength = 0; + boolean found = false; + // find first possible replacement after fromIndex position + for (final String auxKey : replacementsTheRest.keySet()) { + int auxIndex = sb.indexOf(auxKey, fromIndex); + if (auxIndex > -1 && auxIndex < index && + !(auxKey.length() < keyLength)) { //select the longest possible key + index = auxIndex; + key = auxKey; + keyLength = auxKey.length(); + } + } + if (index < MAX_WORD_LENGTH) { + for (final String rep : replacementsTheRest.get(key)) { + // start a branch without replacement (only once per key) + if (!found) { + replaced.addAll(getAllReplacements(str, index + key.length(), + level + 1)); + found = true; + } + // avoid unnecessary replacements (ex. don't replace L by L·L when L·L already present) + int ind = sb.indexOf(rep, fromIndex - rep.length() + 1); + if (rep.length() > key.length() && ind > -1 + && (ind == index || ind == index - rep.length() + 1)) { + continue; + } + // start a branch with replacement + sb.replace(index, index + key.length(), rep); + replaced.addAll(getAllReplacements(sb.toString(), index + rep.length(), + level + 1)); + sb.setLength(0); + sb.append(str); + } + } + if (!found) { + replaced.add(sb.toString()); + } + return replaced; + } + + + /** + * Sets up the word and candidate. Used only to test the edit distance in + * JUnit tests. 
+ * + * @param word the first word + * @param candidate the second word used for edit distance calculation + */ + void setWordAndCandidate(final String word, final String candidate) { + wordProcessed = word.toCharArray(); + wordLen = wordProcessed.length; + this.candidate = candidate.toCharArray(); + candLen = this.candidate.length; + effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance; + } + + public final int getWordLen() { + return wordLen; + } + + public final int getCandLen() { + return candLen; + } + + public final int getEffectiveED() { + return effectEditDistance; + } + + /** + * Used to sort candidates according to edit distance, and possibly + * according to their frequency in the future. + * + */ + private class CandidateData implements Comparable { + private final String word; + private final int distance; + + CandidateData(final String word, final int distance) { + this.word = word; + this.distance = distance * FREQ_RANGES + FREQ_RANGES - getFrequency(word) - 1; + } + + final String getWord() { + return word; + } + + final int getDistance() { + return distance; + } + + @Override + public int compareTo(final CandidateData cd) { + // Assume no overflow. + return cd.getDistance() > this.distance ? -1 : + cd.getDistance() == this.distance ? 0 : 1; + } + } +} diff --git a/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java b/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java new file mode 100644 index 0000000..38aa76d --- /dev/null +++ b/morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java @@ -0,0 +1,21 @@ +package morfologik.speller; + +import static org.junit.Assert.*; + +import morfologik.speller.HMatrix; + +import org.junit.Test; + +public class HMatrixTest { + + private static final int MAX_WORD_LENGTH = 120; + + @Test + public void stressTestInit() { + for (int i = 0; i < 10; i++) { // test if we don't get beyond array limits etc. 
+ HMatrix H = new HMatrix(i, MAX_WORD_LENGTH); + assertEquals(0, H.get(1, 1)); + } + } + +} diff --git a/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java b/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java new file mode 100644 index 0000000..48ed2c1 --- /dev/null +++ b/morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java @@ -0,0 +1,272 @@ +package morfologik.speller; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.net.URL; +import java.util.Arrays; +import java.util.List; + +import morfologik.stemming.Dictionary; + +import org.fest.assertions.api.Assertions; +import org.junit.BeforeClass; +import org.junit.Test; + +public class SpellerTest { + private static Dictionary dictionary; + + @BeforeClass + public static void setup() throws Exception { + final URL url = SpellerTest.class.getResource("slownik.dict"); + dictionary = Dictionary.read(url); + } + + /* + @Test + public void testAbka() throws Exception { + final Speller spell = new Speller(dictionary, 2); + System.out.println("Replacements:"); + for (String s : spell.findReplacements("abka")) { + System.out.println(s); + } + } + */ + + @Test + public void testRunonWords() throws IOException { + final Speller spell = new Speller(dictionary); + Assertions.assertThat(spell.replaceRunOnWords("abaka")).isEmpty(); + Assertions.assertThat(spell.replaceRunOnWords("abakaabace")).contains("abaka abace"); + + // Test on an morphological dictionary - should work as well + final URL url1 = getClass().getResource("test-infix.dict"); + final Speller spell1 = new Speller(Dictionary.read(url1)); + assertTrue(spell1.replaceRunOnWords("Rzekunia").isEmpty()); + assertTrue(spell1.replaceRunOnWords("RzekuniaRzeczypospolitej").contains("Rzekunia Rzeczypospolitej")); + assertTrue(spell1.replaceRunOnWords("RzekuniaRze").isEmpty()); //Rze is not found but is a prefix + } + + @Test + public void 
testIsInDictionary() throws IOException { + // Test on an morphological dictionary, including separators + final URL url1 = getClass().getResource("test-infix.dict"); + final Speller spell1 = new Speller(Dictionary.read(url1)); + assertTrue(spell1.isInDictionary("Rzekunia")); + assertTrue(!spell1.isInDictionary("Rzekunia+")); + assertTrue(!spell1.isInDictionary("Rzekunia+aaa")); + // test UTF-8 dictionary + final URL url = getClass().getResource("test-utf-spell.dict"); + final Speller spell = new Speller(Dictionary.read(url)); + assertTrue(spell.isInDictionary("jaźń")); + assertTrue(spell.isInDictionary("zażółć")); + assertTrue(spell.isInDictionary("żółwiową")); + assertTrue(spell.isInDictionary("ćwikła")); + assertTrue(spell.isInDictionary("Żebrowski")); + assertTrue(spell.isInDictionary("Święto")); + assertTrue(spell.isInDictionary("Świerczewski")); + assertTrue(spell.isInDictionary("abc")); + } + + @Test + public void testFindReplacements() throws IOException { + final Speller spell = new Speller(dictionary, 1); + assertTrue(spell.findReplacements("abka").contains("abak")); + //check if we get only dictionary words... 
+ List reps = spell.findReplacements("bak"); + for (final String word: reps) { + assertTrue(spell.isInDictionary(word)); + } + assertTrue(spell.findReplacements("abka~~").isEmpty()); // 2 characters more -> edit distance too large + assertTrue(!spell.findReplacements("Rezkunia").contains("Rzekunia")); + + final URL url1 = getClass().getResource("test-infix.dict"); + final Speller spell1 = new Speller(Dictionary.read(url1)); + assertTrue(spell1.findReplacements("Rezkunia").contains("Rzekunia")); + //diacritics + assertTrue(spell1.findReplacements("Rzękunia").contains("Rzekunia")); + //we should get no candidates for correct words + assertTrue(spell1.isInDictionary("Rzekunia")); + assertTrue(spell1.findReplacements("Rzekunia").isEmpty()); + //and no for things that are too different from the dictionary + assertTrue(spell1.findReplacements("Strefakibica").isEmpty()); + //nothing for nothing + assertTrue(spell1.findReplacements("").isEmpty()); + //nothing for weird characters + assertTrue(spell1.findReplacements("\u0000").isEmpty()); + //nothing for other characters + assertTrue(spell1.findReplacements("«…»").isEmpty()); + //nothing for separator + assertTrue(spell1.findReplacements("+").isEmpty()); + + } + + @Test + public void testFrequencyNonUTFDictionary() throws IOException { + final URL url1 = getClass().getResource("test_freq_iso.dict"); + final Speller spell = new Speller(Dictionary.read(url1)); + assertTrue(spell.isInDictionary("a")); + assertTrue(!spell.isInDictionary("aõh")); //non-encodable in UTF-8 + } + + @Test + public void testFindReplacementsInUTF() throws IOException { + final URL url = getClass().getResource("test-utf-spell.dict"); + final Speller spell = new Speller(Dictionary.read(url)); + assertTrue(spell.findReplacements("gęslą").contains("gęślą")); + assertTrue(spell.findReplacements("ćwikla").contains("ćwikła")); + assertTrue(spell.findReplacements("Swierczewski").contains("Świerczewski")); + 
assertTrue(spell.findReplacements("zółwiową").contains("żółwiową")); + assertTrue(spell.findReplacements("Żebrowsk").contains("Żebrowski")); + assertTrue(spell.findReplacements("święto").contains("Święto")); + //note: no diacritics here, but we still get matches! + assertTrue(spell.findReplacements("gesla").contains("gęślą")); + assertTrue(spell.findReplacements("swieto").contains("Święto")); + assertTrue(spell.findReplacements("zolwiowa").contains("żółwiową")); + //using equivalent characters 'x' = 'ź' + assertTrue(spell.findReplacements("jexn").contains("jaźń")); + // 'u' = 'ó', so the edit distance is still small... + assertTrue(spell.findReplacements("zażulv").contains("zażółć")); + // 'rz' = 'ż', so the edit distance is still small, but with string replacements... + assertTrue(spell.findReplacements("zarzulv").contains("zażółć")); + assertTrue(spell.findReplacements("Rzebrowski").contains("Żebrowski")); + assertTrue(spell.findReplacements("rzółw").contains("żółw")); + assertTrue(spell.findReplacements("Świento").contains("Święto")); + // avoid mixed-case words as suggestions when using replacements ('rz' = 'ż') + assertTrue(spell.findReplacements("zArzółć").get(0).equals("zażółć")); + } + + @Test + public void testFindReplacementsUsingFrequency() throws IOException { + final URL url = getClass().getResource("dict-with-freq.dict"); + final Speller spell = new Speller(Dictionary.read(url)); + + //check if we get only dictionary words... 
+ List reps = spell.findReplacements("jist"); + for (final String word: reps) { + assertTrue(spell.isInDictionary(word)); + } + // get replacements ordered by frequency + assertTrue(reps.get(0).equals("just")); + assertTrue(reps.get(1).equals("list")); + assertTrue(reps.get(2).equals("fist")); + assertTrue(reps.get(3).equals("mist")); + assertTrue(reps.get(4).equals("jest")); + assertTrue(reps.get(5).equals("dist")); + assertTrue(reps.get(6).equals("gist")); + } + + @Test + public void testIsMisspelled() throws IOException { + final URL url = getClass().getResource("test-utf-spell.dict"); + final Speller spell = new Speller(Dictionary.read(url)); + assertTrue(!spell.isMisspelled("Paragraf22")); //ignorujemy liczby + assertTrue(!spell.isMisspelled("!")); //ignorujemy znaki przestankowe + assertTrue(spell.isMisspelled("dziekie")); //test, czy znajdujemy błąd + assertTrue(!spell.isMisspelled("SłowozGarbem")); //ignorujemy słowa w stylu wielbłąda + assertTrue(!spell.isMisspelled("Ćwikła")); //i małe litery + assertTrue(!spell.isMisspelled("TOJESTTEST")); //i wielkie litery + final Speller oldStyleSpell = new Speller(dictionary, 1); + assertTrue(oldStyleSpell.isMisspelled("Paragraf22")); // nie ignorujemy liczby + assertTrue(oldStyleSpell.isMisspelled("!")); //nie ignorujemy znaków przestankowych + // assertTrue(oldStyleSpell.isMisspelled("SłowozGarbem")); //ignorujemy słowa w stylu wielbłąda + assertTrue(oldStyleSpell.isMisspelled("Abaka")); //i małe litery + final URL url1 = getClass().getResource("test-infix.dict"); + final Speller spell1 = new Speller(Dictionary.read(url1)); + assertTrue(!spell1.isMisspelled("Rzekunia")); + assertTrue(spell1.isAllUppercase("RZEKUNIA")); + assertTrue(spell1.isMisspelled("RZEKUNIAA")); // finds a typo here + assertTrue(!spell1.isMisspelled("RZEKUNIA")); // but not here + } + + @Test + public void testCamelCase() { + final Speller spell = new Speller(dictionary, 1); + assertTrue(spell.isCamelCase("CamelCase")); + 
assertTrue(!spell.isCamelCase("Camel")); + assertTrue(!spell.isCamelCase("CAMEL")); + assertTrue(!spell.isCamelCase("camel")); + assertTrue(!spell.isCamelCase("cAmel")); + assertTrue(!spell.isCamelCase("CAmel")); + assertTrue(!spell.isCamelCase("")); + assertTrue(!spell.isCamelCase(null)); + } + + @Test + public void testCapitalizedWord() { + final Speller spell = new Speller(dictionary, 1); + assertTrue(spell.isNotCapitalizedWord("CamelCase")); + assertTrue(!spell.isNotCapitalizedWord("Camel")); + assertTrue(spell.isNotCapitalizedWord("CAMEL")); + assertTrue(spell.isNotCapitalizedWord("camel")); + assertTrue(spell.isNotCapitalizedWord("cAmel")); + assertTrue(spell.isNotCapitalizedWord("CAmel")); + assertTrue(spell.isNotCapitalizedWord("")); + } + + @Test + public void testGetAllReplacements() throws IOException { + final URL url = getClass().getResource("test-utf-spell.dict"); + final Speller spell = new Speller(Dictionary.read(url)); + assertTrue(spell.isMisspelled("rzarzerzarzu")); + assertEquals("[rzarzerzarzu]", + Arrays.toString(spell.getAllReplacements("rzarzerzarzu", 0, 0).toArray())); + } + + @Test + public void testEditDistanceCalculation() throws IOException { + final Speller spell = new Speller(dictionary, 5); + //test examples from Oflazer's paper + assertTrue(getEditDistance(spell, "recoginze", "recognize") == 1); + assertTrue(getEditDistance(spell, "sailn", "failing") == 3); + assertTrue(getEditDistance(spell, "abc", "abcd") == 1); + assertTrue(getEditDistance(spell, "abc", "abcde") == 2); + //test words from fsa_spell output + assertTrue(getEditDistance(spell, "abka", "abaka") == 1); + assertTrue(getEditDistance(spell, "abka", "abakan") == 2); + assertTrue(getEditDistance(spell, "abka", "abaką") == 2); + assertTrue(getEditDistance(spell, "abka", "abaki") == 2); + } + + @Test + public void testCutOffEditDistance() throws IOException { + final Speller spell2 = new Speller(dictionary, 2); //note: threshold = 2 + //test cut edit distance - reprter / 
repo from Oflazer + assertTrue(getCutOffDistance(spell2, "repo", "reprter") == 1); + assertTrue(getCutOffDistance(spell2, "reporter", "reporter") == 0); + } + + private int getCutOffDistance(final Speller spell, final String word, final String candidate) { + // assuming there is no pair-replacement + spell.setWordAndCandidate(word, candidate); + final int [] ced = new int[spell.getCandLen() - spell.getWordLen()]; + for (int i = 0; i < spell.getCandLen() - spell.getWordLen(); i++) { + + ced[i] = spell.cuted(spell.getWordLen() + i, spell.getWordLen() + i, spell.getWordLen() + i); + } + Arrays.sort(ced); + //and the min value... + if (ced.length > 0) { + return ced[0]; + } + return 0; + } + + private int getEditDistance(final Speller spell, final String word, final String candidate) { + // assuming there is no pair-replacement + spell.setWordAndCandidate(word, candidate); + final int maxDistance = spell.getEffectiveED(); + final int candidateLen = spell.getCandLen(); + final int wordLen = spell.getWordLen(); + int ed = 0; + for (int i = 0; i < candidateLen; i++) { + if (spell.cuted(i, i, i) <= maxDistance) { + if (Math.abs(wordLen - 1 - i) <= maxDistance) { + ed = spell.ed(wordLen - 1, i, wordLen - 1, i); + } + } + } + return ed; + } +} \ No newline at end of file diff --git a/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict new file mode 100644 index 0000000..609a267 Binary files /dev/null and b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.dict differ diff --git a/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info new file mode 100644 index 0000000..1203602 --- /dev/null +++ b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info @@ -0,0 +1,15 @@ +# +# Dictionary properties. 
+# + +fsa.dict.separator=+ +fsa.dict.encoding=iso-8859-2 + +fsa.dict.uses-prefixes=false +fsa.dict.uses-infixes=false + +fsa.dict.frequency-included=true + +fsa.dict.speller.locale=en_US +fsa.dict.speller.ignore-diacritics=true +#fsa.dict.speller.replacement-pairs=ninties 1990s, teached taught, rised rose, a ei, ei a, a ey, ey a, ai ie, ie ai, are air, are ear, are eir, air are, air ere, ere air, ere ear, ere eir, ear are, ear air, ear ere, eir are, eir ere, ch te, te ch, ch ti, ti ch, ch tu, tu ch, ch s, s ch, ch k, k ch, f ph, ph f, gh f, f gh, i igh, igh i, i uy, uy i, i ee, ee i, j di, di j, j gg, gg j, j ge, ge j, s ti, ti s, s ci, ci s, k cc, cc k, k qu, qu k, kw qu, o eau, eau o, o ew, ew o, oo ew, ew oo, ew ui, ui ew, oo ui, ui oo, ew u, u ew, oo u, u oo, u oe, oe u, u ieu, ieu u, ue ew, ew ue, uff ough, oo ieu, ieu oo, ier ear, ear ier, ear air, air ear, w qu, qu w, z ss, ss z, shun tion, shun sion, shun cion \ No newline at end of file diff --git a/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt new file mode 100644 index 0000000..a12876c --- /dev/null +++ b/morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt @@ -0,0 +1,21 @@ +ageist+C +deist+G +didst+A +digest+J +direst+E +dist+G +divest+I +fist+J +gist+G +grist+I +heist+I +hist+A +jest+H +jilt+D +joist+F +just+P +licit+F +list+O +mist+J +weest+A +wist+C diff --git a/morfologik-speller/src/test/resources/morfologik/speller/slownik.dict b/morfologik-speller/src/test/resources/morfologik/speller/slownik.dict new file mode 100644 index 0000000..b650702 Binary files /dev/null and b/morfologik-speller/src/test/resources/morfologik/speller/slownik.dict differ diff --git a/morfologik-speller/src/test/resources/morfologik/speller/slownik.info b/morfologik-speller/src/test/resources/morfologik/speller/slownik.info new file mode 100644 index 0000000..25aef99 --- /dev/null +++ 
b/morfologik-speller/src/test/resources/morfologik/speller/slownik.info @@ -0,0 +1,14 @@ +# +# Dictionary properties. +# + +fsa.dict.separator=+ +fsa.dict.encoding=Cp1250 + +fsa.dict.uses-prefixes=false +fsa.dict.uses-infixes=false + +fsa.dict.speller.ignore-diacritics=false +fsa.dict.speller.ignore-numbers=false +fsa.dict.speller.convert-case=false +fsa.dict.speller.ignore-punctuation=false \ No newline at end of file diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict new file mode 100644 index 0000000..cc91f70 Binary files /dev/null and b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.dict differ diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-infix.info b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.info new file mode 100644 index 0000000..9ba1066 --- /dev/null +++ b/morfologik-speller/src/test/resources/morfologik/speller/test-infix.info @@ -0,0 +1,10 @@ +# +# Dictionary properties. +# + +fsa.dict.separator=+ +fsa.dict.encoding=iso-8859-2 + +fsa.dict.uses-prefixes=true +fsa.dict.uses-infixes=true +fsa.dict.speller.ignore-all-uppercase=false \ No newline at end of file diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict new file mode 100644 index 0000000..63ff635 Binary files /dev/null and b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.dict differ diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info new file mode 100644 index 0000000..b13d12f --- /dev/null +++ b/morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info @@ -0,0 +1,15 @@ +# +# Dictionary properties. 
+# UTF-8 encoding or native2ascii has to be used for non-ASCII data. +# + +fsa.dict.separator=+ +fsa.dict.encoding=utf-8 + +fsa.dict.uses-prefixes=false +fsa.dict.uses-infixes=false + +fsa.dict.speller.locale=pl_PL +fsa.dict.speller.ignore-diacritics=true +fsa.dict.speller.equivalent-chars=x ź, l ł, u ó, ó u +fsa.dict.speller.replacement-pairs=rz ż, ż rz, ch h, h ch, ę en, en ę diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict new file mode 100644 index 0000000..69c5a99 Binary files /dev/null and b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.dict differ diff --git a/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info new file mode 100644 index 0000000..353deac --- /dev/null +++ b/morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info @@ -0,0 +1,16 @@ +# +# Dictionary properties. 
+# + +fsa.dict.separator=+ +fsa.dict.encoding=iso-8859-2 + +fsa.dict.uses-prefixes=false +fsa.dict.uses-infixes=false + +fsa.dict.frequency-included=true + +fsa.dict.speller.locale=pl_PL +fsa.dict.speller.ignore-diacritics=true +fsa.dict.speller.equivalent-chars=x ź, l ł, u ó, ó u +fsa.dict.speller.replacement-pairs=ź zi, ł eu, ć ci, ć dż, ć dź, ć dz, c dz, ch h, ci ć, cz czy, dź ć, dź dzi, dż ć, dz ć, dzi dź, edzil ędził, ę em, ę en, ei eja, eja ei, em ę, en ę, eu ł, h ch, he chę, śi ś, ii ija, ija ii, iosc ość, ise się, loz łos, ni ń, ńi ń, ń ni, ą oł, oł ą, oi oja, oja oi, ą om, om ą, ą on, on ą, ru kró, ż rz, rz ż, rz sz, scia ścią, ś si, si ś, sić ść, s sną, sz ż, sz rz, tro rot, u y, wu wy, yi yja, yja yi, zal rzał, zekac rzekać, zi ź, zl azł, z żn, z rz, chłopcowi chłopcu, bratowi bratu, aleji alei, lubieć lubić, nei nie, źmie zmie, piatek piątek, pokuj pokój, poszłem poszedłem, prosze proszę, rząda żąda, sa są, sei się, standart standard, trzcionk czcionk, szłem szedłem, pry przy \ No newline at end of file diff --git a/morfologik-stemming/pom.xml b/morfologik-stemming/pom.xml new file mode 100644 index 0000000..475b9da --- /dev/null +++ b/morfologik-stemming/pom.xml @@ -0,0 +1,71 @@ + + + + + 4.0.0 + + + org.carrot2 + morfologik-parent + 1.9.0 + ../pom.xml + + + morfologik-stemming + jar + + Morfologik Stemming APIs + Morfologik Stemming APIs. 
+ + + + org.carrot2 + morfologik-fsa + ${project.version} + + + + + com.carrotsearch + hppc + + + + + + com.carrotsearch + junit-benchmarks + test + + + + junit + junit + test + + + + org.easytesting + fest-assert-core + + + + + install + + + + org.apache.maven.plugins + maven-jar-plugin + + + + true + + + + + + + diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java b/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java new file mode 100644 index 0000000..4c3f877 --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java @@ -0,0 +1,111 @@ +package morfologik.stemming; + +import java.util.*; + +/** + * A view over a range of an array. + */ +@SuppressWarnings("serial") +final class ArrayViewList extends AbstractList + implements RandomAccess, java.io.Serializable +{ + /** Backing array. */ + private E[] a; + private int start; + private int length; + + /* + * + */ + ArrayViewList(E[] array, int start, int length) { + if (array == null) + throw new IllegalArgumentException(); + wrap(array, start, length); + } + + /* + * + */ + public int size() { + return length; + } + + /* + * + */ + public E get(int index) { + return a[start + index]; + } + + /* + * + */ + public E set(int index, E element) { + throw new UnsupportedOperationException(); + } + + /* + * + */ + public void add(int index, E element) { + throw new UnsupportedOperationException(); + } + + /* + * + */ + public E remove(int index) { + throw new UnsupportedOperationException(); + } + + /* + * + */ + public boolean addAll(int index, Collection c) { + throw new UnsupportedOperationException(); + } + + /* + * + */ + public int indexOf(Object o) { + if (o == null) { + for (int i = start; i < start + length; i++) + if (a[i] == null) + return i - start; + } else { + for (int i = start; i < start + length; i++) + if (o.equals(a[i])) + return i - start; + } + return -1; + } + + public ListIterator listIterator() { + return 
listIterator(0); + } + + /* + * + */ + public ListIterator listIterator(final int index) { + return Arrays.asList(a).subList(start, start + length).listIterator( + index); + } + + /* + * + */ + public boolean contains(Object o) { + return indexOf(o) != -1; + } + + /* + * + */ + void wrap(E[] array, int start, int length) { + this.a = array; + this.start = start; + this.length = length; + } +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java b/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java new file mode 100644 index 0000000..d72c85c --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java @@ -0,0 +1,233 @@ +package morfologik.stemming; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.WeakHashMap; + +import morfologik.fsa.FSA; +import morfologik.util.FileUtils; +import morfologik.util.ResourceUtils; + +/** + * A dictionary combines {@link FSA} automaton and metadata describing the + * internals of dictionary entries' coding ({@link DictionaryMetadata}. + * + *

+ * A dictionary consists of two files: + *

    + *
  • an actual compressed FSA file, + *
  • a metadata file, describing the dictionary. + *
+ * Use static methods in this class to read dictionaries and their metadata. + */ +public final class Dictionary { + /** + * Expected metadata file extension. + */ + public final static String METADATA_FILE_EXTENSION = "info"; + + /** + * {@link FSA} automaton with the compiled dictionary data. + */ + public final FSA fsa; + + /** + * Metadata associated with the dictionary. + */ + public final DictionaryMetadata metadata; + + /** + * Default loaded dictionaries. + */ + public static final WeakHashMap defaultDictionaries = new WeakHashMap(); + + /** + * It is strongly recommended to use static methods in this class for + * reading dictionaries. + * + * @param fsa + * An instantiated {@link FSA} instance. + * + * @param metadata + * A map of attributes describing the compression format and + * other settings not contained in the FSA automaton. For an + * explanation of available attributes and their possible values, + * see {@link DictionaryMetadata}. + */ + public Dictionary(FSA fsa, DictionaryMetadata metadata) { + this.fsa = fsa; + this.metadata = metadata; + } + + /** + * Attempts to load a dictionary using the path to the FSA file and the + * expected metadata extension. + */ + public static Dictionary read(File fsaFile) throws IOException { + final File featuresFile = new File(fsaFile.getParent(), + getExpectedFeaturesName(fsaFile.getName())); + + FileUtils.assertExists(featuresFile, true, false); + + return readAndClose( + new FileInputStream(fsaFile), + new FileInputStream(featuresFile)); + } + + /** + *

+ * Attempts to load a dictionary using the URL to the FSA file and the + * expected metadata extension. + * + *

+ * This method can be used to load resource-based dictionaries, but be aware + * of JAR resource-locking issues that arise from resource URLs. + */ + public static Dictionary read(URL fsaURL) throws IOException { + final String fsa = fsaURL.toExternalForm(); + final String features = getExpectedFeaturesName(fsa); + + return readAndClose( + ResourceUtils.openInputStream(fsa), + ResourceUtils.openInputStream(features)); + } + + /** + * Attempts to load a dictionary from opened streams of FSA dictionary data + * and associated metadata. + */ + public static Dictionary readAndClose(InputStream fsaData, InputStream featuresData) + throws IOException + { + try { + Map map = new HashMap(); + final Properties properties = new Properties(); + properties.load(new InputStreamReader(featuresData, "UTF-8")); + + // Handle back-compatibility for encoder specification. + if (!properties.containsKey(DictionaryAttribute.ENCODER.propertyName)) { + boolean usesSuffixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-suffixes", "true")); + boolean usesPrefixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-prefixes", "false")); + boolean usesInfixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-infixes", "false")); + + if (usesInfixes) { + map.put(DictionaryAttribute.ENCODER, EncoderType.INFIX.name()); + } else if (usesPrefixes) { + map.put(DictionaryAttribute.ENCODER, EncoderType.PREFIX.name()); + } else if (usesSuffixes) { + map.put(DictionaryAttribute.ENCODER, EncoderType.SUFFIX.name()); + } else { + map.put(DictionaryAttribute.ENCODER, EncoderType.NONE.name()); + } + + properties.remove("fsa.dict.uses-suffixes"); + properties.remove("fsa.dict.uses-prefixes"); + properties.remove("fsa.dict.uses-infixes"); + } + + for (Enumeration e = properties.propertyNames(); e.hasMoreElements();) { + String key = (String) e.nextElement(); + map.put(DictionaryAttribute.fromPropertyName(key), properties.getProperty(key)); + } + final DictionaryMetadata features = new 
DictionaryMetadata(map); + final FSA fsa = FSA.read(fsaData); + + return new Dictionary(fsa, features); + } finally { + FileUtils.close(fsaData, featuresData); + } + } + + /** + * Returns the expected name of the metadata file, based on the name of the + * FSA dictionary file. The expected name is resolved by truncating any + * suffix of name and appending + * {@link #METADATA_FILE_EXTENSION}. + */ + public static String getExpectedFeaturesName(String name) { + final int dotIndex = name.lastIndexOf('.'); + final String featuresName; + if (dotIndex >= 0) { + featuresName = name.substring(0, dotIndex) + "." + + METADATA_FILE_EXTENSION; + } else { + featuresName = name + "." + METADATA_FILE_EXTENSION; + } + + return featuresName; + } + + /** + * Return a built-in dictionary for a given ISO language code. Dictionaries + * are cached internally for potential reuse. + * + * @throws RuntimeException + * Throws a {@link RuntimeException} if the dictionary is not + * bundled with the library. + */ + public static Dictionary getForLanguage(String languageCode) { + if (languageCode == null || "".equals(languageCode)) { + throw new IllegalArgumentException( + "Language code must not be empty."); + } + + synchronized (defaultDictionaries) { + Dictionary dict = defaultDictionaries.get(languageCode); + if (dict != null) + return dict; + + try { + final String dictPath = "morfologik/dictionaries/" + languageCode + ".dict"; + final String metaPath = Dictionary + .getExpectedFeaturesName(dictPath); + + dict = Dictionary.readAndClose( + ResourceUtils.openInputStream(dictPath), + ResourceUtils.openInputStream(metaPath)); + + defaultDictionaries.put(languageCode, dict); + return dict; + } catch (IOException e) { + throw new RuntimeException( + "Default dictionary resource for language '" + + languageCode + "not found.", e); + } + } + } + + /** + * Converts the words on input or output according to conversion tables. 
+ * + * Useful if the input words need to be normalized (i.e., ligatures, + * apostrophes and such). + * + * @param str - input character sequence to be converted + * @param conversionMap - conversion map used to convert the string (a map + * from String to String) + * @return a converted string. + * + * @since 1.9.0 + * + */ + public static CharSequence convertText(final CharSequence str, final Map conversionMap) { + StringBuilder sb = new StringBuilder(); + sb.append(str); + for (final String auxKey : conversionMap.keySet()) { + int index = sb.indexOf(auxKey); + while (index != -1) { + sb.replace(index, index + auxKey.length(), conversionMap.get(auxKey)); + index = sb.indexOf(auxKey); + } + } + return sb.toString(); + } + +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java new file mode 100644 index 0000000..1249800 --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java @@ -0,0 +1,333 @@ +package morfologik.stemming; + +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +/** + * Attributes applying to {@link Dictionary} and {@link DictionaryMetadata}. + */ +public enum DictionaryAttribute { + /** + * Logical fields separator inside the FSA. 
+ */ + SEPARATOR("fsa.dict.separator") { + @Override + public Character fromString(String separator) { + if (separator == null || separator.length() != 1) { + throw new IllegalArgumentException("Attribute " + propertyName + + " must be a single character."); + } + + char charValue = separator.charAt(0); + if (Character.isHighSurrogate(charValue) || + Character.isLowSurrogate(charValue)) { + throw new IllegalArgumentException( + "Field separator character cannot be part of a surrogate pair: " + separator); + } + + return charValue; + } + }, + + /** + * Character to byte encoding used for strings inside the FSA. + */ + ENCODING("fsa.dict.encoding") { + @Override + public Charset fromString(String charsetName) { + return Charset.forName(charsetName); + } + }, + + /** + * If the FSA dictionary includes frequency data. + */ + FREQUENCY_INCLUDED("fsa.dict.frequency-included") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** + * If the spelling dictionary is supposed to ignore words containing digits + */ + IGNORE_NUMBERS("fsa.dict.speller.ignore-numbers") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** + * If the spelling dictionary is supposed to ignore punctuation. + */ + IGNORE_PUNCTUATION("fsa.dict.speller.ignore-punctuation") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** + * If the spelling dictionary is supposed to ignore CamelCase words. + */ + IGNORE_CAMEL_CASE("fsa.dict.speller.ignore-camel-case") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** + * If the spelling dictionary is supposed to ignore ALL UPPERCASE words. 
+ */ + IGNORE_ALL_UPPERCASE("fsa.dict.speller.ignore-all-uppercase") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** + * If the spelling dictionary is supposed to ignore diacritics, so that + * 'a' would be treated as equivalent to 'ą'. + */ + IGNORE_DIACRITICS("fsa.dict.speller.ignore-diacritics") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** + * if the spelling dictionary is supposed to treat upper and lower case + * as equivalent. + */ + CONVERT_CASE("fsa.dict.speller.convert-case") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** + * If the spelling dictionary is supposed to split runOnWords. + */ + RUN_ON_WORDS("fsa.dict.speller.runon-words") { + @Override + public Boolean fromString(String value) { + return booleanValue(value); + } + }, + + /** Locale associated with the dictionary. */ + LOCALE("fsa.dict.speller.locale") { + @Override + public Locale fromString(String value) { + return new Locale(value); + } + }, + + /** Locale associated with the dictionary. */ + ENCODER("fsa.dict.encoder") { + @Override + public EncoderType fromString(String value) { + return EncoderType.valueOf(value.toUpperCase(Locale.ROOT)); + } + }, + + /** + * Input conversion pairs to replace non-standard characters before search in a speller dictionary. + * For example, common ligatures can be replaced here. 
+ */ + INPUT_CONVERSION("fsa.dict.input-conversion") { + @Override + public Map fromString(String value) throws IllegalArgumentException { + Map conversionPairs = new HashMap(); + final String[] replacements = value.split(",\\s*"); + for (final String stringPair : replacements) { + final String[] twoStrings = stringPair.trim().split(" "); + if (twoStrings.length == 2) { + if (!conversionPairs.containsKey(twoStrings[0])) { + conversionPairs.put(twoStrings[0], twoStrings[1]); + } else { + throw new IllegalArgumentException( + "Input conversion cannot specify different values for the same input string: " + twoStrings[0]); + } + } else { + throw new IllegalArgumentException("Attribute " + propertyName + + " is not in the proper format: " + value); + } + } + return conversionPairs; + } + }, + + /** + * Output conversion pairs to replace non-standard characters before search in a speller dictionary. + * For example, standard characters can be replaced here into ligatures. + * + * Useful for dictionaries that do have certain standards imposed. + * + */ + OUTPUT_CONVERSION ("fsa.dict.output-conversion") { + @Override + public Map fromString(String value) throws IllegalArgumentException { + Map conversionPairs = new HashMap(); + final String[] replacements = value.split(",\\s*"); + for (final String stringPair : replacements) { + final String[] twoStrings = stringPair.trim().split(" "); + if (twoStrings.length == 2) { + if (!conversionPairs.containsKey(twoStrings[0])) { + conversionPairs.put(twoStrings[0], twoStrings[1]); + } else { + throw new IllegalArgumentException( + "Input conversion cannot specify different values for the same input string: " + twoStrings[0]); + } + } else { + throw new IllegalArgumentException("Attribute " + propertyName + + " is not in the proper format: " + value); + } + } + return conversionPairs; + } + }, + + /** + * Replacement pairs for non-obvious candidate search in a speller dictionary. 
+ * For example, Polish rz is phonetically equivalent to ż, + * and this may be specified here to allow looking for replacements of rz with ż + * and vice versa. + */ + REPLACEMENT_PAIRS("fsa.dict.speller.replacement-pairs") { + @Override + public Map> fromString(String value) throws IllegalArgumentException { + Map> replacementPairs = new HashMap>(); + final String[] replacements = value.split(",\\s*"); + for (final String stringPair : replacements) { + final String[] twoStrings = stringPair.trim().split(" "); + if (twoStrings.length == 2) { + if (!replacementPairs.containsKey(twoStrings[0])) { + List strList = new ArrayList(); + strList.add(twoStrings[1]); + replacementPairs.put(twoStrings[0], strList); + } else { + replacementPairs.get(twoStrings[0]).add(twoStrings[1]); + } + } else { + throw new IllegalArgumentException("Attribute " + propertyName + + " is not in the proper format: " + value); + } + } + return replacementPairs; + } + }, + + /** + * Equivalent characters (treated similarly as equivalent chars with and without + * diacritics). For example, Polish ł can be specified as equivalent to l. + * + *

This implements a feature similar to hunspell MAP in the affix file. + */ + EQUIVALENT_CHARS("fsa.dict.speller.equivalent-chars") { + @Override + public Map> fromString(String value) throws IllegalArgumentException { + Map> equivalentCharacters = + new HashMap>(); + final String[] eqChars = value.split(",\\s*"); + for (final String characterPair : eqChars) { + final String[] twoChars = characterPair.trim().split(" "); + if (twoChars.length == 2 + && twoChars[0].length() == 1 + && twoChars[1].length() == 1) { + char fromChar = twoChars[0].charAt(0); + char toChar = twoChars[1].charAt(0); + if (!equivalentCharacters.containsKey(fromChar)) { + List chList = new ArrayList(); + equivalentCharacters.put(fromChar, chList); + } + equivalentCharacters.get(fromChar).add(toChar); + } else { + throw new IllegalArgumentException("Attribute " + propertyName + + " is not in the proper format: " + value); + } + } + return equivalentCharacters; + } + }, + + /** + * Dictionary license attribute. + */ + LICENSE("fsa.dict.license"), + + /** + * Dictionary author. + */ + AUTHOR("fsa.dict.author"), + + /** + * Dictionary creation date. + */ + CREATION_DATE("fsa.dict.created"); + + /** + * Property name for this attribute. + */ + public final String propertyName; + + /** + * Converts a string to the given attribute's value (covariants used). + * + * @throws IllegalArgumentException + * If the input string cannot be converted to the attribute's + * value. + */ + public Object fromString(String value) throws IllegalArgumentException { + return value; + } + + /** + * Return an {@link DictionaryAttribute} by its {@link #propertyName}. 
+ */ + public static DictionaryAttribute fromPropertyName(String propertyName) { + DictionaryAttribute value = attrsByPropertyName.get(propertyName); + if (value == null) { + throw new IllegalArgumentException("No attribute for property: " + propertyName); + } + return value; + } + + private static final Map attrsByPropertyName; + static { + attrsByPropertyName = new HashMap(); + for (DictionaryAttribute attr : DictionaryAttribute.values()) { + if (attrsByPropertyName.put(attr.propertyName, attr) != null) { + throw new RuntimeException("Duplicate property key for: " + attr); + } + } + } + + /** + * Private enum instance constructor. + */ + private DictionaryAttribute(String propertyName) { + this.propertyName = propertyName; + } + + private static Boolean booleanValue(String value) { + value = value.toLowerCase(); + if ("true".equals(value) || "yes".equals(value) || "on".equals(value)) { + return Boolean.TRUE; + } + if ("false".equals(value) || "no".equals(value) || "off".equals(value)) { + return Boolean.FALSE; + } + throw new IllegalArgumentException("Not a boolean value: " + value); + } +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java new file mode 100644 index 0000000..104ff58 --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java @@ -0,0 +1,143 @@ +package morfologik.stemming; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.util.Iterator; + +import morfologik.util.BufferUtils; + +/** + * An iterator over {@link WordData} entries of a {@link Dictionary}. The stems + * can be decoded from compressed format or the compressed form can be + * preserved. 
+ */ +public final class DictionaryIterator implements Iterator { + private final CharsetDecoder decoder; + private final Iterator entriesIter; + private final WordData entry; + private final byte separator; + private final DictionaryMetadata dictionaryMetadata; + private final boolean decodeStems; + + private ByteBuffer inflectedBuffer = ByteBuffer.allocate(0); + private CharBuffer inflectedCharBuffer = CharBuffer.allocate(0); + private ByteBuffer temp = ByteBuffer.allocate(0); + + public DictionaryIterator(Dictionary dictionary, CharsetDecoder decoder, + boolean decodeStems) { + this.entriesIter = dictionary.fsa.iterator(); + this.separator = dictionary.metadata.getSeparator(); + this.dictionaryMetadata = dictionary.metadata; + this.decoder = decoder; + this.entry = new WordData(decoder); + this.decodeStems = decodeStems; + } + + public boolean hasNext() { + return entriesIter.hasNext(); + } + + public WordData next() { + final ByteBuffer entryBuffer = entriesIter.next(); + entry.reset(); + + /* + * Entries are typically: inflectedcodedBasetag so try to find + * this split. 
+ */ + byte[] ba = entryBuffer.array(); + int bbSize = entryBuffer.remaining(); + + int sepPos; + for (sepPos = 0; sepPos < bbSize; sepPos++) { + if (ba[sepPos] == separator) + break; + } + + if (sepPos == bbSize) { + throw new RuntimeException("Invalid dictionary " + + "entry format (missing separator)."); + } + + inflectedBuffer.clear(); + inflectedBuffer = BufferUtils.ensureCapacity(inflectedBuffer, sepPos); + inflectedBuffer.put(ba, 0, sepPos); + inflectedBuffer.flip(); + + inflectedCharBuffer = bytesToChars(inflectedBuffer, inflectedCharBuffer); + entry.wordBuffer = inflectedBuffer; + entry.wordCharSequence = inflectedCharBuffer; + + temp.clear(); + temp = BufferUtils.ensureCapacity(temp, bbSize - sepPos); + sepPos++; + temp.put(ba, sepPos, bbSize - sepPos); + temp.flip(); + + ba = temp.array(); + bbSize = temp.remaining(); + + /* + * Find the next separator byte's position splitting word form and tag. + */ + sepPos = 0; + for (; sepPos < bbSize; sepPos++) { + if (ba[sepPos] == separator) + break; + } + + /* + * Decode the stem into stem buffer. + */ + entry.stemBuffer.clear(); + if (decodeStems) { + entry.stemBuffer = DictionaryLookup.decodeBaseForm(entry.stemBuffer, + ba, sepPos, inflectedBuffer, dictionaryMetadata); + } else { + entry.stemBuffer = BufferUtils.ensureCapacity(entry.stemBuffer, sepPos); + entry.stemBuffer.put(ba, 0, sepPos); + } + entry.stemBuffer.flip(); + + // Skip separator character, if present. + if (sepPos + 1 <= bbSize) { + sepPos++; + } + + /* + * Decode the tag data. + */ + entry.tagBuffer = BufferUtils.ensureCapacity(entry.tagBuffer, bbSize + - sepPos); + entry.tagBuffer.clear(); + entry.tagBuffer.put(ba, sepPos, bbSize - sepPos); + entry.tagBuffer.flip(); + + return entry; + } + + /** + * Decode the byte buffer, optionally expanding the char buffer. 
+ */ + private CharBuffer bytesToChars(ByteBuffer bytes, CharBuffer chars) { + chars.clear(); + final int maxCapacity = (int) (bytes.remaining() * decoder + .maxCharsPerByte()); + if (chars.capacity() <= maxCapacity) { + chars = CharBuffer.allocate(maxCapacity); + } + + bytes.mark(); + decoder.reset(); + decoder.decode(bytes, chars, true); + chars.flip(); + bytes.reset(); + + return chars; + } + + public void remove() { + throw new UnsupportedOperationException(); + } +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java new file mode 100644 index 0000000..5bb0716 --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java @@ -0,0 +1,403 @@ +package morfologik.stemming; + +import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import morfologik.fsa.FSA; +import morfologik.fsa.FSAFinalStatesIterator; +import morfologik.fsa.FSATraversal; +import morfologik.fsa.MatchResult; +import morfologik.util.BufferUtils; + +/** + * This class implements a dictionary lookup over an FSA dictionary. The + * dictionary for this class should be prepared from a text file using Jan + * Daciuk's FSA package (see link below). + * + *

+ * Important: finite state automatons in Jan Daciuk's implementation use + * bytes not unicode characters. Therefore objects of this class always + * have to be constructed with an encoding used to convert Java strings to byte + * arrays and the other way around. You can use UTF-8 encoding, as it + * should not conflict with any control sequences and separator characters. + * + * @see FSA package Web + * site + */ +public final class DictionaryLookup implements IStemmer, Iterable { + private static final int REMOVE_EVERYTHING = 255; + + /** An FSA used for lookups. */ + private final FSATraversal matcher; + + /** An iterator for walking along the final states of {@link #fsa}. */ + private final FSAFinalStatesIterator finalStatesIterator; + + /** FSA's root node. */ + private final int rootNode; + + /** Expand buffers and arrays by this constant. */ + private final static int EXPAND_SIZE = 10; + + /** Private internal array of reusable word data objects. */ + private WordData[] forms = new WordData[0]; + + /** A "view" over an array implementing */ + private final ArrayViewList formsList = new ArrayViewList( + forms, 0, forms.length); + + /** + * Features of the compiled dictionary. + * + * @see DictionaryMetadata + */ + private final DictionaryMetadata dictionaryMetadata; + + /** + * Charset encoder for the FSA. + */ + private final CharsetEncoder encoder; + + /** + * Charset decoder for the FSA. + */ + private final CharsetDecoder decoder; + + /** + * The FSA we are using. + */ + private final FSA fsa; + + /** + * @see #getSeparatorChar() + */ + private final char separatorChar; + + /** + * Internal reusable buffer for encoding words into byte arrays using + * {@link #encoder}. + */ + private ByteBuffer byteBuffer = ByteBuffer.allocate(0); + + /** + * Internal reusable buffer for encoding words into byte arrays using + * {@link #encoder}. + */ + private CharBuffer charBuffer = CharBuffer.allocate(0); + + /** + * Reusable match result. 
+ */ + private final MatchResult matchResult = new MatchResult(); + + /** + * The {@link Dictionary} this lookup is using. + */ + private final Dictionary dictionary; + + /** + *

+ * Creates a new object of this class using the given FSA for word lookups + * and encoding for converting characters to bytes. + * + * @throws IllegalArgumentException + * if FSA's root node cannot be acquired (dictionary is empty). + */ + public DictionaryLookup(Dictionary dictionary) + throws IllegalArgumentException { + this.dictionary = dictionary; + this.dictionaryMetadata = dictionary.metadata; + this.rootNode = dictionary.fsa.getRootNode(); + this.fsa = dictionary.fsa; + this.matcher = new FSATraversal(fsa); + this.finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode()); + + if (rootNode == 0) { + throw new IllegalArgumentException( + "Dictionary must have at least the root node."); + } + + if (dictionaryMetadata == null) { + throw new IllegalArgumentException( + "Dictionary metadata must not be null."); + } + + decoder = dictionary.metadata.getDecoder(); + encoder = dictionary.metadata.getEncoder(); + separatorChar = dictionary.metadata.getSeparatorAsChar(); + } + + /** + * Searches the automaton for a symbol sequence equal to word, + * followed by a separator. The result is a stem (decompressed accordingly + * to the dictionary's specification) and an optional tag data. + */ + @Override + public List lookup(CharSequence word) { + final byte separator = dictionaryMetadata.getSeparator(); + + if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { + word = Dictionary.convertText(word, dictionaryMetadata.getInputConversionPairs()); + } + + // Reset the output list to zero length. + formsList.wrap(forms, 0, 0); + + // Encode word characters into bytes in the same encoding as the FSA's. 
+ charBuffer.clear(); + charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length()); + for (int i = 0; i < word.length(); i++) { + char chr = word.charAt(i); + if (chr == separatorChar) + return formsList; + charBuffer.put(chr); + } + charBuffer.flip(); + byteBuffer = charsToBytes(charBuffer, byteBuffer); + + // Try to find a partial match in the dictionary. + final MatchResult match = matcher.match(matchResult, byteBuffer + .array(), 0, byteBuffer.remaining(), rootNode); + + if (match.kind == SEQUENCE_IS_A_PREFIX) { + /* + * The entire sequence exists in the dictionary. A separator should + * be the next symbol. + */ + final int arc = fsa.getArc(match.node, separator); + + /* + * The situation when the arc points to a final node should NEVER + * happen. After all, we want the word to have SOME base form. + */ + if (arc != 0 && !fsa.isArcFinal(arc)) { + // There is such a word in the dictionary. Return its base forms. + int formsCount = 0; + + finalStatesIterator.restartFrom(fsa.getEndNode(arc)); + while (finalStatesIterator.hasNext()) { + final ByteBuffer bb = finalStatesIterator.next(); + final byte[] ba = bb.array(); + final int bbSize = bb.remaining(); + + if (formsCount >= forms.length) { + forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE); + for (int k = 0; k < forms.length; k++) { + if (forms[k] == null) + forms[k] = new WordData(decoder); + } + } + + /* + * Now, expand the prefix/ suffix 'compression' and store + * the base form. + */ + final WordData wordData = forms[formsCount++]; + wordData.reset(); + + wordData.wordBuffer = byteBuffer; + if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) { + wordData.wordCharSequence = word; + } else { + wordData.wordCharSequence = Dictionary.convertText(word, + dictionaryMetadata.getOutputConversionPairs()); + } + + /* + * Find the separator byte's position splitting the inflection instructions + * from the tag. 
+ */ + int sepPos; + for (sepPos = 0; sepPos < bbSize; sepPos++) { + if (ba[sepPos] == separator) + break; + } + + /* + * Decode the stem into stem buffer. + */ + wordData.stemBuffer.clear(); + wordData.stemBuffer = decodeBaseForm(wordData.stemBuffer, ba, + sepPos, byteBuffer, dictionaryMetadata); + wordData.stemBuffer.flip(); + + // Skip separator character. + sepPos++; + + /* + * Decode the tag data. + */ + final int tagSize = bbSize - sepPos; + if (tagSize > 0) { + wordData.tagBuffer = BufferUtils.ensureCapacity( + wordData.tagBuffer, tagSize); + wordData.tagBuffer.clear(); + wordData.tagBuffer.put(ba, sepPos, tagSize); + wordData.tagBuffer.flip(); + } + } + + formsList.wrap(forms, 0, formsCount); + } + } else { + /* + * this case is somewhat confusing: we should have hit the separator + * first... I don't really know how to deal with it at the time + * being. + */ + } + return formsList; + } + + /** + * Decode the base form of an inflected word and save its decoded form into + * a byte buffer. + * + * @param output + * The byte buffer to save the result to. A new buffer may be + * allocated if the capacity of bb is not large + * enough to store the result. The buffer is not flipped upon + * return. + * + * @param inflectedForm + * Inflected form's bytes (decoded properly). + * + * @param encoded + * Bytes of the encoded base form, starting at 0 index. + * + * @param encodedLen + * Length of the encode base form. + * + * @return Returns either bb or a new buffer whose capacity is + * large enough to store the output of the decoded data. + */ + public static ByteBuffer decodeBaseForm( + ByteBuffer output, + byte[] encoded, + int encodedLen, + ByteBuffer inflectedForm, + DictionaryMetadata metadata) { + + // FIXME: We should eventually get rid of this method and use + // each encoder's #decode method. The problem is that we'd have to include + // HPPC or roundtrip via HPPC to a ByteBuffer, which would slow things down. 
+ // Since this is performance-crucial routine, I leave it for now. + + // Prepare the buffer. + output.clear(); + + assert inflectedForm.position() == 0; + + // Increase buffer size (overallocating), if needed. + final byte[] src = inflectedForm.array(); + final int srcLen = inflectedForm.remaining(); + if (output.capacity() < srcLen + encodedLen) { + output = ByteBuffer.allocate(srcLen + encodedLen); + } + + switch (metadata.getEncoderType()) { + case SUFFIX: + int suffixTrimCode = encoded[0]; + int truncateBytes = suffixTrimCode - 'A' & 0xFF; + if (truncateBytes == REMOVE_EVERYTHING) { + truncateBytes = srcLen; + } + output.put(src, 0, srcLen - truncateBytes); + output.put(encoded, 1, encodedLen - 1); + break; + + case PREFIX: + int truncatePrefixBytes = encoded[0] - 'A' & 0xFF; + int truncateSuffixBytes = encoded[1] - 'A' & 0xFF; + if (truncatePrefixBytes == REMOVE_EVERYTHING || + truncateSuffixBytes == REMOVE_EVERYTHING) { + truncatePrefixBytes = srcLen; + truncateSuffixBytes = 0; + } + output.put(src, truncatePrefixBytes, srcLen - (truncateSuffixBytes + truncatePrefixBytes)); + output.put(encoded, 2, encodedLen - 2); + break; + + case INFIX: + int infixIndex = encoded[0] - 'A' & 0xFF; + int infixLength = encoded[1] - 'A' & 0xFF; + truncateSuffixBytes = encoded[2] - 'A' & 0xFF; + if (infixLength == REMOVE_EVERYTHING || + truncateSuffixBytes == REMOVE_EVERYTHING) { + infixIndex = 0; + infixLength = srcLen; + truncateSuffixBytes = 0; + } + output.put(src, 0, infixIndex); + output.put(src, infixIndex + infixLength, srcLen - (infixIndex + infixLength + truncateSuffixBytes)); + output.put(encoded, 3, encodedLen - 3); + break; + + case NONE: + output.put(encoded, 0, encodedLen); + break; + + default: + throw new RuntimeException("Unhandled switch/case: " + metadata.getEncoderType()); + } + + return output; + } + + /** + * Encode a character sequence into a byte buffer, optionally expanding + * buffer. 
+ */ + private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) { + bytes.clear(); + final int maxCapacity = (int) (chars.remaining() * encoder + .maxBytesPerChar()); + if (bytes.capacity() <= maxCapacity) { + bytes = ByteBuffer.allocate(maxCapacity); + } + + chars.mark(); + encoder.reset(); + if (encoder.encode(chars, bytes, true).isError()) { + // remove everything, we don't want to accept malformed input + bytes.clear(); + } + bytes.flip(); + chars.reset(); + + return bytes; + } + + /** + * Return an iterator over all {@link WordData} entries available in the + * embedded {@link Dictionary}. + */ + @Override + public Iterator iterator() { + return new DictionaryIterator(dictionary, decoder, true); + } + + /** + * @return Return the {@link Dictionary} used by this object. + */ + public Dictionary getDictionary() { + return dictionary; + } + + /** + * @return Returns the logical separator character splitting inflected form, + * lemma correction token and a tag. Note that this character is a best-effort + * conversion from a byte in {@link DictionaryMetadata#separator} and + * may not be valid in the target encoding (although this is highly unlikely). 
+ */ + public char getSeparatorChar() { + return separatorChar; + } +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java new file mode 100644 index 0000000..1475de6 --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java @@ -0,0 +1,298 @@ +package morfologik.stemming; + +import static morfologik.stemming.DictionaryAttribute.CONVERT_CASE; +import static morfologik.stemming.DictionaryAttribute.ENCODING; +import static morfologik.stemming.DictionaryAttribute.FREQUENCY_INCLUDED; +import static morfologik.stemming.DictionaryAttribute.IGNORE_ALL_UPPERCASE; +import static morfologik.stemming.DictionaryAttribute.IGNORE_CAMEL_CASE; +import static morfologik.stemming.DictionaryAttribute.IGNORE_DIACRITICS; +import static morfologik.stemming.DictionaryAttribute.IGNORE_NUMBERS; +import static morfologik.stemming.DictionaryAttribute.IGNORE_PUNCTUATION; +import static morfologik.stemming.DictionaryAttribute.RUN_ON_WORDS; +import static morfologik.stemming.DictionaryAttribute.SEPARATOR; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Collections; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +/** + * Description of attributes, their types and default values. + * + * @see Dictionary + */ +public final class DictionaryMetadata { + /** + * Default attribute values. 
+ */ + private static Map DEFAULT_ATTRIBUTES = new DictionaryMetadataBuilder() + .separator('+') + .encoder(EncoderType.SUFFIX) + .frequencyIncluded() + .ignorePunctuation() + .ignoreNumbers() + .ignoreCamelCase() + .ignoreAllUppercase() + .ignoreDiacritics() + .convertCase() + .supportRunOnWords() + .toMap(); + + /** + * Required attributes. + */ + private static EnumSet REQUIRED_ATTRIBUTES = EnumSet.of( + SEPARATOR, + ENCODING); + + /** + * A separator character between fields (stem, lemma, form). The character + * must be within byte range (FSA uses bytes internally). + */ + private byte separator; + private char separatorChar; + + /** + * Encoding used for converting bytes to characters and vice versa. + */ + private String encoding; + + private Charset charset; + private Locale locale = Locale.getDefault(); + + /** + * Replacement pairs for non-obvious candidate search in a speller dictionary. + */ + private Map> replacementPairs = Collections.emptyMap(); + + /** + * Conversion pairs for input conversion, for example to replace ligatures. + */ + private Map inputConversion = Collections.emptyMap(); + + /** + * Conversion pairs for output conversion, for example to replace ligatures. + */ + private Map outputConversion = Collections.emptyMap(); + + /** + * Equivalent characters (treated similarly as equivalent chars with and without + * diacritics). For example, Polish ł can be specified as equivalent to l. + * + * This implements a feature similar to hunspell MAP in the affix file. + */ + private Map> equivalentChars = Collections.emptyMap(); + + /** + * All attributes. + */ + private final EnumMap attributes; + + /** + * All "enabled" boolean attributes. + */ + private final EnumMap boolAttributes; + + /** + * Sequence encoder. + */ + private EncoderType encoderType; + + /** + * Return all attributes. + */ + public Map getAttributes() { + return Collections.unmodifiableMap(attributes); + } + + // Cached attrs. 
+ public String getEncoding() { return encoding; } + public byte getSeparator() { return separator; } + public Locale getLocale() { return locale; } + + public Map getInputConversionPairs() { return inputConversion; } + public Map getOutputConversionPairs() { return outputConversion; } + + public Map> getReplacementPairs() { return replacementPairs; } + public Map> getEquivalentChars() { return equivalentChars; } + + // Dynamically fetched. + public boolean isFrequencyIncluded() { return boolAttributes.get(FREQUENCY_INCLUDED); } + public boolean isIgnoringPunctuation() { return boolAttributes.get(IGNORE_PUNCTUATION); } + public boolean isIgnoringNumbers() { return boolAttributes.get(IGNORE_NUMBERS); } + public boolean isIgnoringCamelCase() { return boolAttributes.get(IGNORE_CAMEL_CASE); } + public boolean isIgnoringAllUppercase() { return boolAttributes.get(IGNORE_ALL_UPPERCASE); } + public boolean isIgnoringDiacritics() { return boolAttributes.get(IGNORE_DIACRITICS); } + public boolean isConvertingCase() { return boolAttributes.get(CONVERT_CASE); } + public boolean isSupportingRunOnWords() { return boolAttributes.get(RUN_ON_WORDS); } + + /** + * Create an instance from an attribute map. + * + * @see DictionaryMetadataBuilder + */ + public DictionaryMetadata(Map userAttrs) { + this.boolAttributes = new EnumMap(DictionaryAttribute.class); + this.attributes = new EnumMap(DictionaryAttribute.class); + this.attributes.putAll(userAttrs); + + EnumMap attrs = new EnumMap(DEFAULT_ATTRIBUTES); + attrs.putAll(userAttrs); + + // Convert some attrs from the map to local fields for performance reasons. + EnumSet requiredAttributes = EnumSet.copyOf(REQUIRED_ATTRIBUTES); + + for (Map.Entry e : attrs.entrySet()) { + requiredAttributes.remove(e.getKey()); + + // Run validation and conversion on all of them. 
+ Object value = e.getKey().fromString(e.getValue()); + switch (e.getKey()) { + case ENCODING: + this.encoding = e.getValue(); + if (!Charset.isSupported(encoding)) { + throw new IllegalArgumentException("Encoding not supported on this JVM: " + + encoding); + } + this.charset = (Charset) value; + break; + + case SEPARATOR: + this.separatorChar = (Character) value; + break; + + case LOCALE: + this.locale = (Locale) value; + break; + + case ENCODER: + this.encoderType = (EncoderType) value; + break; + + case INPUT_CONVERSION: + { + @SuppressWarnings("unchecked") + Map gvalue = (Map) value; + this.inputConversion = gvalue; + } + break; + + case OUTPUT_CONVERSION: + { + @SuppressWarnings("unchecked") + Map gvalue = (Map) value; + this.outputConversion = gvalue; + } + break; + + case REPLACEMENT_PAIRS: + { + @SuppressWarnings("unchecked") + Map> gvalue = (Map>) value; + this.replacementPairs = gvalue; + } + break; + + case EQUIVALENT_CHARS: + { + @SuppressWarnings("unchecked") + Map> gvalue = (Map>) value; + this.equivalentChars = gvalue; + } + break; + + case IGNORE_PUNCTUATION: + case IGNORE_NUMBERS: + case IGNORE_CAMEL_CASE: + case IGNORE_ALL_UPPERCASE: + case IGNORE_DIACRITICS: + case CONVERT_CASE: + case RUN_ON_WORDS: + case FREQUENCY_INCLUDED: + this.boolAttributes.put(e.getKey(), (Boolean) value); + break; + + case AUTHOR: + case LICENSE: + case CREATION_DATE: + // Just run validation. + e.getKey().fromString(e.getValue()); + break; + + default: + throw new RuntimeException("Unexpected code path (attribute should be handled but is not): " + e.getKey()); + } + } + + if (!requiredAttributes.isEmpty()) { + throw new IllegalArgumentException("At least one the required attributes was not provided: " + + requiredAttributes.toString()); + } + + // Sanity check. 
+ CharsetEncoder encoder = getEncoder(); + try { + ByteBuffer encoded = encoder.encode(CharBuffer.wrap(new char [] { separatorChar })); + if (encoded.remaining() > 1) { + throw new IllegalArgumentException("Separator character is not a single byte in encoding " + + encoding + ": " + separatorChar); + } + this.separator = encoded.get(); + } catch (CharacterCodingException e) { + throw new IllegalArgumentException("Separator character cannot be converted to a byte in " + + encoding + ": " + separatorChar, e); + } + } + + /** + * Returns a new {@link CharsetDecoder} for the {@link #encoding}. + */ + public CharsetDecoder getDecoder() { + try { + return charset.newDecoder().onMalformedInput( + CodingErrorAction.REPORT).onUnmappableCharacter( + CodingErrorAction.REPORT); + } catch (UnsupportedCharsetException e) { + throw new RuntimeException( + "FSA's encoding charset is not supported: " + encoding); + } + } + + /** + * Returns a new {@link CharsetEncoder} for the {@link #encoding}. + */ + public CharsetEncoder getEncoder() { + try { + return charset.newEncoder(); + } catch (UnsupportedCharsetException e) { + throw new RuntimeException( + "FSA's encoding charset is not supported: " + encoding); + } + } + + /** + * Return sequence encoder type. + */ + public EncoderType getEncoderType() { + return encoderType; + } + + /** + * Returns the {@link #separator} byte converted to a single char. Throws + * a {@link RuntimeException} if this conversion is for some reason impossible + * (the byte is a surrogate pair, FSA's {@link #encoding} is not available). 
+ */ + public char getSeparatorAsChar() { + return separatorChar; + } +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java new file mode 100644 index 0000000..7e85ecb --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java @@ -0,0 +1,139 @@ +package morfologik.stemming; + +import java.nio.charset.Charset; +import java.util.EnumMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +/** + * Helper class to build {@link DictionaryMetadata} instances. + */ +public final class DictionaryMetadataBuilder { + private final EnumMap attrs + = new EnumMap(DictionaryAttribute.class); + + public DictionaryMetadataBuilder separator(char c) { + this.attrs.put(DictionaryAttribute.SEPARATOR, Character.toString(c)); + return this; + } + + public DictionaryMetadataBuilder encoding(Charset charset) { + return encoding(charset.name()); + } + + public DictionaryMetadataBuilder encoding(String charsetName) { + this.attrs.put(DictionaryAttribute.ENCODING, charsetName); + return this; + } + + public DictionaryMetadataBuilder frequencyIncluded() { return frequencyIncluded(false); } + public DictionaryMetadataBuilder frequencyIncluded(boolean v) { this.attrs.put(DictionaryAttribute.FREQUENCY_INCLUDED, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder ignorePunctuation() { return ignorePunctuation(true); } + public DictionaryMetadataBuilder ignorePunctuation(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_PUNCTUATION, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder ignoreNumbers() { return ignoreNumbers(true); } + public DictionaryMetadataBuilder ignoreNumbers(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_NUMBERS, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder 
ignoreCamelCase() { return ignoreCamelCase(true); } + public DictionaryMetadataBuilder ignoreCamelCase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_CAMEL_CASE, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder ignoreAllUppercase() { return ignoreAllUppercase(true); } + public DictionaryMetadataBuilder ignoreAllUppercase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_ALL_UPPERCASE, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder ignoreDiacritics() { return ignoreDiacritics(true); } + public DictionaryMetadataBuilder ignoreDiacritics(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_DIACRITICS, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder convertCase() { return convertCase(true); } + public DictionaryMetadataBuilder convertCase(boolean v) { this.attrs.put(DictionaryAttribute.CONVERT_CASE, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder supportRunOnWords() { return supportRunOnWords(true); } + public DictionaryMetadataBuilder supportRunOnWords(boolean v) { this.attrs.put(DictionaryAttribute.RUN_ON_WORDS, Boolean.valueOf(v).toString()); return this; } + + public DictionaryMetadataBuilder encoder(EncoderType type) { + this.attrs.put(DictionaryAttribute.ENCODER, type.name()); + return this; + } + + public DictionaryMetadataBuilder locale(Locale locale) { + return locale(locale.toString()); + } + + public DictionaryMetadataBuilder locale(String localeName) { + this.attrs.put(DictionaryAttribute.LOCALE, localeName); + return this; + } + + public DictionaryMetadataBuilder withReplacementPairs(Map> replacementPairs) { + StringBuilder builder = new StringBuilder(); + for (Map.Entry> e : replacementPairs.entrySet()) { + String k = e.getKey(); + for (String v : e.getValue()) { + if (builder.length() > 0) builder.append(", "); + builder.append(k).append(" ").append(v); + } + } + 
this.attrs.put(DictionaryAttribute.REPLACEMENT_PAIRS, builder.toString()); + return this; + } + + public DictionaryMetadataBuilder withEquivalentChars(Map> equivalentChars) { + StringBuilder builder = new StringBuilder(); + for (Map.Entry> e : equivalentChars.entrySet()) { + Character k = e.getKey(); + for (Character v : e.getValue()) { + if (builder.length() > 0) builder.append(", "); + builder.append(k).append(" ").append(v); + } + } + this.attrs.put(DictionaryAttribute.EQUIVALENT_CHARS, builder.toString()); + return this; + } + + public DictionaryMetadataBuilder withInputConversionPairs(Map conversionPairs) { + StringBuilder builder = new StringBuilder(); + for (Map.Entry e : conversionPairs.entrySet()) { + String k = e.getKey(); + if (builder.length() > 0) builder.append(", "); + builder.append(k).append(" ").append(conversionPairs.get(k)); + } + this.attrs.put(DictionaryAttribute.INPUT_CONVERSION, builder.toString()); + return this; + } + + public DictionaryMetadataBuilder withOutputConversionPairs(Map conversionPairs) { + StringBuilder builder = new StringBuilder(); + for (Map.Entry e : conversionPairs.entrySet()) { + String k = e.getKey(); + if (builder.length() > 0) builder.append(", "); + builder.append(k).append(" ").append(conversionPairs.get(k)); + } + this.attrs.put(DictionaryAttribute.OUTPUT_CONVERSION, builder.toString()); + return this; + } + + + public DictionaryMetadataBuilder author(String author) { + this.attrs.put(DictionaryAttribute.AUTHOR, author); + return this; + } + + public DictionaryMetadataBuilder creationDate(String creationDate) { + this.attrs.put(DictionaryAttribute.CREATION_DATE, creationDate); + return this; + } + + public DictionaryMetadataBuilder license(String license) { + this.attrs.put(DictionaryAttribute.LICENSE, license); + return this; + } + + public DictionaryMetadata build() { + return new DictionaryMetadata(attrs); + } + + public EnumMap toMap() { + return new EnumMap(attrs); + } +} diff --git 
a/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java b/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java new file mode 100644 index 0000000..093cfbb --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java @@ -0,0 +1,11 @@ +package morfologik.stemming; + +/** + * Sequence encoder type. + */ +public enum EncoderType { + SUFFIX, + PREFIX, + INFIX, + NONE; +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java b/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java new file mode 100644 index 0000000..6e59526 --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java @@ -0,0 +1,20 @@ +package morfologik.stemming; + +import java.util.List; + +/** + * A generic "stemmer" interface in Morfologik. + */ +public interface IStemmer { + /** + * Returns a list of {@link WordData} entries for a given word. The returned + * list is never null. Depending on the stemmer's + * implementation the {@link WordData} may carry the stem and additional + * information (tag) or just the stem. + *

+ * The returned list and any object it contains are not usable after a + * subsequent call to this method. Any data that should be stored in between + * must be copied by the caller. + */ + public List lookup(CharSequence word); +} diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java b/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java new file mode 100644 index 0000000..a1bdaa0 --- /dev/null +++ b/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java @@ -0,0 +1,254 @@ +package morfologik.stemming; + +import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.*; + +import morfologik.util.BufferUtils; + +/** + * Stem and tag data associated with a given word. + * + *

+ * Important notes: + *

    + *
  • Objects of this class are volatile (their content changes on + * subsequent calls to {@link DictionaryLookup} class. If you need a copy of the + * stem or tag data for a given word, you have to create a custom buffer + * yourself and copy the associated data, perform {@link #clone()} or create + * strings (they are immutable) using {@link #getStem()} and then + * {@link CharSequence#toString()}.
  • + *
  • Objects of this class must not be used in any Java collections. In fact + * both equals and hashCode methods are overridden and throw exceptions to + * prevent accidental damage.
  • + *
+ */ +public final class WordData implements Cloneable { + /** + * Error information if somebody puts us in a Java collection. + */ + private static final String COLLECTIONS_ERROR_MESSAGE = "Not suitable for use" + + " in Java collections framework (volatile content). Refer to documentation."; + + /** Character encoding in internal buffers. */ + private final CharsetDecoder decoder; + + /** + * Inflected word form data. + */ + CharSequence wordCharSequence; + + /** + * Character sequence after converting {@link #stemBuffer} using + * {@link #decoder}. + */ + private CharBuffer stemCharSequence; + + /** + * Character sequence after converting {@link #tagBuffer} using + * {@link #decoder}. + */ + private CharBuffer tagCharSequence; + + /** Byte buffer holding the inflected word form data. */ + ByteBuffer wordBuffer; + + /** Byte buffer holding stem data. */ + ByteBuffer stemBuffer; + + /** Byte buffer holding tag data. */ + ByteBuffer tagBuffer; + + /** + * Package scope constructor. + */ + WordData(CharsetDecoder decoder) { + this.decoder = decoder; + + stemBuffer = ByteBuffer.allocate(0); + tagBuffer = ByteBuffer.allocate(0); + stemCharSequence = CharBuffer.allocate(0); + tagCharSequence = CharBuffer.allocate(0); + } + + /** + * A constructor for tests only. + */ + WordData(String stem, String tag, String encoding) { + this(Charset.forName(encoding).newDecoder()); + + try { + if (stem != null) + stemBuffer.put(stem.getBytes(encoding)); + if (tag != null) + tagBuffer.put(tag.getBytes(encoding)); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + /** + * Copy the stem's binary data (no charset decoding) to a custom byte + * buffer. If the buffer is null or not large enough to hold the result, a + * new buffer is allocated. + * + * @param target + * Target byte buffer to copy the stem buffer to or + * null if a new buffer should be allocated. + * + * @return Returns target or the new reallocated buffer. 
+ */ + public ByteBuffer getStemBytes(ByteBuffer target) { + target = BufferUtils.ensureCapacity(target, stemBuffer.remaining()); + stemBuffer.mark(); + target.put(stemBuffer); + stemBuffer.reset(); + target.flip(); + return target; + } + + /** + * Copy the tag's binary data (no charset decoding) to a custom byte buffer. + * If the buffer is null or not large enough to hold the result, a new + * buffer is allocated. + * + * @param target + * Target byte buffer to copy the tag buffer to or + * null if a new buffer should be allocated. + * + * @return Returns target or the new reallocated buffer. + */ + public ByteBuffer getTagBytes(ByteBuffer target) { + target = BufferUtils.ensureCapacity(target, tagBuffer.remaining()); + tagBuffer.mark(); + target.put(tagBuffer); + tagBuffer.reset(); + target.flip(); + return target; + } + + /** + * Copy the inflected word's binary data (no charset decoding) to a custom + * byte buffer. If the buffer is null or not large enough to hold the + * result, a new buffer is allocated. + * + * @param target + * Target byte buffer to copy the word buffer to or + * null if a new buffer should be allocated. + * + * @return Returns target or the new reallocated buffer. + */ + public ByteBuffer getWordBytes(ByteBuffer target) { + target = BufferUtils.ensureCapacity(target, wordBuffer.remaining()); + wordBuffer.mark(); + target.put(wordBuffer); + wordBuffer.reset(); + target.flip(); + return target; + } + + /** + * @return Return tag data decoded to a character sequence or + * null if no associated tag data exists. + */ + public CharSequence getTag() { + tagCharSequence = decode(tagBuffer, tagCharSequence); + return tagCharSequence.remaining() == 0 ? null : tagCharSequence; + } + + /** + * @return Return stem data decoded to a character sequence or + * null if no associated stem data exists. + */ + public CharSequence getStem() { + stemCharSequence = decode(stemBuffer, stemCharSequence); + return stemCharSequence.remaining() == 0 ? 
null : stemCharSequence; + } + + /** + * @return Return inflected word form data. Usually the parameter passed to + * {@link DictionaryLookup#lookup(CharSequence)}. + */ + public CharSequence getWord() { + return wordCharSequence; + } + + /* + * + */ + @Override + public boolean equals(Object obj) { + throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE); + } + + /* + * + */ + @Override + public int hashCode() { + throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE); + } + + @Override + public String toString() { + return "WordData[" + + this.getWord() + "," + + this.getStem() + "," + + this.getTag() + "]"; + } + + /** + * Declare a covariant of {@link Object#clone()} that returns a deep copy of + * this object. The content of all internal buffers is copied. + */ + @Override + protected WordData clone() { + final WordData clone = new WordData(this.decoder); + clone.wordCharSequence = cloneCharSequence(wordCharSequence); + clone.wordBuffer = getWordBytes(null); + clone.stemBuffer = getStemBytes(null); + clone.tagBuffer = getTagBytes(null); + return clone; + } + + /** + * Clone char sequences only if not immutable. + */ + private CharSequence cloneCharSequence(CharSequence chs) { + if (chs instanceof String) + return chs; + return chs.toString(); + } + + /** + * Reset internal structures for storing another word's data. + */ + void reset() { + this.wordCharSequence = null; + this.wordBuffer = null; + this.stemCharSequence.clear(); + this.tagCharSequence.clear(); + this.stemBuffer.clear(); + this.tagBuffer.clear(); + } + + /** + * Decode byte buffer, optionally expanding the char buffer to. 
+ */ + private CharBuffer decode(ByteBuffer bytes, CharBuffer chars) { + chars.clear(); + final int maxCapacity = (int) (bytes.remaining() * decoder.maxCharsPerByte()); + if (chars.capacity() <= maxCapacity) { + chars = CharBuffer.allocate(maxCapacity); + } + + bytes.mark(); + decoder.reset(); + decoder.decode(bytes, chars, true); + chars.flip(); + bytes.reset(); + + return chars; + } +} diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java new file mode 100644 index 0000000..1fd4e62 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java @@ -0,0 +1,247 @@ +package morfologik.stemming; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; + +import morfologik.fsa.FSA; +import morfologik.fsa.FSABuilder; +import morfologik.fsa.FSAUtils; + +import org.junit.Test; + +/* + * + */ +public class DictionaryLookupTest { + /* */ + @Test + public void testPrefixDictionaries() throws IOException { + final URL url = this.getClass().getResource("test-prefix.dict"); + final IStemmer s = new DictionaryLookup(Dictionary.read(url)); + + assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, + stem(s, "Rzeczypospolitej")); + assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, + stem(s, "Rzecząpospolitą")); + + // This word is not in the dictionary. 
+ assertNoStemFor(s, "martygalski"); + } + + @Test + public void testInputConversion() throws IOException { + final URL url = this.getClass().getResource("test-prefix.dict"); + final IStemmer s = new DictionaryLookup(Dictionary.read(url)); + + assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, + stem(s, "Rzecz\\apospolit\\a")); + + assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, + stem(s, "krowa\\apospolit\\a")); + } + + /* */ + @Test + public void testInfixDictionaries() throws IOException { + final URL url = this.getClass().getResource("test-infix.dict"); + final IStemmer s = new DictionaryLookup(Dictionary.read(url)); + + assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, + stem(s, "Rzeczypospolitej")); + assertArrayEquals(new String[] { "Rzeczycki", "adj:pl:nom:m" }, stem(s, + "Rzeczyccy")); + assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, + stem(s, "Rzecząpospolitą")); + + // This word is not in the dictionary. + assertNoStemFor(s, "martygalski"); + assertNoStemFor(s, "Rzeczyckiõh"); + } + + /* */ + @Test + public void testWordDataIterator() throws IOException { + final URL url = this.getClass().getResource("test-infix.dict"); + final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); + + final HashSet entries = new HashSet(); + for (WordData wd : s) { + entries.add(wd.getWord() + " " + wd.getStem() + " " + wd.getTag()); + } + + // Make sure a sample of the entries is present. 
+ assertTrue(entries.contains("Rzekunia Rzekuń subst:sg:gen:m")); + assertTrue(entries + .contains("Rzeczkowskie Rzeczkowski adj:sg:nom.acc.voc:n+adj:pl:acc.nom.voc:f.n")); + assertTrue(entries + .contains("Rzecząpospolitą Rzeczpospolita subst:irreg")); + assertTrue(entries + .contains("Rzeczypospolita Rzeczpospolita subst:irreg")); + assertTrue(entries + .contains("Rzeczypospolitych Rzeczpospolita subst:irreg")); + assertTrue(entries + .contains("Rzeczyckiej Rzeczycki adj:sg:gen.dat.loc:f")); + } + + /* */ + @Test + public void testWordDataCloning() throws IOException { + final URL url = this.getClass().getResource("test-infix.dict"); + final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); + + ArrayList words = new ArrayList(); + for (WordData wd : s) { + WordData clone = wd.clone(); + words.add(clone); + } + + // Reiterate and verify that we have the same entries. + final DictionaryLookup s2 = new DictionaryLookup(Dictionary.read(url)); + int i = 0; + for (WordData wd : s2) { + WordData clone = words.get(i++); + assertEqualSequences(clone.getStem(), wd.getStem()); + assertEqualSequences(clone.getTag(), wd.getTag()); + assertEqualSequences(clone.getWord(), wd.getWord()); + assertEqualSequences(clone.wordCharSequence, wd.wordCharSequence); + } + + // Check collections contract. + final HashSet entries = new HashSet(); + try { + entries.add(words.get(0)); + fail(); + } catch (RuntimeException e) { + // Expected. 
+ } + } + + private void assertEqualSequences(CharSequence s1, CharSequence s2) { + assertEquals(s1.toString(), s2.toString()); + } + + /* */ + @Test + public void testMultibyteEncodingUTF8() throws IOException { + final URL url = this.getClass().getResource("test-diacritics-utf8.dict"); + Dictionary read = Dictionary.read(url); + final IStemmer s = new DictionaryLookup(read); + + for (byte[] ba : FSAUtils.rightLanguage(read.fsa, read.fsa.getRootNode())) { + System.out.println(new String(ba, "UTF-8")); + } + + assertArrayEquals(new String[] { "merge", "001" }, stem(s, "mergeam")); + assertArrayEquals(new String[] { "merge", "002" }, stem(s, "merseserăm")); + } + + /* */ + @Test + public void testSynthesis() throws IOException { + final URL url = this.getClass().getResource("test-synth.dict"); + final IStemmer s = new DictionaryLookup(Dictionary.read(url)); + + assertArrayEquals(new String[] { "miała", null }, stem(s, + "mieć|verb:praet:sg:ter:f:?perf")); + assertArrayEquals(new String[] { "a", null }, stem(s, "a|conj")); + assertArrayEquals(new String[] {}, stem(s, "dziecko|subst:sg:dat:n")); + + // This word is not in the dictionary. + assertNoStemFor(s, "martygalski"); + } + + /* */ + @Test + public void testInputWithSeparators() throws IOException { + final URL url = this.getClass().getResource("test-separators.dict"); + final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); + + /* + * Attemp to reconstruct input sequences using WordData iterator. 
+ */ + ArrayList sequences = new ArrayList(); + for (WordData wd : s) { + sequences.add("" + wd.getWord() + " " + wd.getStem() + " " + + wd.getTag()); + } + Collections.sort(sequences); + + assertEquals("token1 null null", sequences.get(0)); + assertEquals("token2 null null", sequences.get(1)); + assertEquals("token3 null +", sequences.get(2)); + assertEquals("token4 token2 null", sequences.get(3)); + assertEquals("token5 token2 null", sequences.get(4)); + assertEquals("token6 token2 +", sequences.get(5)); + assertEquals("token7 token2 token3+", sequences.get(6)); + assertEquals("token8 token2 token3++", sequences.get(7)); + } + + /* */ + @Test + public void testSeparatorInLookupTerm() throws IOException { + FSA fsa = FSABuilder.build(toBytes("iso8859-1", new String [] { + "l+A+LW", + "l+A+NN1d", + })); + + DictionaryMetadata metadata = new DictionaryMetadataBuilder() + .separator('+') + .encoding("iso8859-1") + .encoder(EncoderType.INFIX) + .build(); + + final DictionaryLookup s = new DictionaryLookup(new Dictionary(fsa, metadata)); + assertEquals(0, s.lookup("l+A").size()); + } + + /* */ + @Test + public void testGetSeparator() throws IOException { + final URL url = this.getClass().getResource("test-separators.dict"); + final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); + assertEquals('+', s.getSeparatorChar()); + } + + private static byte[][] toBytes(String charset, String[] strings) { + byte [][] out = new byte [strings.length][]; + for (int i = 0; i < strings.length; i++) { + try { + out[i] = strings[i].getBytes(charset); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + return out; + } + + /* */ + public static String asString(CharSequence s) { + if (s == null) + return null; + return s.toString(); + } + + /* */ + public static String[] stem(IStemmer s, String word) { + ArrayList result = new ArrayList(); + for (WordData wd : s.lookup(word)) { + result.add(asString(wd.getStem())); + 
result.add(asString(wd.getTag())); + } + return result.toArray(new String[result.size()]); + } + + /* */ + public static void assertNoStemFor(IStemmer s, String word) { + assertArrayEquals(new String[] {}, stem(s, word)); + } +} diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java new file mode 100644 index 0000000..32e7fc7 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java @@ -0,0 +1,49 @@ +package morfologik.stemming; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.fest.assertions.api.Assertions; +import org.junit.Test; + +/* + * + */ +public class DictionaryMetadataBuilderTest { + /* */ + @Test + public void testAllConstantsHaveBuilderMethods() throws IOException { + Set keySet = new DictionaryMetadataBuilder() + .convertCase() + .encoding(Charset.defaultCharset()) + .encoding("UTF-8") + .frequencyIncluded() + .ignoreAllUppercase() + .ignoreCamelCase() + .ignoreDiacritics() + .ignoreNumbers() + .ignorePunctuation() + .separator('+') + .supportRunOnWords() + .encoder(EncoderType.SUFFIX) + .withEquivalentChars(Collections.>emptyMap()) + .withReplacementPairs(Collections.>emptyMap()) + .withInputConversionPairs(Collections.emptyMap()) + .withOutputConversionPairs(Collections.emptyMap()) + .locale(Locale.getDefault()) + .license("") + .author("") + .creationDate("") + .toMap().keySet(); + + Set all = EnumSet.allOf(DictionaryAttribute.class); + all.removeAll(keySet); + + Assertions.assertThat(all).isEmpty(); + } +} diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java new file mode 100644 index 
0000000..13c61d7 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java @@ -0,0 +1,27 @@ +package morfologik.stemming; + +import static org.junit.Assert.assertEquals; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.Test; + +/* + * + */ +public class DictionaryTest { + /* */ + + @Test + public void testConvertText() { + Map conversion = new HashMap(); + conversion.put("'", "`"); + conversion.put("fi", "fi"); + conversion.put("\\a", "ą"); + conversion.put("Barack", "George"); + assertEquals("filut", Dictionary.convertText("filut", conversion)); + assertEquals("fizdrygałką", Dictionary.convertText("fizdrygałk\\a", conversion)); + assertEquals("George Bush", Dictionary.convertText("Barack Bush", conversion)); + } +} diff --git a/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java new file mode 100644 index 0000000..e8c6c17 --- /dev/null +++ b/morfologik-stemming/src/test/java/morfologik/stemming/StringDecoderBenchmarkTest.java @@ -0,0 +1,62 @@ +package morfologik.stemming; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; + +import org.junit.Ignore; +import org.junit.Test; + +import com.carrotsearch.junitbenchmarks.AbstractBenchmark; +import com.carrotsearch.junitbenchmarks.BenchmarkOptions; + +@BenchmarkOptions(callgc = false, warmupRounds = 5, benchmarkRounds = 20) +@Ignore +public class StringDecoderBenchmarkTest extends AbstractBenchmark { + /* Guard against escape analysis and HotSpot opts. 
*/ + public volatile int guard; + + private final int sequences = 1000000; + + final String input = "dbaoidbhoei"; + final CharBuffer chars = CharBuffer.allocate(100); + final ByteBuffer bytes = ByteBuffer.allocate(100); + final CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder(); + + /** + * This is a simple comparison of performance converting a string to bytes + * using String.getBytes and CharsetEncoder (which String.getBytes uses + * internally in SUN's JDK). + */ + @Test + public void stringGetBytes() throws Exception { + int guard = 0; + for (int i = 0; i < sequences; i++) { + guard += input.getBytes("UTF-8").length; + } + this.guard = guard; + } + + @Test + public void charsetEncoder() throws Exception { + int guard = 0; + for (int i = 0; i < sequences; i++) { + chars.clear(); + for (int j = 0; j < input.length(); j++) { + chars.put(input.charAt(j)); + } + chars.flip(); + + bytes.clear(); + chars.mark(); + encoder.encode(chars, bytes, true); + bytes.flip(); + chars.reset(); + + guard += chars.remaining(); + } + + this.guard = guard; + } +} diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict new file mode 100644 index 0000000..2a62f21 Binary files /dev/null and b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.dict differ diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info new file mode 100644 index 0000000..058aed2 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info @@ -0,0 +1,9 @@ +# +# Dictionary properties. 
+# + +fsa.dict.separator=+ +fsa.dict.encoding=UTF-8 + +fsa.dict.uses-prefixes=false +fsa.dict.uses-infixes=false diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict new file mode 100644 index 0000000..cc91f70 Binary files /dev/null and b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.dict differ diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info new file mode 100644 index 0000000..535fac3 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info @@ -0,0 +1,8 @@ +# +# Dictionary properties. +# + +fsa.dict.separator=+ +fsa.dict.encoding=iso-8859-2 + +fsa.dict.uses-infixes=true \ No newline at end of file diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict new file mode 100644 index 0000000..d0bed4c Binary files /dev/null and b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.dict differ diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info new file mode 100644 index 0000000..520739e --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info @@ -0,0 +1,11 @@ +# +# Dictionary properties. 
+# + +fsa.dict.separator=+ +fsa.dict.encoding=iso-8859-2 + +fsa.dict.uses-prefixes=true +fsa.dict.uses-infixes=false + +fsa.dict.input-conversion=\\a ą, krowa Rzecz \ No newline at end of file diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict new file mode 100644 index 0000000..a71b9e7 Binary files /dev/null and b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.dict differ diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info new file mode 100644 index 0000000..8ec14c3 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info @@ -0,0 +1,10 @@ +# +# Dictionary properties. +# + +fsa.dict.separator=+ +fsa.dict.encoding=iso8859-1 + +fsa.dict.uses-prefixes=false +fsa.dict.uses-infixes=false +fsa.dict.uses-suffixes=false diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt new file mode 100644 index 0000000..cd77945 --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt @@ -0,0 +1,8 @@ +token1+ +token2++ +token3+++ +token4+token2 +token5+token2+ +token6+token2++ +token7+token2+token3+ +token8+token2+token3++ \ No newline at end of file diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict new file mode 100644 index 0000000..6890253 Binary files /dev/null and b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.dict differ diff --git a/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info 
b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info new file mode 100644 index 0000000..ffce33e --- /dev/null +++ b/morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info @@ -0,0 +1,6 @@ +# +# Dictionary properties. +# + +fsa.dict.separator=+ +fsa.dict.encoding=iso-8859-2 \ No newline at end of file diff --git a/morfologik-tools/pom.xml b/morfologik-tools/pom.xml new file mode 100644 index 0000000..cafbb46 --- /dev/null +++ b/morfologik-tools/pom.xml @@ -0,0 +1,159 @@ + + + + + 4.0.0 + + + org.carrot2 + morfologik-parent + 1.9.0 + ../pom.xml + + + morfologik-tools + jar + + Morfologik Command Line Tools + Morfologik Command Line Tools + + + + org.carrot2 + morfologik-fsa + ${project.version} + + + + com.carrotsearch + hppc + + + + org.carrot2 + morfologik-stemming + ${project.version} + + + + org.carrot2 + morfologik-polish + ${project.version} + + + + commons-cli + commons-cli + + + + commons-lang + commons-lang + + + + junit + junit + test + + + + com.google.guava + guava + test + + + + com.carrotsearch.randomizedtesting + randomizedtesting-runner + test + + + + org.easytesting + fest-assert-core + test + + + + + install + + + + org.apache.maven.plugins + maven-jar-plugin + + + + morfologik.tools.Launcher + true + + + + + + + com.pyx4me + proguard-maven-plugin + 2.0.4 + + + package + + proguard + + + + + + + + org.carrot2 + morfologik-fsa + + + org.carrot2 + morfologik-stemming + + + org.carrot2 + morfologik-polish + + + com.carrotsearch + hppc + + + commons-cli + commons-cli + + + commons-lang + commons-lang + + + + + true + true + standalone + true + ${project.build.directory} + ${basedir}/src/proguard/rules.pro + + + + + net.sf.proguard + proguard + 4.6-customized + system + ${project.basedir}/lib/proguard.jar + + + + + + diff --git a/morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java b/morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java new file mode 100644 index 0000000..687b6cb --- 
/dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/FSABuildTool.java @@ -0,0 +1,541 @@ +package morfologik.tools; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; + +import morfologik.fsa.CFSA2Serializer; +import morfologik.fsa.FSA; +import morfologik.fsa.FSA5Serializer; +import morfologik.fsa.FSABuilder; +import morfologik.fsa.FSAFlags; +import morfologik.fsa.FSAInfo; +import morfologik.fsa.FSASerializer; +import morfologik.fsa.FSAUtils; +import morfologik.fsa.IMessageLogger; +import morfologik.fsa.StateVisitor; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.lang.StringEscapeUtils; + +import com.carrotsearch.hppc.IntIntOpenHashMap; +import com.carrotsearch.hppc.cursors.IntIntCursor; + +/** + * Convert from plain text input to a serialized FSA in any of the + * available {@link Format}s. + */ +public final class FSABuildTool extends Tool { + /** + * One megabyte. + */ + private final static int MB = 1024 * 1024; + + /** + * The serialization format to use for the binary output. + */ + public enum Format { + FSA5, + CFSA2; + + public FSASerializer getSerializer() { + switch (this) { + case FSA5: + return new FSA5Serializer(); + + case CFSA2: + return new CFSA2Serializer(); + + default: + throw new RuntimeException(); + } + } + } + + /** + * Be more verbose about progress. + */ + private boolean printProgress; + + /** + * Serializer used for emitting the FSA. 
+ */ + private FSASerializer serializer; + + /** + * Output format name. + */ + private Format format; + + /** + * Warn about CR characters in the input (usually not what you want). + */ + private boolean crWarning = false; + + /** + * If true, the input is not buffered and sorted in-memory, but + * must be sorted externally (using the "C" convention: unsigned byte values). + */ + private boolean inputSorted; + + /** + * Print additional statistics about the output automaton. + */ + private boolean statistics; + + /** + * The actual construction of the FSA. + */ + private FSABuilder builder = new FSABuilder(); + + /** + * Start time. + */ + private long start = System.currentTimeMillis(); + + private IMessageLogger logger; + + /** + * Gets fed with the lines read from the input. + */ + private static interface LineConsumer { + /** + * Process the buffer, return the same buffer or a new buffer (for + * swapping). + */ + byte[] process(byte[] buffer, int pos); + } + + /** + * To help break out of the anonymous delegate on error. + */ + @SuppressWarnings("serial") + private static class TerminateProgramException extends RuntimeException { + public TerminateProgramException(String msg) { + super(msg); + } + + public synchronized Throwable fillInStackTrace() { + return null; + } + } + + /** + * Command line entry point after parsing arguments. + */ + protected void go(CommandLine line) throws Exception { + String[] args = line.getArgs(); + if (args.length != 0) { + printUsage(); + return; + } + + // Parse the input options. 
+ parseOptions(line); + + logger = new WriterMessageLogger(new PrintWriter(System.err)); + this.serializer.withLogger(logger); + + BufferedInputStream inputStream = null; + try { + inputStream = initializeInput(line); + + if (inputSorted) { + logger.log("Assuming input is already sorted"); + } + + checkUtf8Bom(inputStream); + + final FSA fsa; + if (inputSorted) { + fsa = processSortedInput(inputStream); + } else { + fsa = processUnsortedInput(inputStream); + } + + if (crWarning) logger.log("Warning: input contained carriage returns?"); + + if (statistics) { + logger.startPart("Statistics"); + FSAInfo info = new FSAInfo(fsa); + TreeMap fanout = FSAUtils.calculateFanOuts(fsa, fsa.getRootNode()); + logger.endPart(); + + final IntIntOpenHashMap numbers = new IntIntOpenHashMap(); + fsa.visitInPostOrder(new StateVisitor() { + public boolean accept(int state) { + int thisNodeNumber = 0; + for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { + thisNodeNumber += + (fsa.isArcFinal(arc) ? 1 : 0) + + (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc))); + } + numbers.put(state, thisNodeNumber); + return true; + } + }); + + int singleRLC = 0; + for (IntIntCursor c : numbers) { + if (c.value == 1) singleRLC++; + } + + logger.log("Nodes", info.nodeCount); + logger.log("Arcs", info.arcsCount); + logger.log("Tail nodes", singleRLC); + + logger.log("States with the given # of outgoing arcs:"); + for (Map.Entry e : fanout.entrySet()) { + logger.log(" #" + e.getKey(), e.getValue()); + } + + logger.log("FSA builder properties:"); + for (Map.Entry e : builder.getInfo().entrySet()) { + logger.log(e.getKey().toString(), e.getValue()); + } + } + + // Save the result. + logger.startPart("Serializing " + format); + serializer.serialize(fsa, initializeOutput(line)).close(); + logger.endPart(); + } catch (OutOfMemoryError e) { + logger.log("Error: Out of memory. 
Pass -Xmx1024m argument (or more) to java."); + } finally { + if (inputStream != System.in && inputStream != null) { + inputStream.close(); + } + } + } + + /** + * Warn in case UTF-8 BOM is detected as this is 99% a mistake. + */ + private void checkUtf8Bom(InputStream is) throws IOException { + if (!is.markSupported()) { + // throw a hard assertion. + throw new AssertionError("Mark should be supported on input stream."); + } + + is.mark(3); + if (is.read() == 0xef && + is.read() == 0xbb && + is.read() == 0xbf) { + System.err.println("Warning: input starts with UTF-8 BOM bytes which is" + + " most likely not what you want. Use header-less UTF-8 file (unless you are" + + " encoding plain bytes in which case this message doesn't apply)."); + } + is.reset(); + } + + /** + * Process unsorted input (sort and construct FSA). + */ + private FSA processUnsortedInput(InputStream inputStream) + throws IOException { + final FSA root; + logger.startPart("Reading input"); + final ArrayList input = readInput(inputStream); + logger.endPart(); + + logger.log("Input sequences", input.size()); + + logger.startPart("Sorting"); + Collections.sort(input, FSABuilder.LEXICAL_ORDERING); + logger.endPart(); + + logger.startPart("Building FSA"); + for (byte [] bb : input) + builder.add(bb, 0, bb.length); + root = builder.complete(); + logger.endPart(); + return root; + } + + /** + * + */ + private FSA processSortedInput(InputStream inputStream) + throws IOException { + + int lines = forAllLines(inputStream, new LineConsumer() { + private byte [] current; + private byte [] previous = null; + private int previousLen; + + public byte[] process(byte[] current, int currentLen) { + // Verify the order. 
+ if (previous != null) { + if (FSABuilder.compare(previous, 0, previousLen, current, 0, currentLen) > 0) { + logger.log("\n\nERROR: The input is not sorted: \n" + + dumpLine(previous, previousLen) + "\n" + + dumpLine(current, currentLen)); + throw new TerminateProgramException("Input is not sorted."); + } + } + + // Add to the automaton. + builder.add(current, 0, currentLen); + + // Swap buffers. + this.current = previous != null ? previous : new byte [current.length]; + this.previous = current; + this.previousLen = currentLen; + + return this.current; + } + }); + + logger.startPart("Building FSA"); + FSA fsa = builder.complete(); + logger.endPart(); + logger.log("Input sequences", lines); + + return fsa; + } + + /** + * Dump input line, byte-by-byte. + */ + protected String dumpLine(byte[] line, int length) { + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < length; i++) { + if (i > 0) builder.append(" "); + builder.append(String.format("%02x", line[i])); + } + builder.append(" | "); + for (int i = 0; i < length; i++) { + if (Character.isLetterOrDigit(line[i])) + builder.append((char) line[i]); + else + builder.append("."); + } + return builder.toString(); + } + + /** + * Parse input options. 
+ */ + private void parseOptions(CommandLine line) { + String opt; + + opt = SharedOptions.outputFormatOption.getOpt(); + if (line.hasOption(opt)) { + String formatValue = line.getOptionValue(opt); + try { + format = Format.valueOf(formatValue.toUpperCase()); + } catch (IllegalArgumentException e) { + throw new TerminateProgramException("Not a valid format: " + + formatValue); + } + } else { + format = Format.FSA5; + } + serializer = format.getSerializer(); + + Charset defaultCharset = Charset.defaultCharset(); + opt = SharedOptions.fillerCharacterOption.getLongOpt(); + if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) { + String chr = StringEscapeUtils.unescapeJava(line.getOptionValue(opt)); + checkSingleByte(chr, defaultCharset); + serializer.withFiller(chr.getBytes()[0]); + } + + opt = SharedOptions.annotationSeparatorCharacterOption.getLongOpt(); + if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) { + String chr = StringEscapeUtils.unescapeJava(line.getOptionValue(opt)); + checkSingleByte(chr, defaultCharset); + serializer.withAnnotationSeparator(chr.getBytes()[0]); + } + + opt = SharedOptions.withNumbersOption.getOpt(); + if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.NUMBERS)) { + serializer.withNumbers(); + } + + opt = SharedOptions.progressOption.getLongOpt(); + if (line.hasOption(opt)) { + printProgress = true; + } + + opt = SharedOptions.inputSortedOption.getLongOpt(); + if (line.hasOption(opt)) { + inputSorted = true; + } + + opt = SharedOptions.statistics.getLongOpt(); + if (line.hasOption(opt)) { + statistics = true; + } + } + + private boolean requiredCapability(String opt, FSAFlags flag) { + if (!serializer.getFlags().contains(flag)) { + throw new RuntimeException("This serializer does not support option: " + opt); + } + return true; + } + + /** + * Check if the argument is a single byte after conversion using platform-default + * encoding. 
+ */ + public static byte checkSingleByte(String chr, Charset charset) { + byte bytes [] = chr.getBytes(charset); + if (bytes.length == 1) + return bytes[0]; + + throw new IllegalArgumentException("Filler and annotation characters must be single" + + "-byte values, " + chr + " has " + chr.getBytes().length + " bytes."); + } + + /** + * Read all the input lines, unsorted. + */ + private ArrayList readInput(InputStream is) throws IOException { + final ArrayList result = new ArrayList(); + forAllLines(is, new LineConsumer() { + public byte[] process(byte[] buffer, int pos) { + result.add(java.util.Arrays.copyOf(buffer, pos)); + return buffer; + } + }); + return result; + } + + /** + * Apply line consumer to all non-empty lines. + */ + private int forAllLines(InputStream is, LineConsumer lineConsumer) throws IOException { + int lines = 0; + byte[] buffer = new byte[0]; + int line = 0, b, pos = 0; + while ((b = is.read()) != -1) { + if (b == '\r' && !crWarning) { + crWarning = true; + } + + if (b == '\n') { + if (pos > 0) { + buffer = lineConsumer.process(buffer, pos); + pos = 0; + lines++; + } + + if (printProgress && line++ > 0 && (line % 1000000) == 0) { + logger.log(String.format(Locale.ENGLISH, "%6.2fs, sequences: %d", elapsedTime(), line)); + } + } else { + if (pos >= buffer.length) { + buffer = java.util.Arrays.copyOf(buffer, buffer.length + 10); + } + buffer[pos++] = (byte) b; + } + } + + if (pos > 0) { + lineConsumer.process(buffer, pos); + lines++; + } + + return lines; + } + + private double elapsedTime() { + return (System.currentTimeMillis() - start) / 1000.0d; + } + + @Override + protected void printUsage() { + final HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(this.getClass().getName(), options, true); + } + + @Override + protected void initializeOptions(Options options) { + options.addOption(SharedOptions.inputFileOption); + options.addOption(SharedOptions.outputFileOption); + + options.addOption(SharedOptions.outputFormatOption); 
+ + options.addOption(SharedOptions.fillerCharacterOption); + options.addOption(SharedOptions.annotationSeparatorCharacterOption); + + options.addOption(SharedOptions.withNumbersOption); + options.addOption(SharedOptions.progressOption); + + options.addOption(SharedOptions.inputSortedOption); + + options.addOption(SharedOptions.statistics); + } + + /** + * + */ + @SuppressWarnings("resource") + private static OutputStream initializeOutput(CommandLine line) + throws IOException, ParseException { + final OutputStream output; + final String opt = SharedOptions.outputFileOption.getOpt(); + if (line.hasOption(opt)) { + // Use output file. + output = new FileOutputStream((File) line.getParsedOptionValue(opt)); + } else { + // Use standard output. + output = System.out; + } + return new BufferedOutputStream(output); + } + + /** + * + */ + @SuppressWarnings("resource") + private BufferedInputStream initializeInput(CommandLine line) + throws IOException, ParseException { + final InputStream input; + final String opt = SharedOptions.inputFileOption.getOpt(); + + if (line.hasOption(opt)) { + // Use input file. + File inputFile = (File) line.getParsedOptionValue(opt); + if (!inputSorted && inputFile.length() > 20 * MB) { + logger.log("WARN: The input file is quite large, avoid\n" + + " in-memory sorting by piping pre-sorted\n" + + " input directly to fsa_build. Linux:\n" + + " export LC_ALL=C && \\\n" + + " sort input | \\\n" + + " java -jar morfologik.jar fsa_build --sorted -o dict.fsa"); + } + + input = new FileInputStream(inputFile); + } else { + // Use standard input. + input = System.in; + } + return new BufferedInputStream(input); + } + + /** + * Command line entry point. + */ + public static void main(String... 
args) throws Exception { + final FSABuildTool tool = new FSABuildTool(); + tool.go(args); + } +} \ No newline at end of file diff --git a/morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java b/morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java new file mode 100644 index 0000000..510d8e5 --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/FSADumpTool.java @@ -0,0 +1,287 @@ +package morfologik.tools; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.LinkedHashMap; +import java.util.Locale; +import java.util.Map; + +import morfologik.fsa.FSA; +import morfologik.fsa.FSA5; +import morfologik.fsa.FSAInfo; +import morfologik.fsa.FSAUtils; +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryAttribute; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.WordData; +import morfologik.util.FileUtils; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Options; + +/** + * This utility will dump the information and contents of a given {@link FSA} + * dictionary. It can dump dictionaries in the raw form (as fed to the + * fsa_build program) or decoding compressed stem forms. + */ +public final class FSADumpTool extends Tool { + /** + * Direct binary stream used for dictionary dumps. + */ + private OutputStream os; + + /** + * A writer for messages and any text-based output. + */ + private Writer w; + + /** + * Print raw data only, no headers. + */ + private boolean dataOnly; + + /** + * Decode from prefix/infix/suffix encodings. + */ + private boolean decode; + + /** + * Dump graphviz DOT file instead of automaton sequences. 
+ */ + private boolean dot; + + /** + * Command line entry point after parsing arguments. + */ + protected void go(CommandLine line) throws Exception { + final File dictionaryFile = (File) line + .getParsedOptionValue(SharedOptions.fsaDictionaryFileOption + .getOpt()); + + dataOnly = line.hasOption(SharedOptions.dataOnly.getOpt()); + decode = line.hasOption(SharedOptions.decode.getOpt()); + dot = line.hasOption(SharedOptions.dot.getLongOpt()); + + FileUtils.assertExists(dictionaryFile, true, false); + + dump(dictionaryFile); + } + + /** + * Dumps the content of a dictionary to a file. + */ + private void dump(File dictionaryFile) + throws UnsupportedEncodingException, IOException { + final long start = System.currentTimeMillis(); + + final Dictionary dictionary; + final FSA fsa; + + if (!dictionaryFile.canRead()) { + printWarning("Dictionary file does not exist: " + + dictionaryFile.getAbsolutePath()); + return; + } + + this.os = new BufferedOutputStream(System.out, 1024 * 32); + this.w = new OutputStreamWriter(os, "UTF-8"); + + if (hasMetadata(dictionaryFile)) { + dictionary = Dictionary.read(dictionaryFile); + fsa = dictionary.fsa; + + final String encoding = dictionary.metadata.getEncoding(); + if (!Charset.isSupported(encoding)) { + printWarning("Dictionary's charset is not supported " + + "on this JVM: " + encoding); + return; + } + } else { + dictionary = null; + fsa = FSA.read(new FileInputStream(dictionaryFile)); + printWarning("Warning: FSA automaton without metadata file."); + } + + printExtra("FSA properties"); + printExtra("--------------"); + printExtra("FSA implementation : " + fsa.getClass().getName()); + printExtra("Compiled with flags : " + fsa.getFlags().toString()); + + if (!dataOnly) { + final FSAInfo info = new FSAInfo(fsa); + printExtra("Number of arcs : " + info.arcsCount + "/" + info.arcsCountTotal); + printExtra("Number of nodes : " + info.nodeCount); + printExtra("Number of final states : " + info.finalStatesCount); + printExtra(""); + } + 
+ // Separator for dumping. + char separator = '\t'; + + if (fsa instanceof FSA5) { + printExtra("FSA5 properties"); + printExtra("---------------"); + printFSA5((FSA5) fsa); + printExtra(""); + } + + if (dictionary != null) { + printExtra("Dictionary metadata"); + printExtra("-------------------"); + + Map values = + new LinkedHashMap(dictionary.metadata.getAttributes()); + values.put(DictionaryAttribute.ENCODING, dictionary.metadata.getEncoding()); + values.put(DictionaryAttribute.SEPARATOR, "0x" + + Integer.toHexString(dictionary.metadata.getSeparator()) + + " ('" + dictionary.metadata.getSeparatorAsChar() + "')"); + + for (Map.Entry e : values.entrySet()) { + printExtra(String.format(Locale.ENGLISH, + "%-40s: %s", + e.getKey().propertyName, + e.getValue())); + } + printExtra(""); + } + + int sequences = 0; + if (decode) { + if (dictionary == null) { + printWarning("No dictionary metadata available."); + return; + } + + printExtra("Decoded FSA data (in the encoding above)"); + printExtra("----------------------------------------"); + + final DictionaryLookup dl = new DictionaryLookup(dictionary); + final StringBuilder builder = new StringBuilder(); + final OutputStreamWriter osw = new OutputStreamWriter(os, dictionary.metadata.getEncoding()); + + CharSequence t; + for (WordData wd : dl) { + builder.setLength(0); + builder.append(wd.getWord()); + builder.append(separator); + + t = wd.getStem(); + if (t == null) + t = ""; + builder.append(t); + builder.append(separator); + + t = wd.getTag(); + if (t == null) + t = ""; + builder.append(t); + builder.append('\n'); + + osw.write(builder.toString()); + sequences++; + } + osw.flush(); + } else { + if (dot) { + FSAUtils.toDot(w, fsa, fsa.getRootNode()); + w.flush(); + } else { + printExtra("FSA data (raw bytes in the encoding above)"); + printExtra("------------------------------------------"); + + for (ByteBuffer bb : fsa) { + os.write(bb.array(), 0, bb.remaining()); + os.write(0x0a); + sequences++; + } + } + } + + 
printExtra("--------------------"); + + final long millis = Math.max(1, System.currentTimeMillis() - start); + printExtra(String + .format( + Locale.ENGLISH, + "Dictionary dumped in %.3f second(s), %d sequences (%d sequences/sec.).", + millis / 1000.0, sequences, + (int) (sequences / (millis / 1000.0)))); + + os.flush(); + } + + /** + * Print {@link FSA5}-specific stuff. + */ + private void printFSA5(FSA5 fsa) throws IOException { + printExtra("GTL : " + fsa.gtl); + printExtra("Node extra data : " + fsa.nodeDataLength); + printExtra("Annotation separator : " + byteAsChar(fsa.annotation)); + printExtra("Filler character : " + byteAsChar(fsa.filler)); + } + + /** + * Convert a byte to a character, no charset decoding, simple ASCII range mapping. + */ + private char byteAsChar(byte v) { + char chr = (char) (v & 0xff); + if (chr < 127) + return chr; + else + return '?'; + } + + /* + * + */ + private void printExtra(String msg) throws IOException { + if (dataOnly) + return; + w.write(msg); + w.write('\n'); + w.flush(); + } + + /* + * + */ + private void printWarning(String msg) { + System.err.println(msg); + } + + /** + * Check if there is a metadata file for the given FSA automaton. + */ + private static boolean hasMetadata(File fsaFile) { + final File featuresFile = new File(fsaFile.getParent(), Dictionary + .getExpectedFeaturesName(fsaFile.getName())); + + return featuresFile.canRead(); + } + + /** + * Command line options for the tool. + */ + protected void initializeOptions(Options options) { + options.addOption(SharedOptions.fsaDictionaryFileOption); + options.addOption(SharedOptions.dataOnly); + options.addOption(SharedOptions.decode); + options.addOption(SharedOptions.dot); + } + + /** + * Command line entry point. + */ + public static void main(String... 
args) throws Exception { + final FSADumpTool fsaDump = new FSADumpTool(); + fsaDump.go(args); + } +} \ No newline at end of file diff --git a/morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java b/morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java new file mode 100644 index 0000000..e913b7f --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/InflectionFramesTool.java @@ -0,0 +1,112 @@ +package morfologik.tools; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.*; +import java.util.*; +import java.util.Map.Entry; + +import morfologik.stemming.*; +import morfologik.stemming.Dictionary; + +/** + * Calculate inflection frames from the Polish dictionary. + */ +public class InflectionFramesTool { + public static void main(String[] args) throws IOException { + new InflectionFramesTool().inflectionFrames(); + } + + /* */ + @SuppressWarnings( { "unused" }) + public void inflectionFrames() throws IOException { + final Dictionary pl = Dictionary.getForLanguage("pl"); + final DictionaryLookup dict = new DictionaryLookup(pl); + final CharsetDecoder decoder = pl.metadata.getDecoder(); + + final HashMap> forms = + new HashMap>(); + + ByteBuffer stemBuffer = ByteBuffer.allocate(0); + ByteBuffer inflBuffer = ByteBuffer.allocate(0); + ByteBuffer stemDecoded = ByteBuffer.allocate(0); + + int limit = Integer.MAX_VALUE; + + final Iterator i = new DictionaryIterator(pl, decoder, false); + while (i.hasNext() && limit-- > 0) { + final WordData wd = i.next(); + + final CharSequence inflected = wd.getWord(); + final CharSequence stemEncoded = wd.getStem(); + final CharSequence tag = wd.getTag(); + if (tag == null) + continue; + + inflBuffer.clear(); + inflBuffer = wd.getWordBytes(inflBuffer); + + stemBuffer.clear(); + stemBuffer = wd.getStemBytes(stemBuffer); + + stemDecoded = DictionaryLookup.decodeBaseForm(stemDecoded, stemBuffer + .array(), stemBuffer.remaining(), inflBuffer, pl.metadata); + 
stemDecoded.flip(); + + final String stem = decoder.decode(stemDecoded).toString(); + final String form = tag.toString().intern(); + + ArrayList frames = forms.get(stem); + if (frames == null) { + forms.put(stem, frames = new ArrayList()); + } + + if (!frames.contains(form)) { + frames.add(form); + } + } + + // Sort the forms so that we get a unique key. Then iteratively add them + // to another hash (by form this time). + final HashMap> frames = + new HashMap>(); + + StringBuilder key = new StringBuilder(); + for (Map.Entry> e : forms.entrySet()) { + Collections.sort(e.getValue()); + + key.setLength(0); + for (String s : e.getValue()) + key.append(s).append(" "); + + final String k = key.toString(); + ArrayList words = frames.get(k); + if (words == null) { + frames.put(k, words = new ArrayList()); + } + words.add(e.getKey()); + + e.setValue(null); + } + + // Print inflection frames. + ArrayList>> entries = + new ArrayList>>(); + + entries.addAll(frames.entrySet()); + Collections.sort(entries, + new Comparator>>() { + public int compare(Entry> o1, + Entry> o2) { + return o2.getValue().size() - o1.getValue().size(); + } + }); + + for (Map.Entry> e : entries) { + System.out.println(String.format("%6d %s %s", + e.getValue().size(), e.getKey(), e.getValue())); + } + + System.out.println("Total frames: " + frames.size()); + } +} diff --git a/morfologik-tools/src/main/java/morfologik/tools/Launcher.java b/morfologik-tools/src/main/java/morfologik/tools/Launcher.java new file mode 100644 index 0000000..320c1dc --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/Launcher.java @@ -0,0 +1,158 @@ +package morfologik.tools; + +import java.io.IOException; +import java.io.InputStream; +import java.lang.reflect.Method; +import java.net.URL; +import java.util.Enumeration; +import java.util.Iterator; +import java.util.TreeMap; +import java.util.jar.Manifest; + +import morfologik.util.FileUtils; + +/** + * A launcher for other command-line tools. 
+ */ +public final class Launcher { + /** + * Tool description. + */ + final static class ToolInfo { + public final Class clazz; + public final String info; + + public ToolInfo(Class clazz, String info) { + this.clazz = clazz; + this.info = info; + } + + public void invoke(String[] subArgs) throws Exception { + final Method m = clazz.getMethod("main", + new Class[] { String[].class }); + m.invoke(null, new Object[] { subArgs }); + } + } + + /** + * Command line entry point. + */ + public static void main(String[] args) throws Exception { + // If so, tools are unavailable and a classpath error has been logged. + final TreeMap tools = initTools(); + + if (tools == null) + { + return; + } + + if (args.length == 0) { + System.out.println("Provide tool name and its command-line options. " + + "Available tools:"); + for (String key : tools.keySet()) { + final ToolInfo toolInfo = tools.get(key); + System.out.println(String.format(" %-10s - %s", key, + toolInfo.info)); + } + } else { + final String toolName = args[0]; + if (!tools.containsKey(toolName)) { + System.out.println("Unknown tool: " + toolName); + return; + } + + final String[] subArgs = new String[args.length - 1]; + System.arraycopy(args, 1, subArgs, 0, subArgs.length); + + final ToolInfo toolInfo = (ToolInfo) tools.get(toolName); + toolInfo.invoke(subArgs); + } + } + + /** + * Initialize and check tools' availability. + */ + static TreeMap initTools() { + TreeMap tools = new TreeMap(); + + tools.put("fsa_build", new ToolInfo(FSABuildTool.class, + "Create an automaton from plain text files.")); + + tools.put("fsa_dump", new ToolInfo(FSADumpTool.class, + "Dump an FSA dictionary.")); + + tools.put("tab2morph", new ToolInfo(MorphEncodingTool.class, + "Convert tabbed dictionary to fsa encoding format.")); + + tools.put("plstem", new ToolInfo(PolishStemmingTool.class, + "Apply Polish dictionary stemming to the input.")); + + // Prune unavailable tools. 
+ for (Iterator i = tools.values().iterator(); i.hasNext();) { + ToolInfo ti = i.next(); + try { + ti.clazz.newInstance().isAvailable(); + } catch (NoClassDefFoundError e) { + logJarWarning(); + return null; + } catch (Throwable e) { + System.out.println("Tools could not be initialized because" + + " of an exception during initialization: " + + e.getClass().getName() + ", " + e.getMessage()); + return null; + } + } + + return tools; + } + + /** + * Log a warning about missing JAR dependencies. + */ + private static void logJarWarning() { + System.out.println("Tools are unavailable, at least one JAR dependency missing."); + + try { + final Class clazz = Launcher.class; + final ClassLoader classLoader = clazz.getClassLoader(); + + final String clazzName = clazz.getName().replace('.', '/') + ".class"; + // Figure out our own class path location. + final URL launcherLocation = classLoader.getResource(clazzName); + if (launcherLocation == null) + return; + + String launcherPrefix = launcherLocation.toString() + .replace(clazzName, ""); + + // Figure our our location's MANIFEST.MF (class loader may be hitting a few). + URL manifestResource = null; + Enumeration manifests = classLoader.getResources("META-INF/MANIFEST.MF"); + while (manifests.hasMoreElements()) + { + URL candidate = manifests.nextElement(); + if (candidate.toString().startsWith(launcherPrefix)) + { + manifestResource = candidate; + break; + } + } + + if (manifestResource == null) + return; + + InputStream stream = null; + try { + stream = manifestResource.openStream(); + Manifest manifest = new Manifest(stream); + + System.out.println("Required JARs: " + + manifest.getMainAttributes().getValue("Class-Path")); + } catch (IOException e) { + FileUtils.close(stream); + } + } catch (IOException e) { + // Ignore. 
+ } + } +} diff --git a/morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java b/morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java new file mode 100644 index 0000000..dfade2d --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/MorphEncodingTool.java @@ -0,0 +1,255 @@ +package morfologik.tools; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Locale; + +import morfologik.fsa.FSA5; +import morfologik.stemming.EncoderType; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.lang.StringEscapeUtils; + + +/** + * This utility converts the dictionary in a text (tabbed) format into + * the format accepted by the fsa building tools. It is meant to replace + * the Perl and AWK scripts from the original FSA package. 
+ */ +class MorphEncodingTool extends Tool { + private static Charset US_ASCII = Charset.forName("US-ASCII"); + private boolean noWarn = false; + private SequenceAssembler encoder; + private byte separatorByte; + private char separator; + + /** + * + */ + protected void go(final CommandLine line) throws Exception { + noWarn = line.hasOption(SharedOptions.noWarnIfTwoFields.getOpt()); + + EncoderType encType = EncoderType.SUFFIX; + if (line.hasOption(SharedOptions.encoder.getOpt())) { + String encValue = line.getOptionValue(SharedOptions.encoder.getOpt()); + try { + encType = EncoderType.valueOf(encValue.toUpperCase()); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Invalid encoder: " + encValue + ", " + + "allowed values: " + Arrays.toString(EncoderType.values())); + } + } + + separator = FSA5.DEFAULT_ANNOTATION; + if (line.hasOption(SharedOptions.annotationSeparatorCharacterOption.getLongOpt())) { + String sep = line.getOptionValue(SharedOptions.annotationSeparatorCharacterOption.getLongOpt()); + + // Decode escape sequences. + sep = StringEscapeUtils.unescapeJava(sep); + if (sep.length() != 1) { + throw new IllegalArgumentException("Field separator must be a single character: " + sep); + } + if (sep.charAt(0) > 0xff) { + throw new IllegalArgumentException("Field separator not within byte range: " + (int) sep.charAt(0)); + } + separator = sep.charAt(0); + separatorByte = FSABuildTool.checkSingleByte(Character.toString(separator), Charset.defaultCharset()); + } + + encoder = new SequenceAssembler(SequenceEncoders.forType(encType), (byte) separator); + + // Determine input and output streams. + final DataInputStream input = initializeInput(line); + final DataOutputStream output = initializeOutput(line); + + try { + process(input, output); + output.flush(); + + } finally { + input.close(); + output.close(); + } + } + + /** + * Process input stream, writing to output stream. 
+ * + */ + protected void process(final DataInputStream input, final DataOutputStream output) + throws IOException { + long lnumber = 0; + try { + int bufPos = 0; + byte[] buf = new byte[0]; + ArrayList columns = new ArrayList(); + int dataByte; + do { + dataByte = input.read(); + switch (dataByte) { + case '\r': + // Ignore CR + continue; + + case '\t': + columns.add(Arrays.copyOf(buf, bufPos)); + bufPos = 0; + break; + + case -1: + // Process EOF as if we encountered \n. fall-through. + + case '\n': + lnumber++; + if (bufPos == 0 && columns.isEmpty()) { + if (dataByte != -1) { + System.err.println(String.format(Locale.ROOT, + "Ignoring empty line %d.", lnumber)); + } + break; + } + + columns.add(Arrays.copyOf(buf, bufPos)); + + if (columns.size() < 2 || columns.size() > 3) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, "Every \\n-delimited 'line' must contain 2 or 3 columns, line %d has %d. US-ASCII version of this line: %s", + lnumber, + columns.size(), + toAscii(columns))); + } + + if (columns.size() == 2 && !noWarn) { + System.err.println(String.format(Locale.ROOT, + "Line %d has %d columns. US-ASCII version of this line: %s", + lnumber, + columns.size(), + toAscii(columns))); + } + + byte [] wordForm = columns.get(0); + byte [] wordLemma = columns.get(1); + if (contains(wordForm, separatorByte) || + contains(wordLemma, separatorByte)) { + throw new IllegalArgumentException( + String.format(Locale.ROOT, + "Either word or lemma in line %d contain the annotation byte '%s': %s", + lnumber, + separator, + toAscii(columns))); + } + + output.write(encoder.encode( + wordForm, + wordLemma, + columns.size() > 2 ? 
columns.get(2) : null)); + + output.writeByte('\n'); + + bufPos = 0; + columns.clear(); + break; + + default: + if (bufPos >= buf.length) { + buf = Arrays.copyOf(buf, buf.length + 1024); + } + buf[bufPos++] = (byte) dataByte; + } + } while (dataByte != -1); + } finally { + input.close(); + } + } + + private boolean contains(byte [] seq, byte b) { + for (int i = 0; i < seq.length; i++) { + if (seq[i] == b) return true; + } + return false; + } + + private String toAscii(ArrayList columns) + { + StringBuilder b = new StringBuilder(); + for (int i = 0; i < columns.size(); i++) { + if (i > 0) b.append("\t"); + b.append(new String(columns.get(i), US_ASCII)); + } + return b.toString(); + } + + /** + * Command line options for the tool. + */ + protected void initializeOptions(Options options) { + options.addOption(SharedOptions.inputFileOption); + options.addOption(SharedOptions.outputFileOption); + options.addOption(SharedOptions.encoder); + options.addOption(SharedOptions.noWarnIfTwoFields); + options.addOption(SharedOptions.annotationSeparatorCharacterOption); + } + + /** + * + */ + private static DataOutputStream initializeOutput(CommandLine line) + throws IOException, ParseException { + final DataOutputStream output; + final String opt = SharedOptions.outputFileOption.getOpt(); + if (line.hasOption(opt)) { + // Use output file. + output = new DataOutputStream( + new BufferedOutputStream( + new FileOutputStream((File) line + .getParsedOptionValue(opt)))); + } else { + // Use standard output. + output = new DataOutputStream( + new BufferedOutputStream( + System.out)); + } + return output; + } + + /** + * + */ + private static DataInputStream initializeInput(CommandLine line) + throws IOException, ParseException { + final DataInputStream input; + final String opt = SharedOptions.inputFileOption.getOpt(); + if (line.hasOption(opt)) { + // Use input file. 
+ input = new DataInputStream ( + new BufferedInputStream( + new FileInputStream((File) line + .getParsedOptionValue(opt)))); + } else { + // Use standard input. + input = new DataInputStream( + new BufferedInputStream( + System.in)); + } + return input; + } + + /** + * Command line entry point. + */ + public static void main(String... args) throws Exception { + final MorphEncodingTool tool = new MorphEncodingTool(); + tool.go(args); + } +} \ No newline at end of file diff --git a/morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java b/morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java new file mode 100644 index 0000000..22c84c3 --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/PolishStemmingTool.java @@ -0,0 +1,193 @@ +package morfologik.tools; + +import java.io.*; +import java.text.MessageFormat; +import java.util.List; +import java.util.Locale; + +import morfologik.stemming.*; + +import org.apache.commons.cli.*; + +/** + * This utility parses input text, tokenizes it on whitespace and stems input + * words, writing them to the output in column-based format: + * + *
+ * word   stem   form
+ * word   stem   form
+ * 
+ * + * Words for which no stems or forms are available have empty values in each + * respective column. Columns are tab-delimited. + */ +class PolishStemmingTool extends Tool { + /** + * + */ + protected void go(CommandLine line) throws Exception { + // Determine input/ output encoding. + final String inputEncoding = getEncodingOption(line, + SharedOptions.inputEncodingOption.getOpt()); + + final String outputEncoding = getEncodingOption(line, + SharedOptions.outputEncodingOption.getOpt()); + + System.err.println("Input encoding: " + inputEncoding); + System.err.println("Output encoding: " + outputEncoding); + + // Determine input and output streams. + final Reader input = initializeInput(line, inputEncoding); + final Writer output = initializeOutput(line, outputEncoding); + + final long start = System.currentTimeMillis(); + try { + final long count = process(input, output); + + output.flush(); + + final long millis = System.currentTimeMillis() - start; + final double time = millis / 1000.0; + final double wordsPerSec = time > 0 ? (count / time) + : Double.POSITIVE_INFINITY; + System.out + .println(new MessageFormat( + "Processed {0} words in {1,number,#.###} seconds ({2,number,#} words per second).", + Locale.ENGLISH).format(new Object[] { + new Long(count), new Double(millis / 1000.0), + new Double(wordsPerSec) })); + } finally { + input.close(); + output.close(); + } + + } + + /** + * Process input stream, writing to output stream. + * + * @return Returns the number of processed words. 
+ */ + protected long process(Reader input, Writer output) throws IOException { + final IStemmer stemmer = new PolishStemmer(); + final StreamTokenizer st = new StreamTokenizer(input); + st.eolIsSignificant(false); + st.wordChars('+', '+'); + + long count = 0; + int token; + while ((token = st.nextToken()) != StreamTokenizer.TT_EOF) { + if (token == StreamTokenizer.TT_WORD) { + final String word = st.sval; + + count++; + final List stems = stemmer.lookup(word); + if (stems.size() == 0) { + output.write(word); + output.write("\t-\t-\n"); + } else { + for (WordData wd : stems) { + output.write(word); + output.write("\t"); + output.write(asString(wd.getStem())); + output.write("\t"); + output.write(asString(wd.getTag())); + output.write("\n"); + } + } + } + } + + return count; + } + + private String asString(CharSequence stem) { + if (stem == null) + return "-"; + return stem.toString(); + } + + /** + * Command line options for the tool. + */ + protected void initializeOptions(Options options) { + options.addOption(SharedOptions.inputFileOption); + options.addOption(SharedOptions.inputEncodingOption); + options.addOption(SharedOptions.outputFileOption); + options.addOption(SharedOptions.outputEncodingOption); + } + + /** + * + */ + private Writer initializeOutput(CommandLine line, String outputEncoding) + throws IOException, ParseException { + final Writer output; + final String opt = SharedOptions.outputFileOption.getOpt(); + if (line.hasOption(opt)) { + // Use output file. + output = new OutputStreamWriter( + new BufferedOutputStream(new FileOutputStream((File) line + .getParsedOptionValue(opt))), outputEncoding); + } else { + // Use standard output. 
+ System.err.println("Using standard output for output."); + output = new OutputStreamWriter(System.out, outputEncoding); + } + return output; + } + + /** + * + */ + private Reader initializeInput(CommandLine line, String inputEncoding) + throws IOException, ParseException { + final Reader input; + final String opt = SharedOptions.inputFileOption.getOpt(); + + if (line.hasOption(opt)) { + // Use input file. + input = new InputStreamReader( + new BufferedInputStream(new FileInputStream((File) line + .getParsedOptionValue(opt))), inputEncoding); + } else { + // Use standard input. + System.err.println("Using standard input for input."); + input = new InputStreamReader(System.in, inputEncoding); + } + return input; + } + + /** + * + */ + private String getEncodingOption(CommandLine line, String opt) { + String encoding = System.getProperty("file.encoding", "iso-8859-1"); + if (line.hasOption(opt)) { + encoding = line.getOptionValue(opt); + } + return encoding; + } + + /* + * Check if the dictionary is available. + */ + @Override + protected boolean isAvailable() { + boolean available = true; + try { + new PolishStemmer(); + } catch (Throwable t) { + available = false; + } + return available; + } + + /** + * Command line entry point. 
+ */ + public static void main(String[] args) throws Exception { + final PolishStemmingTool tool = new PolishStemmingTool(); + tool.go(args); + } +} \ No newline at end of file diff --git a/morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java b/morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java new file mode 100644 index 0000000..e5fd388 --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/SequenceAssembler.java @@ -0,0 +1,46 @@ +package morfologik.tools; + +import morfologik.fsa.FSA5; +import morfologik.tools.SequenceEncoders.IEncoder; + +import com.carrotsearch.hppc.ByteArrayList; + +final class SequenceAssembler { + private final byte annotationSeparator; + + private final ByteArrayList src = new ByteArrayList(); + private final ByteArrayList dst = new ByteArrayList(); + private final ByteArrayList tmp = new ByteArrayList(); + + private final IEncoder encoder; + + public SequenceAssembler(SequenceEncoders.IEncoder encoder) { + this(encoder, FSA5.DEFAULT_ANNOTATION); + } + + public SequenceAssembler(SequenceEncoders.IEncoder encoder, byte annotationSeparator) { + this.annotationSeparator = annotationSeparator; + this.encoder = encoder; + } + + byte [] encode(byte [] wordForm, byte [] wordLemma, byte [] wordTag) + { + src.clear(); + dst.clear(); + tmp.clear(); + + tmp.add(wordForm); + tmp.add(annotationSeparator); + + src.add(wordForm); + dst.add(wordLemma); + encoder.encode(src, dst, tmp); + + tmp.add(annotationSeparator); + if (wordTag != null) { + tmp.add(wordTag); + } + + return tmp.toArray(); + } +} diff --git a/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java new file mode 100644 index 0000000..37cd0cc --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/SequenceEncoders.java @@ -0,0 +1,361 @@ +package morfologik.tools; + +import morfologik.stemming.EncoderType; + +import 
com.carrotsearch.hppc.ByteArrayList; + +/** + * Container class for sequence encoders. + */ +public final class SequenceEncoders { + private SequenceEncoders() {} + + /** + * Maximum encodable single-byte code. + */ + private static final int REMOVE_EVERYTHING = 255; + + public static interface IEncoder { + public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer); + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer); + public EncoderType type(); + } + + /** + * Encodes dst relative to src by trimming + * whatever non-equal suffix src has. The output code is (bytes): + *
+     * {K}{suffix}
+     * 
+ * where (K - 'A') bytes should be trimmed from the end of src + * and then the suffix should be appended to the resulting byte sequence. + * + *

Examples:

+ *
+     * src: foo
+     * dst: foobar
+     * encoded: Abar
+     * 
+     * src: foo
+     * dst: bar
+     * encoded: Dbar
+     * 
+ * + *

Note: The code length is a single byte. If equal to + * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire src sequence + * should be discarded.

+ */ + public static class TrimSuffixEncoder implements IEncoder { + public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { + int sharedPrefix = sharedPrefixLength(src, dst); + int truncateBytes = src.size() - sharedPrefix; + if (truncateBytes >= REMOVE_EVERYTHING) { + truncateBytes = REMOVE_EVERYTHING; + sharedPrefix = 0; + } + + final byte suffixTrimCode = (byte) (truncateBytes + 'A'); + encoded.add(suffixTrimCode); + encoded.add(dst.buffer, sharedPrefix, dst.size() - sharedPrefix); + + return encoded; + } + + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { + int suffixTrimCode = encoded.get(0); + int truncateBytes = (suffixTrimCode - 'A') & 0xFF; + if (truncateBytes == REMOVE_EVERYTHING) { + truncateBytes = src.size(); + } + + dst.add(src.buffer, 0, src.size() - truncateBytes); + dst.add(encoded.buffer, 1, encoded.size() - 1); + + return dst; + } + + @Override + public EncoderType type() { + return EncoderType.SUFFIX; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * Encodes dst relative to src by trimming + * whatever non-equal suffix and prefix src and dst have. + * The output code is (bytes): + *
+     * {P}{K}{suffix}
+     * 
+ * where (P - 'A') bytes should be trimmed from the start of src, + * (K - 'A') bytes should be trimmed from the end of src + * and then the suffix should be appended to the resulting byte sequence. + * + *

Examples:

+ *
+     * src: abc
+     * dst: abcd
+     * encoded: AAd
+     * 
+     * src: abc
+     * dst: xyz
+     * encoded: ADxyz
+     * 
+ * + *

Note: Each code's length is a single byte. If any is equal to + * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire src sequence + * should be discarded.

+ */ + public static class TrimPrefixAndSuffixEncoder implements IEncoder { + public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { + // Search for the maximum matching subsequence that can be encoded. + int maxSubsequenceLength = 0; + int maxSubsequenceIndex = 0; + for (int i = 0; i < src.size(); i++) { + // prefix at i => shared subsequence (infix) + int sharedPrefix = sharedPrefixLength(src, i, dst, 0); + // Only update maxSubsequenceLength if we will be able to encode it. + if (sharedPrefix > maxSubsequenceLength + && i < REMOVE_EVERYTHING + && (src.size() - (i + sharedPrefix)) < REMOVE_EVERYTHING) { + maxSubsequenceLength = sharedPrefix; + maxSubsequenceIndex = i; + } + } + + // Determine how much to remove (and where) from src to get a prefix of dst. + int truncatePrefixBytes = maxSubsequenceIndex; + int truncateSuffixBytes = (src.size() - (maxSubsequenceIndex + maxSubsequenceLength)); + if (truncatePrefixBytes >= REMOVE_EVERYTHING || + truncateSuffixBytes >= REMOVE_EVERYTHING) { + maxSubsequenceIndex = maxSubsequenceLength = 0; + truncatePrefixBytes = truncateSuffixBytes = REMOVE_EVERYTHING; + } + + encoded.add((byte) ((truncatePrefixBytes + 'A') & 0xFF)); + encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF)); + encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); + + return encoded; + } + + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { + int truncatePrefixBytes = (encoded.get(0) - 'A') & 0xFF; + int truncateSuffixBytes = (encoded.get(1) - 'A') & 0xFF; + + if (truncatePrefixBytes == REMOVE_EVERYTHING || + truncateSuffixBytes == REMOVE_EVERYTHING) { + truncatePrefixBytes = src.size(); + truncateSuffixBytes = 0; + } + + dst.add(src.buffer, truncatePrefixBytes, src.size() - (truncateSuffixBytes + truncatePrefixBytes)); + dst.add(encoded.buffer, 2, encoded.size() - 2); + + return dst; + } + + @Override + public EncoderType type() { + return 
EncoderType.PREFIX; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * Encodes dst relative to src by trimming + * whatever non-equal suffix and infix src and dst have. + * The output code is (bytes): + *
+     * {X}{L}{K}{suffix}
+     * 
+ * where src's infix at position (X - 'A') and of length + * (L - 'A') should be removed, then (K - 'A') bytes + * should be trimmed from the end + * and then the suffix should be appended to the resulting byte sequence. + * + *

Examples:

+ *
+     * src: ayz
+     * dst: abc
+     * encoded: AACbc
+     * 
+     * src: aillent
+     * dst: aller
+     * encoded: BBCr
+     * 
+ * + *

Note: Each code's length is a single byte. If any is equal to + * {@link SequenceEncoders#REMOVE_EVERYTHING} the entire src sequence + * should be discarded.

+ */ + public static class TrimInfixAndSuffixEncoder implements IEncoder { + ByteArrayList scratch = new ByteArrayList(); + + public ByteArrayList encode(ByteArrayList src, ByteArrayList dst, ByteArrayList encoded) { + // Search for the infix that can we can encode and remove from src + // to get a maximum-length prefix of dst. This could be done more efficiently + // by running a smarter longest-common-subsequence algorithm and some pruning (?). + // + // For now, naive loop should do. + + // There can be only two positions for the infix to delete: + // 1) we remove leading bytes, even if they are partially matching (but a longer match + // exists somewhere later on). + // 2) we leave max. matching prefix and remove non-matching bytes that follow. + int maxInfixIndex = 0; + int maxSubsequenceLength = sharedPrefixLength(src, dst); + int maxInfixLength = 0; + for (int i : new int [] {0, maxSubsequenceLength}) { + for (int j = 1; j <= src.size() - i; j++) { + // Compute temporary src with the infix removed. + // Concatenate in scratch space for simplicity. + scratch.clear(); + scratch.add(src.buffer, 0, i); + scratch.add(src.buffer, i + j, src.size() - (i + j)); + + int sharedPrefix = sharedPrefixLength(scratch, dst); + + // Only update maxSubsequenceLength if we will be able to encode it. + if (sharedPrefix > 0 && + sharedPrefix > maxSubsequenceLength && + i < REMOVE_EVERYTHING && + j < REMOVE_EVERYTHING) { + maxSubsequenceLength = sharedPrefix; + maxInfixIndex = i; + maxInfixLength = j; + } + } + } + + int truncateSuffixBytes = src.size() - (maxInfixLength + maxSubsequenceLength); + + // Special case: if we're removing the suffix in the infix code, move it + // to the suffix code instead. 
+ if (truncateSuffixBytes == 0 && + maxInfixIndex + maxInfixLength == src.size()) { + truncateSuffixBytes = maxInfixLength; + maxInfixIndex = maxInfixLength = 0; + } + + + if (maxInfixIndex >= REMOVE_EVERYTHING || + maxInfixLength >= REMOVE_EVERYTHING || + truncateSuffixBytes >= REMOVE_EVERYTHING) { + maxInfixIndex = maxSubsequenceLength = 0; + maxInfixLength = truncateSuffixBytes = REMOVE_EVERYTHING; + } + + encoded.add((byte) ((maxInfixIndex + 'A') & 0xFF)); + encoded.add((byte) ((maxInfixLength + 'A') & 0xFF)); + encoded.add((byte) ((truncateSuffixBytes + 'A') & 0xFF)); + encoded.add(dst.buffer, maxSubsequenceLength, dst.size() - maxSubsequenceLength); + + return encoded; + } + + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList dst) { + int infixIndex = (encoded.get(0) - 'A') & 0xFF; + int infixLength = (encoded.get(1) - 'A') & 0xFF; + int truncateSuffixBytes = (encoded.get(2) - 'A') & 0xFF; + + if (infixLength == REMOVE_EVERYTHING || + truncateSuffixBytes == REMOVE_EVERYTHING) { + infixIndex = 0; + infixLength = src.size(); + truncateSuffixBytes = 0; + } + + dst.add(src.buffer, 0, infixIndex); + dst.add(src.buffer, infixIndex + infixLength, src.size() - (infixIndex + infixLength + truncateSuffixBytes)); + dst.add(encoded.buffer, 3, encoded.size() - 3); + + return dst; + } + + @Override + public EncoderType type() { + return EncoderType.INFIX; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * + */ + public static class CopyEncoder implements IEncoder { + @Override + public ByteArrayList encode(ByteArrayList src, ByteArrayList derived, ByteArrayList encodedBuffer) + { + encodedBuffer.add(derived.buffer, 0, derived.size()); + return encodedBuffer; + } + + @Override + public ByteArrayList decode(ByteArrayList src, ByteArrayList encoded, ByteArrayList derivedBuffer) + { + derivedBuffer.add(encoded.buffer, 0, encoded.size()); + return derivedBuffer; + } + + @Override + public 
EncoderType type() { + return EncoderType.NONE; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + } + + /** + * Compute the length of the shared prefix between two byte sequences. + */ + private static int sharedPrefixLength(ByteArrayList a, ByteArrayList b) { + final int max = Math.min(a.size(), b.size()); + int i = 0; + while (i < max && a.get(i) == b.get(i)) { + i++; + } + return i; + } + + /** + * Compute the length of the shared prefix between two byte sequences. + */ + private static int sharedPrefixLength(ByteArrayList a, int aStart, ByteArrayList b, int bStart) { + + int i = 0; + while (aStart < a.size() && + bStart < b.size() && + a.get(aStart++) == b.get(bStart++)) { + i++; + } + return i; + } + + public static IEncoder forType(EncoderType encType) + { + switch (encType) { + case INFIX: return new TrimInfixAndSuffixEncoder(); + case PREFIX: return new TrimPrefixAndSuffixEncoder(); + case SUFFIX: return new TrimSuffixEncoder(); + case NONE: return new CopyEncoder(); + } + throw new RuntimeException("Unknown encoder: " + encType); + } +} diff --git a/morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java b/morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java new file mode 100644 index 0000000..11b42aa --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/SharedOptions.java @@ -0,0 +1,152 @@ +package morfologik.tools; + +import java.io.File; +import java.util.Arrays; + +import morfologik.stemming.EncoderType; + +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; + +/** + * Options shared between tools. 
+ */ +@SuppressWarnings("static-access") +final class SharedOptions { + public final static Option fsaDictionaryFileOption = OptionBuilder + .hasArg() + .withArgName("file") + .withDescription("Path to the FSA dictionary.") + .withLongOpt("dictionary") + .withType(File.class) + .isRequired(true) + .create("d"); + + public final static Option decode = OptionBuilder + .withDescription("Decode prefix/ infix/ suffix forms (if available).") + .withLongOpt("decode") + .isRequired(false) + .create("x"); + + public final static Option dataOnly = OptionBuilder + .withDescription("Dump only raw FSA data.") + .withLongOpt("raw-data") + .isRequired(false) + .create("r"); + + public final static Option dot = OptionBuilder + .withDescription("Dump the automaton as graphviz DOT file.") + .withLongOpt("dot") + .isRequired(false) + .create(); + + public final static Option inputEncodingOption = OptionBuilder + .hasArg() + .withArgName("codepage") + .withDescription("Input stream encoding.") + .withLongOpt("input-encoding") + .isRequired(false) + .create("ie"); + + public final static Option outputEncodingOption = OptionBuilder + .hasArg() + .withArgName("codepage") + .withDescription("Output stream encoding.") + .withLongOpt("output-encoding") + .isRequired(false) + .create("oe"); + + public final static Option inputFileOption = OptionBuilder + .hasArg() + .withArgName("file") + .withDescription("Input file. If missing, standard input is used.") + .withLongOpt("input") + .withType(File.class) + .isRequired(false) + .create("i"); + + public final static Option outputFileOption = OptionBuilder + .hasArg() + .withArgName("file") + .withDescription("Output file. If missing, standard output is used.") + .withLongOpt("output") + .withType(File.class) + .isRequired(false) + .create("o"); + + public final static Option outputFormatOption = OptionBuilder + .hasArg() + .withArgName("format") + .withDescription("Name of the binary output format. 
Allowed values: " + Arrays.toString(FSABuildTool.Format.values())) + .withLongOpt("format") + .isRequired(false) + .create("f"); + + public final static Option fillerCharacterOption = OptionBuilder + .hasArg() + .withArgName("char") + .withDescription("Custom filler character") + .isRequired(false) + .withLongOpt("filler") + .create(); + + public final static Option annotationSeparatorCharacterOption = OptionBuilder + .hasArg() + .withArgName("char") + .withDescription("Custom annotation separator character") + .isRequired(false) + .withLongOpt("annotation") + .create(); + + public final static Option withNumbersOption = OptionBuilder + .withDescription("Include numbers required for perfect hashing (larger automaton)") + .isRequired(false) + .withLongOpt("with-numbers") + .create("n"); + + public final static Option progressOption = OptionBuilder + .withDescription("Print more verbose progress information") + .isRequired(false) + .withLongOpt("progress") + .create(); + + public final static Option inputSortedOption = OptionBuilder + .withDescription("Assume the input is already sorted using C-sort (builds FSA directly, no in-memory sorting)") + .isRequired(false) + .withLongOpt("sorted") + .create(); + + public final static Option encoder = OptionBuilder + .withDescription("Encoder used for compressing inflected forms. 
Any of: " + + Arrays.toString(EncoderType.values())) + .withLongOpt("encoder") + .hasArg(true) + .withArgName("name") + .isRequired(false) + .create("e"); + + public final static Option noWarnIfTwoFields = OptionBuilder + .withDescription("Suppress warning for lines with only two fields (for stemming dictionaries)") + .withLongOpt("nowarn") + .isRequired(false) + .create("nw"); + + public final static Option statistics = OptionBuilder + .withDescription("Print extra statistics.") + .isRequired(false) + .withLongOpt("stats") + .create(); + + public final static Option help = OptionBuilder + .withDescription("Help on available options.") + .withLongOpt("help") + .isRequired(false) + .create(); + + /** + * No instances. Use static fields. + */ + private SharedOptions() { + // empty + } +} diff --git a/morfologik-tools/src/main/java/morfologik/tools/Tool.java b/morfologik-tools/src/main/java/morfologik/tools/Tool.java new file mode 100644 index 0000000..27dac3f --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/Tool.java @@ -0,0 +1,102 @@ +package morfologik.tools; + +import org.apache.commons.cli.*; + +/** + * Base class for command-line applications. + */ +abstract class Tool { + /** Command line options. */ + protected final Options options = new Options(); + + /** + * Initializes application context. + */ + protected final void go(String[] args) { + options.addOption(SharedOptions.help); + initializeOptions(options); + + // Commons-cli is pretty dumb in terms of option parsing because it + // validates immediately and there is no way to determine + // if an option exists without bailing out with an exception. 
This + // is a hardcoded workaround for --help + for (String arg : args) { + if ("--help".equals(arg)) { + printUsage(); + return; + } + } + + final Parser parser = new GnuParser(); + final CommandLine line; + try { + line = parser.parse(options, args); + if (line.hasOption(SharedOptions.help.getLongOpt())) { + printUsage(); + return; + } + if (line.getArgList().size() > 0) { + printError("Unreconized left over command line arguments: " + + line.getArgList()); + return; + } + + try { + go(line); + } catch (Throwable e) { + printError("Unhandled program error occurred.", e); + } + } catch (MissingArgumentException e) { + printError("Provide the required argument for option: " + + e.getMessage()); + } catch (MissingOptionException e) { + printError("Provide the required option: " + e.getMessage()); + } catch (UnrecognizedOptionException e) { + printError(e.getMessage()); + } catch (ParseException e) { + printError("Could not parse command line: " + e.getMessage()); + } + } + + /** + * Print an error and an associated exception. + */ + protected void printError(String msg, Throwable t) { + printError(msg); + t.printStackTrace(System.err); + } + + /** + * Print an error without an exception. + */ + protected void printError(String msg) { + System.err.println(); + System.err.println(msg); + System.err.println("Invoke with '--help' for help."); + } + + /** + * Prints usage (options). + */ + protected void printUsage() { + final HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(this.getClass().getName(), options, true); + } + + /** + * Override and write your stuff using command line options. + */ + protected abstract void go(CommandLine line) throws Exception; + + /** + * Override and initialize options. + */ + protected abstract void initializeOptions(Options options); + + /** + * Is the tool available? true by default. 
+ */ + protected boolean isAvailable() { + return true; + } +} diff --git a/morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java b/morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java new file mode 100644 index 0000000..5caee57 --- /dev/null +++ b/morfologik-tools/src/main/java/morfologik/tools/WriterMessageLogger.java @@ -0,0 +1,125 @@ +package morfologik.tools; + +import java.io.PrintWriter; +import java.util.*; + +import morfologik.fsa.IMessageLogger; + +/** + * A logger dumping info to System.err. + */ +public class WriterMessageLogger implements IMessageLogger { + /** + * Start of the world timestamp. + */ + private final static long world = System.currentTimeMillis(); + + /** + * A single part: name, start timestamp. + */ + private static class Part { + final String name; + final long start; + + Part(String name, long start) { + this.name = name; + this.start = start; + } + } + + /** + * Is the output currently indented? + */ + private boolean indent; + + /** + * Active parts. + */ + private ArrayDeque parts = new ArrayDeque(); + + /** + * Output writer. 
+ */ + private final PrintWriter writer; + + /** + * + */ + public WriterMessageLogger(PrintWriter w) { + this.writer = w; + } + + /* + * + */ + @Override + public void log(String msg) { + cancelIndent(); + + writer.println(msg); + writer.flush(); + } + + /* + * + */ + @Override + public void log(String header, Object v) { + cancelIndent(); + + if (v instanceof Integer || v instanceof Long) { + writer.println(String.format(Locale.ENGLISH, "%-30s %,11d", header, v)); + } else { + writer.println(String.format(Locale.ENGLISH, "%-30s %11s", header, v.toString())); + } + writer.flush(); + } + + /* + * + */ + @Override + public void startPart(String header) { + cancelIndent(); + + Part p = new Part(header, System.currentTimeMillis()); + parts.addLast(p); + + writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "...")); + writer.flush(); + + indent = true; + } + + /* + * + */ + @Override + public void endPart() { + long now = System.currentTimeMillis(); + Part p = parts.removeLast(); + + if (!indent) { + writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "...")); + } + + writer.println( + String.format(Locale.ENGLISH, "%13.2f sec. 
[%6.2f sec.]", + (now - p.start) / 1000.0, + (now - world) / 1000.0)); + writer.flush(); + + indent = false; + } + + /* + * + */ + private void cancelIndent() { + if (indent) { + System.err.println(); + } + + indent = false; + } +} diff --git a/morfologik-tools/src/proguard/rules.pro b/morfologik-tools/src/proguard/rules.pro new file mode 100644 index 0000000..c8db532 --- /dev/null +++ b/morfologik-tools/src/proguard/rules.pro @@ -0,0 +1,16 @@ + +-dontnote +-dontoptimize +-dontwarn + +-renamepackage org.apache=>morfologik.dependencies +-renamepackage com.carrotsearch=>morfologik.dependencies +-repackageclasses morfologik.dependencies + +-keep class morfologik.** { + ; ; +} + +-dontnote + +-libraryjars /lib/rt.jar(java/**) diff --git a/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java b/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java new file mode 100644 index 0000000..4d45f9c --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/FSABuildToolTest.java @@ -0,0 +1,53 @@ +package morfologik.tools; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; +import java.nio.charset.Charset; + +import org.hamcrest.core.StringContains; +import org.junit.Assert; +import org.junit.Test; + +import com.google.common.base.Charsets; +import com.google.common.base.Joiner; +import com.google.common.io.ByteStreams; +import com.google.common.io.Files; + +public class FSABuildToolTest +{ + /* */ + @Test + public void testStemmingFile() throws Exception { + // Create a simple plain text file. + File input = File.createTempFile("input", "in"); + File output = File.createTempFile("output", "fsa.txt"); + input.deleteOnExit(); + output.deleteOnExit(); + + // Populate the file with data. + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + // Emit UTF-8 BOM prefixed list of three strings. 
+ baos.write(new byte [] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}); + baos.write(Joiner.on('\n').join("abc", "def", "xyz").getBytes(Charsets.UTF_8)); + Files.copy(ByteStreams.newInputStreamSupplier(baos.toByteArray()), input); + + baos.reset(); + PrintStream prev = System.err; + PrintStream ps = new PrintStream(baos); + System.setErr(ps); + try { + FSABuildTool.main(new String [] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath() + }); + } finally { + System.setErr(prev); + } + + String logs = new String(baos.toByteArray(), Charset.defaultCharset()); + Assert.assertThat(logs, StringContains.containsString("UTF-8 BOM")); + + System.out.println(logs); + } +} diff --git a/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java b/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java new file mode 100644 index 0000000..8e1d0e9 --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/LauncherTest.java @@ -0,0 +1,26 @@ +package morfologik.tools; + +import java.util.Map; + +import morfologik.tools.Launcher.ToolInfo; + +import org.junit.Assert; +import org.junit.Test; + +/* + * + */ +public class LauncherTest { + /* */ + @Test + public void testTools() throws Exception { + for (Map.Entry e : Launcher.initTools().entrySet()) { + try { + e.getValue().invoke(new String[] {"--help"}); + } catch (Throwable t) { + Assert.fail("Unable to launch " + e.getKey() + ": " + + t.getMessage()); + } + } + } +} diff --git a/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java b/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java new file mode 100644 index 0000000..496880f --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/MorphEncodingToolTest.java @@ -0,0 +1,243 @@ +package morfologik.tools; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import 
java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.util.List; + +import morfologik.fsa.FSA; +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.DictionaryMetadataBuilder; +import morfologik.stemming.EncoderType; +import morfologik.stemming.WordData; + +import org.fest.assertions.api.Assertions; +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +import com.carrotsearch.randomizedtesting.RandomizedTest; +import com.google.common.base.Charsets; +import com.google.common.io.Closer; + +/* + * + */ +public class MorphEncodingToolTest extends RandomizedTest { + private Closer closer = Closer.create(); + + @After + public void cleanup() throws IOException { + closer.close(); + } + + @Test + public void testTool() throws Exception { + // Create a simple plain text file. + File input = super.newTempFile(); + File output = super.newTempFile(); + + // Populate the file with data. 
+ PrintWriter w = + new PrintWriter( + new OutputStreamWriter( + closer.register(new FileOutputStream(input)), "UTF-8")); + w.println("passagère\tpassager\ttag"); + w.println("nieduży\tduży\ttest"); + w.print("abcd\tabc\txyz"); + w.close(); + + // suffix + MorphEncodingTool.main(new String[] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath(), + "--encoder", "suffix" }); + + BufferedReader testOutput = + new BufferedReader( + new InputStreamReader( + closer.register(new FileInputStream(output.getAbsolutePath())), "UTF-8")); + Assert.assertEquals("passagère+Eer+tag", testOutput.readLine()); + Assert.assertEquals("nieduży+Iduży+test", testOutput.readLine()); + Assert.assertEquals("abcd+B+xyz", testOutput.readLine()); + + // prefix + MorphEncodingTool.main(new String[] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath(), + "--encoder", "prefix" }); + + testOutput = + new BufferedReader( + new InputStreamReader( + closer.register(new FileInputStream(output.getAbsolutePath())), "UTF-8")); + Assert.assertEquals("passagère+AEer+tag", testOutput.readLine()); + Assert.assertEquals("nieduży+DA+test", testOutput.readLine()); + Assert.assertEquals("abcd+AB+xyz", testOutput.readLine()); + + // infix + MorphEncodingTool.main(new String[] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath(), + "--encoder", "infix" }); + + testOutput = + new BufferedReader( + new InputStreamReader( + closer.register(new FileInputStream(output.getAbsolutePath())), "UTF-8")); + Assert.assertEquals("passagère+GDAr+tag", testOutput.readLine()); + Assert.assertEquals("nieduży+ADA+test", testOutput.readLine()); + Assert.assertEquals("abcd+AAB+xyz", testOutput.readLine()); + + // custom annotation - test tabs + MorphEncodingTool.main(new String[] { + "--annotation", "\t", + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath(), + "--encoder", "infix" }); + + testOutput = + new BufferedReader( + new 
InputStreamReader( + closer.register(new FileInputStream(output.getAbsolutePath())), "UTF-8")); + Assert.assertEquals("passagère\tGDAr\ttag", testOutput.readLine()); + Assert.assertEquals("nieduży\tADA\ttest", testOutput.readLine()); + Assert.assertEquals("abcd\tAAB\txyz", testOutput.readLine()); + } + + /* */ + @Test + public void testStemmingFile() throws Exception { + // Create a simple plain text file. + File input = super.newTempFile(); + File output = super.newTempFile(); + + PrintWriter w = + new PrintWriter( + new OutputStreamWriter( + closer.register(new FileOutputStream(input)), "UTF-8")); + w.println("passagère\tpassager"); + w.println("nieduży\tduży"); + w.println(); + w.println("abcd\tabc"); + w.close(); + + MorphEncodingTool.main(new String[] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath(), + "-e", "suffix" }); + + BufferedReader testOutput = + new BufferedReader( + new InputStreamReader( + closer.register(new FileInputStream(output.getAbsolutePath())), "UTF-8")); + Assert.assertEquals("passagère+Eer+", testOutput.readLine()); + Assert.assertEquals("nieduży+Iduży+", testOutput.readLine()); + Assert.assertEquals("abcd+B+", testOutput.readLine()); + + testOutput.close(); + } + + /* */ + @Test + public void testZeroByteSeparator() throws Exception { + // Create a simple plain text file. + File input = newTempFile(); + File output = newTempFile(); + + // Populate the file with data. 
+ PrintWriter w = + new PrintWriter( + new OutputStreamWriter( + closer.register(new FileOutputStream(input)), "UTF-8")); + w.println("passagère\tpassager\tTAG1"); + w.println("nieduży\tduży\tTAG2"); + w.println("abcd\tabc\tTAG3"); + w.close(); + + MorphEncodingTool.main(new String[] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath(), + "-e", "suffix", + "--annotation", "\u0000"}); + + BufferedReader testOutput = + new BufferedReader( + new InputStreamReader( + closer.register(new FileInputStream(output.getAbsolutePath())), "UTF-8")); + + Assert.assertEquals("passagère\u0000Eer\u0000TAG1", testOutput.readLine()); + Assert.assertEquals("nieduży\u0000Iduży\u0000TAG2", testOutput.readLine()); + Assert.assertEquals("abcd\u0000B\u0000TAG3", testOutput.readLine()); + + File fsaFile = newTempFile(); + FSABuildTool.main( + "--input", output.getAbsolutePath(), + "--output", fsaFile.getAbsolutePath()); + + FSA fsa = FSA.read(fsaFile); + DictionaryLookup dl = new DictionaryLookup( + new Dictionary( + fsa, + new DictionaryMetadataBuilder() + .separator((char) 0) + .encoding(Charsets.UTF_8) + .encoder(EncoderType.SUFFIX) + .build())); + + checkEntry(dl, "passagère", "passager", "TAG1"); + checkEntry(dl, "nieduży", "duży", "TAG2"); + checkEntry(dl, "abcd", "abc", "TAG3"); + } + + /* */ + @Test + public void testAnnotationCharacterInBaseOrDerivedWord() throws Exception { + // Create a simple plain text file. + File input = newTempFile(); + File output = newTempFile(); + + // Populate the file with data. 
+ PrintWriter w = + new PrintWriter( + new OutputStreamWriter( + closer.register(new FileOutputStream(input)), "UTF-8")); + w.println("foo+\tbar-\tTAG1"); + w.close(); + + PrintStream err = System.err; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + try { + System.setErr(new PrintStream(baos, true, "UTF-8")); + MorphEncodingTool.main(new String[] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath(), + "-e", "suffix", + "--annotation", "+"}); + } finally { + System.err.flush(); + System.setErr(err); + } + + Assertions.assertThat(new String(baos.toByteArray(), Charsets.UTF_8)) + .contains("contain the annotation byte"); + } + + private void checkEntry(DictionaryLookup dl, String word, String base, String tag) { + List lookup = dl.lookup(word); + Assertions.assertThat(lookup.size()).isEqualTo(1); + WordData wordData = lookup.get(0); + Assertions.assertThat(wordData.getWord().toString()).isEqualTo(word); + Assertions.assertThat(wordData.getStem().toString()).isEqualTo(base); + Assertions.assertThat(wordData.getTag().toString()).isEqualTo(tag); + } +} diff --git a/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java new file mode 100644 index 0000000..d0379d7 --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersRandomizedTest.java @@ -0,0 +1,106 @@ +package morfologik.tools; + +import java.nio.ByteBuffer; +import java.util.List; + +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.DictionaryMetadataBuilder; +import morfologik.stemming.EncoderType; + +import org.junit.Test; + +import com.carrotsearch.hppc.ByteArrayList; +import com.carrotsearch.randomizedtesting.RandomizedTest; +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; +import com.google.common.base.Charsets; +import 
com.google.common.collect.Lists; + +public class SequenceEncodersRandomizedTest extends RandomizedTest { + private final SequenceEncoders.IEncoder coder; + + public SequenceEncodersRandomizedTest(@Name("coder") SequenceEncoders.IEncoder coder) + { + this.coder = coder; + } + + @ParametersFactory + public static List testFactory() { + List encoders = Lists.newArrayList(); + for (EncoderType t : EncoderType.values()) { + encoders.add(new Object [] {SequenceEncoders.forType(t)}); + } + return encoders; + } + + @Test + public void testEncodeSuffixOnRandomSequences() { + for (int i = 0; i < 10000; i++) { + assertRoundtripEncode( + randomAsciiOfLengthBetween(0, 500), + randomAsciiOfLengthBetween(0, 500)); + } + } + + @Test + public void testEncodeSamples() { + assertRoundtripEncode("", ""); + assertRoundtripEncode("abc", "ab"); + assertRoundtripEncode("abc", "abx"); + assertRoundtripEncode("ab", "abc"); + assertRoundtripEncode("xabc", "abc"); + assertRoundtripEncode("axbc", "abc"); + assertRoundtripEncode("axybc", "abc"); + assertRoundtripEncode("axybc", "abc"); + assertRoundtripEncode("azbc", "abcxy"); + + assertRoundtripEncode("Niemcami", "Niemiec"); + assertRoundtripEncode("Niemiec", "Niemcami"); + } + + private void assertRoundtripEncode(String srcString, String dstString) + { + ByteArrayList src = ByteArrayList.from(srcString.getBytes(UTF8)); + ByteArrayList dst = ByteArrayList.from(dstString.getBytes(UTF8)); + ByteArrayList encoded = ByteArrayList.newInstance(); + ByteArrayList decoded = ByteArrayList.newInstance(); + + coder.encode(src, dst, encoded); + coder.decode(src, encoded, decoded); + + if (!dst.equals(decoded)) { + System.out.println("src: " + new String(src.toArray(), Charsets.UTF_8)); + System.out.println("dst: " + new String(dst.toArray(), Charsets.UTF_8)); + System.out.println("enc: " + new String(encoded.toArray(), Charsets.UTF_8)); + System.out.println("dec: " + new String(decoded.toArray(), Charsets.UTF_8)); + } + + assertEquals(dst, decoded); + + 
// DictionaryLookup.decodeBaseForm decoding testing + DictionaryMetadataBuilder builder = new DictionaryMetadataBuilder(); + builder.encoding(Charsets.UTF_8); + builder.encoder(coder.type()); + + ByteBuffer bb = DictionaryLookup.decodeBaseForm( + ByteBuffer.allocate(0), + encoded.toArray(), + encoded.size(), + ByteBuffer.wrap(src.toArray()), builder.build()); + + ByteArrayList decoded2 = ByteArrayList.newInstance(); + bb.flip(); + while (bb.hasRemaining()) decoded2.add(bb.get()); + + if (!dst.equals(decoded2)) { + System.out.println("DictionaryLookup.decodeBaseForm incorrect, coder: " + coder); + System.out.println("src : " + new String(src.toArray(), Charsets.UTF_8)); + System.out.println("dst : " + new String(dst.toArray(), Charsets.UTF_8)); + System.out.println("enc : " + new String(encoded.toArray(), Charsets.UTF_8)); + System.out.println("dec : " + new String(decoded.toArray(), Charsets.UTF_8)); + System.out.println("dec2: " + new String(decoded2.toArray(), Charsets.UTF_8)); + } + + assertEquals(dst, decoded2); + } +} diff --git a/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java new file mode 100644 index 0000000..3f1625d --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/SequenceEncodersStaticTest.java @@ -0,0 +1,96 @@ +package morfologik.tools; + +import static org.junit.Assert.*; + +import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; + +import morfologik.stemming.EncoderType; + +import org.junit.Test; + +import com.google.common.base.Charsets; + +/* + * + */ +public class SequenceEncodersStaticTest { + private SequenceAssembler suffix = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder()); + private SequenceAssembler prefix = new SequenceAssembler(new SequenceEncoders.TrimPrefixAndSuffixEncoder()); + private SequenceAssembler infix = new SequenceAssembler(new 
SequenceEncoders.TrimInfixAndSuffixEncoder()); + + @Test + public void testStandardEncode() throws Exception { + assertEquals("abc+Ad+tag", encode(suffix, "abc", "abcd", "tag")); + assertEquals("abc+Dxyz+tag", encode(suffix, "abc", "xyz", "tag")); + assertEquals("abc+Bć+tag", encode(suffix, "abc", "abć", "tag")); + } + + @Test + public void testSeparatorChange() throws Exception { + assertEquals("abc+Ad+tag", encode(suffix, "abc", "abcd", "tag")); + + SequenceAssembler assembler = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder(), (byte) '_'); + assertEquals("abc_Ad_tag", encode(assembler, "abc", "abcd", "tag")); + + assembler = new SequenceAssembler(new SequenceEncoders.TrimSuffixEncoder(), (byte) '\t'); + assertEquals("abc\tAd\ttag", encode(assembler, "abc", "abcd", "tag")); + } + + @Test + public void testPrefixEncode() throws UnsupportedEncodingException { + assertEquals("abc+AAd+tag", encode(prefix, "abc", "abcd", "tag")); + assertEquals("abcd+AB+tag", encode(prefix, "abcd", "abc", "tag")); + assertEquals("abc+ADxyz+tag", encode(prefix, "abc", "xyz", "tag")); + assertEquals("abc+ABć+tag", encode(prefix, "abc", "abć", "tag")); + assertEquals("postmodernizm+AAu+xyz", encode(prefix, "postmodernizm", "postmodernizmu", "xyz")); + assertEquals("postmodernizmu+AB+xyz", encode(prefix, "postmodernizmu", "postmodernizm", "xyz")); + assertEquals("nieduży+DA+adj", encode(prefix, "nieduży", "duży", "adj")); + assertEquals("postmodernizm+EA+xyz", encode(prefix, "postmodernizm", "modernizm", "xyz")); + } + + @Test + public void testInfixEncode() throws UnsupportedEncodingException { + assertEquals("ayz+AACbc+tag", encode(infix, "ayz", "abc", "tag")); + assertEquals("xyz+AADabc+tag", encode(infix, "xyz", "abc", "tag")); + + assertEquals("abc+AAAd+tag", encode(infix, "abc", "abcd", "tag")); + assertEquals("abcd+AAB+tag", encode(infix, "abcd", "abc", "tag")); + assertEquals("abc+AADxyz+tag", encode(infix, "abc", "xyz", "tag")); + assertEquals("abc+AABć+tag", 
encode(infix, "abc", "abć", "tag")); + assertEquals("postmodernizm+AAAu+xyz", encode(infix, "postmodernizm", "postmodernizmu", "xyz")); + assertEquals("postmodernizmu+AAB+xyz", encode(infix, "postmodernizmu", "postmodernizm", "xyz")); + assertEquals("nieduży+ADA+adj", encode(infix, "nieduży", "duży", "adj")); + + // real infix cases + assertEquals("kcal+ABA+xyz", encode(infix, "kcal", "cal", "xyz")); + assertEquals("aillent+BBCr+xyz", encode(infix, "aillent", "aller", "xyz")); + assertEquals("laquelle+AGAquel+D f s", encode(infix, "laquelle", "lequel", "D f s")); + assertEquals("ccal+ABA+test", encode(infix, "ccal", "cal", "test")); + assertEquals("ccal+ABA+test", encode(infix, "ccal", "cal", "test")); + } + + @Test + public void testUTF8Boundary() throws Exception { + assertEquals("passagère+Eer+tag", encode(suffix, "passagère", "passager", "tag")); + assertEquals("passagère+GDAr+tag", encode(infix, "passagère", "passager", "tag")); + assertEquals("passagère+AEer+tag", encode(prefix, "passagère", "passager", "tag")); + } + + @Test + public void testAllEncodersHaveImplementations() { + for (EncoderType t : EncoderType.values()) { + assertNotNull(null != SequenceEncoders.forType(t)); + } + } + + private String encode(SequenceAssembler assembler, String wordForm, + String wordLemma, String wordTag) + { + Charset UTF8 = Charsets.UTF_8; + return new String(assembler.encode( + wordForm.getBytes(UTF8), + wordLemma.getBytes(UTF8), + wordTag.getBytes(UTF8)), UTF8); + } +} diff --git a/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java b/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java new file mode 100644 index 0000000..573c5da --- /dev/null +++ b/morfologik-tools/src/test/java/morfologik/tools/Text2FSA5Test.java @@ -0,0 +1,37 @@ +package morfologik.tools; + +import java.io.*; + +import morfologik.fsa.*; + +import org.junit.Assert; +import org.junit.Test; + +/* + * + */ +public class Text2FSA5Test { + @Test + public void testTool() throws 
Exception { + // Create a simple plain text file. + File input = File.createTempFile("input", "in"); + File output = File.createTempFile("output", "fsa"); + input.deleteOnExit(); + output.deleteOnExit(); + + // Populate the file with data. + PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(input), "UTF-8")); + w.println("b"); + w.println("cab"); + w.println("ab"); + w.close(); + + FSABuildTool.main(new String [] { + "--input", input.getAbsolutePath(), + "--output", output.getAbsolutePath() + }); + + FSA5 fsa = FSA.read(new FileInputStream(output)); + Assert.assertEquals(3, new FSAInfo(fsa).finalStatesCount); + } +} diff --git a/morfologik.LICENSE b/morfologik.LICENSE new file mode 100644 index 0000000..bc7ee7a --- /dev/null +++ b/morfologik.LICENSE @@ -0,0 +1,29 @@ + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2014 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..a795da7 --- /dev/null +++ b/pom.xml @@ -0,0 +1,339 @@ + + + + + 4.0.0 + + + 3.0.2 + + + + + org.sonatype.oss + oss-parent + 7 + + + + org.carrot2 + morfologik-parent + 1.9.0 + pom + + Morfologik (parent POM) + Morfologik provides high quality lemmatisation for the Polish language, + along with tools for building and using byte-based finite state automata. + http://morfologik.blogspot.com/ + + + + + BSD + http://www.opensource.org/licenses/bsd-license.php + repo + + + + + + + + + Announcements, bug reports, developers mailing list + morfologik-devel@lists.sourceforge.net + + + + + https://morfologik.svn.sourceforge.net/svnroot/morfologik/morfologik-stemming/trunk + scm:svn:https://morfologik.svn.sourceforge.net/svnroot/morfologik/morfologik-stemming/trunk + scm:svn:https://morfologik.svn.sourceforge.net/svnroot/morfologik/morfologik-stemming/trunk + + + + + dawid.weiss + Dawid Weiss + dawid.weiss@carrotsearch.com + + + + marcin.milkowski + Marcin Miłkowski + + + + + + + 1.6 + 1.6 + UTF-8 + + false + + + + + + + SonaType snapshots + https://oss.sonatype.org/content/repositories/snapshots + + true + + + false + + + + + + + morfologik-fsa + morfologik-stemming + morfologik-polish + morfologik-tools + morfologik-distribution + morfologik-speller + + + + + + + com.carrotsearch + hppc + 0.5.3 + + + + com.carrotsearch + junit-benchmarks + 0.7.2 + + + + junit + junit + 
4.11 + + + + commons-cli + commons-cli + 1.2 + + + + commons-lang + commons-lang + 2.6 + + + + com.google.guava + guava + 15.0 + + + + org.easytesting + fest-assert-core + 2.0M10 + test + + + + com.carrotsearch.randomizedtesting + randomizedtesting-runner + 2.0.13 + + + + + + + install + + + + + org.apache.maven.plugins + maven-deploy-plugin + 2.7 + + ${skip.deployment} + + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.9 + + + + org.apache.maven.plugins + maven-antrun-plugin + 1.7 + + + + org.apache.maven.plugins + maven-assembly-plugin + 2.4 + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + org.apache.maven.plugins + maven-install-plugin + 2.4 + + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + + + org.apache.maven.plugins + maven-dependency-plugin + 2.7 + + + + org.apache.maven.plugins + maven-eclipse-plugin + 2.9 + + ${basedir}/.eclipse/classes + + org.eclipse.jdt.core.javabuilder + + + org.eclipse.jdt.core.javanature + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.5.1 + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.12.4 + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.4 + + + + + + + + + quick + + true + + + + + sign + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.1 + + + **/*.gz + **/*.zip + + + + + + sign + + + + + + + + + + release + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + ${project.build.sourceEncoding} + ${project.name} v${project.version} API Documentation + ${project.name} v${project.version} API Documentation + UTF-8 + false + + + + attach-javadocs + + jar + + + + + + + + org.apache.maven.plugins + maven-source-plugin + + true + + + + attach-sources + + jar + + + + + + + + + + diff --git a/src-test/morfologik/fsa/CFSA2SerializerTest.java b/src-test/morfologik/fsa/CFSA2SerializerTest.java deleted file mode 100644 index 332bbcc..0000000 --- 
a/src-test/morfologik/fsa/CFSA2SerializerTest.java +++ /dev/null @@ -1,27 +0,0 @@ -package morfologik.fsa; - -import static org.junit.Assert.*; - -import org.junit.Test; - -/** - * - */ -public class CFSA2SerializerTest extends SerializerTestBase { - protected CFSA2Serializer createSerializer() { - return new CFSA2Serializer(); - } - - @Test - public void testVIntCoding() { - byte [] scratch = new byte [5]; - - int [] values = {0, 1, 128, 256, 0x1000, Integer.MAX_VALUE }; - - for (int v : values) { - int len = CFSA2.writeVInt(scratch, 0, v); - assertEquals(v, CFSA2.readVInt(scratch, 0)); - assertEquals(len, CFSA2.vIntLength(v)); - } - } -} diff --git a/src-test/morfologik/fsa/FSA5SerializerTest.java b/src-test/morfologik/fsa/FSA5SerializerTest.java deleted file mode 100644 index 1d05cfc..0000000 --- a/src-test/morfologik/fsa/FSA5SerializerTest.java +++ /dev/null @@ -1,10 +0,0 @@ -package morfologik.fsa; - -/** - * - */ -public class FSA5SerializerTest extends SerializerTestBase { - protected FSA5Serializer createSerializer() { - return new FSA5Serializer(); - } -} diff --git a/src-test/morfologik/fsa/FSA5Test.java b/src-test/morfologik/fsa/FSA5Test.java deleted file mode 100644 index 869dfeb..0000000 --- a/src-test/morfologik/fsa/FSA5Test.java +++ /dev/null @@ -1,117 +0,0 @@ -package morfologik.fsa; - -import static morfologik.fsa.FSAFlags.NEXTBIT; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -import morfologik.stemming.Dictionary; - -import org.junit.Test; - -/** - * Additional tests for {@link FSA5}. 
- */ -public final class FSA5Test { - public ArrayList expected = new ArrayList(Arrays.asList( - "a", "aba", "ac", "b", "ba", "c")); - - @Test - public void testVersion5() throws IOException { - final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa")); - assertFalse(fsa.getFlags().contains(FSAFlags.NUMBERS)); - verifyContent(expected, fsa); - } - - @Test - public void testVersion5WithNumbers() throws IOException { - final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa")); - - verifyContent(expected, fsa); - assertTrue(fsa.getFlags().contains(FSAFlags.NUMBERS)); - } - - @Test - public void testArcsAndNodes() throws IOException { - final FSA fsa1 = FSA.read(this.getClass().getResourceAsStream( - "abc.fsa")); - final FSA fsa2 = FSA.read(this.getClass().getResourceAsStream( - "abc-numbers.fsa")); - - FSAInfo info1 = new FSAInfo(fsa1); - FSAInfo info2 = new FSAInfo(fsa2); - - assertEquals(info1.arcsCount, info2.arcsCount); - assertEquals(info1.nodeCount, info2.nodeCount); - - assertEquals(4, info2.nodeCount); - assertEquals(7, info2.arcsCount); - } - - @Test - public void testArcsAndNodesLarge() throws IOException { - final FSA fsa3 = Dictionary.getForLanguage("pl").fsa; - FSAInfo info3 = new FSAInfo(fsa3); - - assertEquals(293329, info3.nodeCount); - assertEquals(679676, info3.arcsCount); - assertEquals(3672200, info3.finalStatesCount); - } - - @Test - public void testNumbers() throws IOException { - final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa")); - - assertTrue(fsa.getFlags().contains(NEXTBIT)); - - // Get all numbers for nodes. 
- byte[] buffer = new byte[128]; - final ArrayList result = new ArrayList(); - walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result); - - Collections.sort(result); - assertEquals(Arrays - .asList("0 c", "1 b", "2 ba", "3 a", "4 ac", "5 aba"), result); - } - - public static void walkNode(byte[] buffer, int depth, FSA fsa, int node, - int cnt, List result) throws IOException { - for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { - buffer[depth] = fsa.getArcLabel(arc); - - if (fsa.isArcFinal(arc) || fsa.isArcTerminal(arc)) { - result.add(cnt + " " + new String(buffer, 0, depth + 1, "UTF-8")); - } - - if (fsa.isArcFinal(arc)) - cnt++; - - if (!fsa.isArcTerminal(arc)) { - walkNode(buffer, depth + 1, fsa, fsa.getEndNode(arc), cnt, result); - cnt += fsa.getRightLanguageCount(fsa.getEndNode(arc)); - } - } - } - - private static void verifyContent(List expected, FSA fsa) throws IOException { - final ArrayList actual = new ArrayList(); - - int count = 0; - for (ByteBuffer bb : fsa.getSequences()) { - assertEquals(0, bb.arrayOffset()); - assertEquals(0, bb.position()); - actual.add(new String(bb.array(), 0, bb.remaining(), "UTF-8")); - count++; - } - assertEquals(expected.size(), count); - Collections.sort(actual); - assertEquals(expected, actual); - } -} diff --git a/src-test/morfologik/fsa/FSABuilderTest.java b/src-test/morfologik/fsa/FSABuilderTest.java deleted file mode 100644 index d2e1bad..0000000 --- a/src-test/morfologik/fsa/FSABuilderTest.java +++ /dev/null @@ -1,112 +0,0 @@ -package morfologik.fsa; - -import static morfologik.fsa.FSATestUtils.*; -import static org.junit.Assert.assertEquals; - -import java.io.IOException; -import java.util.Arrays; - -import morfologik.fsa.FSA; -import morfologik.fsa.FSABuilder; -import morfologik.util.MinMax; - -import org.junit.BeforeClass; -import org.junit.Test; - -public class FSABuilderTest { - private static byte[][] input; - private static byte[][] input2; - - @BeforeClass - public static void 
prepareByteInput() { - input = generateRandom(25000, new MinMax(1, 20), new MinMax(0, 255)); - input2 = generateRandom(40, new MinMax(1, 20), new MinMax(0, 3)); - } - - /** - * - */ - @Test - public void testEmptyInput() { - byte[][] input = {}; - checkCorrect(input, FSABuilder.build(input)); - } - - /** - * - */ - @Test - public void testHashResizeBug() throws Exception { - byte[][] input = { - {0, 1 }, - {0, 2 }, - {1, 1 }, - {2, 1 }, - }; - - FSA fsa = FSABuilder.build(input); - checkCorrect(input, FSABuilder.build(input)); - checkMinimal(fsa); - } - - /** - * - */ - @Test - public void testSmallInput() throws Exception { - byte[][] input = { - "abc".getBytes("UTF-8"), - "bbc".getBytes("UTF-8"), - "d".getBytes("UTF-8"), - }; - checkCorrect(input, FSABuilder.build(input)); - } - - /** - * Verify absolute byte-value ordering in the comparators and serialized automaton. - */ - @Test - public void testLexicographicOrder() throws IOException { - byte[][] input = { - {0}, - {1}, - {(byte) 0xff}, - }; - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - - // Check if lexical ordering is consistent with absolute byte value. 
- assertEquals(0, input[0][0]); - assertEquals(1, input[1][0]); - assertEquals((byte) 0xff, input[2][0]); - - final FSA fsa; - checkCorrect(input, fsa = FSABuilder.build(input)); - - int arc = fsa.getFirstArc(fsa.getRootNode()); - assertEquals(0, fsa.getArcLabel(arc)); - arc = fsa.getNextArc(arc); - assertEquals(1, fsa.getArcLabel(arc)); - arc = fsa.getNextArc(arc); - assertEquals((byte) 0xff, fsa.getArcLabel(arc)); - } - - /** - * - */ - @Test - public void testRandom25000_largerAlphabet() { - FSA fsa = FSABuilder.build(input); - checkCorrect(input, fsa); - checkMinimal(fsa); - } - - /** - * - */ - @Test - public void testRandom25000_smallAlphabet() throws IOException { - FSA fsa = FSABuilder.build(input2); - checkCorrect(input2, fsa); - checkMinimal(fsa); - } -} diff --git a/src-test/morfologik/fsa/FSATestUtils.java b/src-test/morfologik/fsa/FSATestUtils.java deleted file mode 100644 index d6cfeee..0000000 --- a/src-test/morfologik/fsa/FSATestUtils.java +++ /dev/null @@ -1,179 +0,0 @@ -package morfologik.fsa; - -import java.nio.ByteBuffer; -import java.util.*; - -import morfologik.util.BufferUtils; -import morfologik.util.MinMax; - -import org.junit.Assert; - -public class FSATestUtils { - /** - * Generate a sorted list of random sequences. - */ - public static byte[][] generateRandom(int count, MinMax length, - MinMax alphabet) { - final byte[][] input = new byte[count][]; - final Random rnd = new Random(0x11223344); - for (int i = 0; i < count; i++) { - input[i] = randomByteSequence(rnd, length, alphabet); - } - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - return input; - } - - /** - * Generate a random string. 
- */ - private static byte[] randomByteSequence(Random rnd, MinMax length, - MinMax alphabet) { - byte[] bytes = new byte[length.min + rnd.nextInt(length.range())]; - for (int i = 0; i < bytes.length; i++) { - bytes[i] = (byte) (alphabet.min + rnd.nextInt(alphabet.range())); - } - return bytes; - } - - /** - * Check if the DFSA is correct with respect to the given input. - */ - public static void checkCorrect(byte[][] input, FSA fsa) { - // (1) All input sequences are in the right language. - HashSet rl = new HashSet(); - for (ByteBuffer bb : fsa) { - rl.add(ByteBuffer.wrap(Arrays.copyOf(bb.array(), bb.remaining()))); - } - - HashSet uniqueInput = new HashSet(); - for (byte[] sequence : input) { - uniqueInput.add(ByteBuffer.wrap(sequence)); - } - - for (ByteBuffer sequence : uniqueInput) { - Assert.assertTrue("Not present in the right language: " - + BufferUtils.toString(sequence), rl.remove(sequence)); - } - - // (2) No other sequence _other_ than the input is in the right language. - Assert.assertEquals(0, rl.size()); - } - - /** - * Check if the DFSA reachable from a given state is minimal. This means no - * two states have the same right language. 
- */ - public static void checkMinimal(final FSA fsa) { - final HashMap stateLanguages = new HashMap(); - - fsa.visitInPostOrder(new StateVisitor() { - private StringBuilder b = new StringBuilder(); - - public boolean accept(int state) { - List rightLanguage = allSequences(fsa, state); - Collections.sort(rightLanguage, FSABuilder.LEXICAL_ORDERING); - - b.setLength(0); - for (byte[] seq : rightLanguage) { - b.append(Arrays.toString(seq)); - b.append(','); - } - - String full = b.toString(); - Assert.assertFalse("State exists: " + state + " " - + full + " " + stateLanguages.get(full), stateLanguages.containsKey(full)); - stateLanguages.put(full, state); - - return true; - } - }); - } - - static List allSequences(FSA fsa, int state) { - ArrayList seq = new ArrayList(); - for (ByteBuffer bb : fsa.getSequences(state)) { - seq.add(Arrays.copyOf(bb.array(), bb.remaining())); - } - return seq; - } - - /** - * Check if two FSAs are identical. - */ - public static void checkIdentical(FSA fsa1, FSA fsa2) { - ArrayDeque fromRoot = new ArrayDeque(); - checkIdentical(fromRoot, - fsa1, fsa1.getRootNode(), new BitSet(), - fsa2, fsa2.getRootNode(), new BitSet()); - } - - /* - * - */ - static void checkIdentical(ArrayDeque fromRoot, - FSA fsa1, int node1, BitSet visited1, - FSA fsa2, int node2, BitSet visited2) { - int arc1 = fsa1.getFirstArc(node1); - int arc2 = fsa2.getFirstArc(node2); - - if (visited1.get(node1) != visited2.get(node2)) { - throw new RuntimeException("Two nodes should either be visited or not visited: " - + Arrays.toString(fromRoot.toArray()) + " " - + " node1: " + node1 + " " - + " node2: " + node2); - } - visited1.set(node1); - visited2.set(node2); - - TreeSet labels1 = new TreeSet(); - TreeSet labels2 = new TreeSet(); - while (true) { - labels1.add((char) fsa1.getArcLabel(arc1)); - labels2.add((char) fsa2.getArcLabel(arc2)); - - arc1 = fsa1.getNextArc(arc1); - arc2 = fsa2.getNextArc(arc2); - - if (arc1 == 0 || arc2 == 0) { - if (arc1 != arc2) { - throw new 
RuntimeException("Different number of labels at path: " - + Arrays.toString(fromRoot.toArray())); - } - break; - } - } - - if (!labels1.equals(labels2)) { - throw new RuntimeException("Different sets of labels at path: " - + Arrays.toString(fromRoot.toArray()) + ":\n" - + labels1 + "\n" + labels2); - } - - // recurse. - for (char chr : labels1) { - byte label = (byte) chr; - fromRoot.push(Character.isLetterOrDigit(chr) ? Character.toString(chr) : Integer.toString(chr)); - - arc1 = fsa1.getArc(node1, label); - arc2 = fsa2.getArc(node2, label); - - if (fsa1.isArcFinal(arc1) != fsa2.isArcFinal(arc2)) { - throw new RuntimeException("Different final flag on arcs at: " - + Arrays.toString(fromRoot.toArray()) + ", label: " + label); - } - - if (fsa1.isArcTerminal(arc1) != fsa2.isArcTerminal(arc2)) { - throw new RuntimeException("Different terminal flag on arcs at: " - + Arrays.toString(fromRoot.toArray()) + ", label: " + label); - } - - if (!fsa1.isArcTerminal(arc1)) { - checkIdentical(fromRoot, - fsa1, fsa1.getEndNode(arc1), visited1, - fsa2, fsa2.getEndNode(arc2), visited2); - } - - fromRoot.pop(); - } - } -} diff --git a/src-test/morfologik/fsa/FSATraversalTest.java b/src-test/morfologik/fsa/FSATraversalTest.java deleted file mode 100644 index ddafb6d..0000000 --- a/src-test/morfologik/fsa/FSATraversalTest.java +++ /dev/null @@ -1,160 +0,0 @@ -package morfologik.fsa; - -import static org.junit.Assert.*; -import static morfologik.fsa.MatchResult.*; - -import java.io.*; -import java.nio.ByteBuffer; -import java.util.Arrays; -import java.util.HashSet; - - -import org.junit.Before; -import org.junit.Test; - -/** - * Tests {@link FSATraversal}. 
- */ -public final class FSATraversalTest { - private FSA fsa; - - /** - * - */ - @Before - public void setUp() throws Exception { - fsa = FSA.read(this.getClass().getResourceAsStream("en_tst.dict")); - } - - /** - * - */ - @Test - public void testTraversalWithIterable() { - int count = 0; - for (ByteBuffer bb : fsa.getSequences()) { - assertEquals(0, bb.arrayOffset()); - assertEquals(0, bb.position()); - count++; - } - assertEquals(346773, count); - } - - /** - * - */ - @Test - public void testPerfectHash() throws IOException { - byte[][] input = new byte[][] { - { 'a' }, - { 'a', 'b', 'a' }, - { 'a', 'c' }, - { 'b' }, - { 'b', 'a' }, - { 'c' }, - }; - - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - FSA s = FSABuilder.build(input); - - final byte[] fsaData = - new FSA5Serializer() - .withNumbers() - .serialize(s, new ByteArrayOutputStream()) - .toByteArray(); - - final FSA5 fsa = (FSA5) FSA.read(new ByteArrayInputStream(fsaData)); - final FSATraversal traversal = new FSATraversal(fsa); - - int i = 0; - for (byte [] seq : input) - { - assertEquals(new String(seq), i++, traversal.perfectHash(seq)); - } - - // Check if the total number of sequences is encoded at the root node. - assertEquals(6, fsa.getRightLanguageCount(fsa.getRootNode())); - - // Check sub/super sequence scenarios. 
- assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abax".getBytes("UTF-8"))); - assertEquals(SEQUENCE_IS_A_PREFIX, traversal.perfectHash("ab".getBytes("UTF-8"))); - assertEquals(NO_MATCH, traversal.perfectHash("d".getBytes("UTF-8"))); - assertEquals(NO_MATCH, traversal.perfectHash(new byte [] {0})); - - assertTrue(AUTOMATON_HAS_PREFIX < 0); - assertTrue(SEQUENCE_IS_A_PREFIX < 0); - assertTrue(NO_MATCH < 0); - } - - /** - * - */ - @Test - public void testRecursiveTraversal() { - final int[] counter = new int[] { 0 }; - - class Recursion { - public void dumpNode(final int node) { - int arc = fsa.getFirstArc(node); - do { - if (fsa.isArcFinal(arc)) { - counter[0]++; - } - - if (!fsa.isArcTerminal(arc)) { - dumpNode(fsa.getEndNode(arc)); - } - - arc = fsa.getNextArc(arc); - } while (arc != 0); - } - } - - new Recursion().dumpNode(fsa.getRootNode()); - - assertEquals(346773, counter[0]); - } - - /** - * Test {@link FSATraversal} and matching results. - */ - @Test - public void testMatch() throws IOException { - final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa")); - final FSATraversal traversalHelper = new FSATraversal(fsa); - - MatchResult m = traversalHelper.match("ax".getBytes()); - assertEquals(NO_MATCH, m.kind); - assertEquals(1, m.index); - assertEquals(new HashSet(Arrays.asList("ba", "c")), - suffixes(fsa, m.node)); - - assertEquals(EXACT_MATCH, - traversalHelper.match("aba".getBytes()).kind); - - m = traversalHelper.match("abalonger".getBytes()); - assertEquals(AUTOMATON_HAS_PREFIX, m.kind); - assertEquals("longer", "abalonger".substring(m.index)); - - m = traversalHelper.match("ab".getBytes()); - assertEquals(SEQUENCE_IS_A_PREFIX, m.kind); - assertEquals(new HashSet(Arrays.asList("a")), - suffixes(fsa, m.node)); - } - - /** - * Return all sequences reachable from a given node, as strings. 
- */ - private HashSet suffixes(FSA fsa, int node) { - HashSet result = new HashSet(); - for (ByteBuffer bb : fsa.getSequences(node)) - { - try { - result.add(new String(bb.array(), bb.position(), bb.remaining(), "UTF-8")); - } catch (UnsupportedEncodingException e) { - throw new RuntimeException(e); - } - } - return result; - } -} diff --git a/src-test/morfologik/fsa/SerializerTestBase.java b/src-test/morfologik/fsa/SerializerTestBase.java deleted file mode 100644 index ce373ba..0000000 --- a/src-test/morfologik/fsa/SerializerTestBase.java +++ /dev/null @@ -1,256 +0,0 @@ -package morfologik.fsa; - -import static morfologik.fsa.FSAFlags.NUMBERS; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.*; -import java.nio.ByteBuffer; -import java.util.*; - -import morfologik.util.BufferUtils; - -import org.junit.*; - -public abstract class SerializerTestBase { - @Test - public void testA() throws IOException { - byte[][] input = new byte[][] { - { 'a' }, - }; - - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - FSA s = FSABuilder.build(input); - - checkSerialization(input, s); - } - - @Test - public void testArcsSharing() throws IOException { - byte[][] input = new byte[][] { - { 'a', 'c', 'f' }, - { 'a', 'd', 'g' }, - { 'a', 'e', 'h' }, - { 'b', 'd', 'g' }, - { 'b', 'e', 'h' }, - }; - - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - FSA s = FSABuilder.build(input); - - checkSerialization(input, s); - } - - @Test - public void testFSA5SerializerSimple() throws IOException { - byte[][] input = new byte[][] { - { 'a' }, - { 'a', 'b', 'a' }, - { 'a', 'c' }, - { 'b' }, - { 'b', 'a' }, - { 'c' }, - }; - - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - FSA s = FSABuilder.build(input); - - checkSerialization(input, s); - } - - @Test - public void testNotMinimal() throws IOException { - byte[][] input = new byte[][] { - { 'a', 'b', 'a' }, - { 'b' }, - { 'b', 'a' } - }; - - Arrays.sort(input, 
FSABuilder.LEXICAL_ORDERING); - FSA s = FSABuilder.build(input); - - checkSerialization(input, s); - } - - /** - * - */ - @Test - public void testFSA5Bug0() throws IOException { - checkCorrect(new String[] { - "3-D+A+JJ", - "3-D+A+NN", - "4-F+A+NN", - "z+A+NN", }); - } - - /** - * - */ - @Test - public void testFSA5Bug1() throws IOException { - checkCorrect(new String[] { "+NP", "n+N", "n+NP", }); - } - - private void checkCorrect(String[] strings) throws IOException { - byte[][] input = new byte[strings.length][]; - for (int i = 0; i < strings.length; i++) { - input[i] = strings[i].getBytes("ISO8859-1"); - } - - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - FSA s = FSABuilder.build(input); - - checkSerialization(input, s); - } - - /** - * - */ - @Test - public void testEmptyInput() throws IOException { - byte[][] input = new byte[][] {}; - FSA s = FSABuilder.build(input); - - checkSerialization(input, s); - } - - /** - * - */ - @Test - public void test_abc() throws IOException { - testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("abc.fsa"))); - } - - /** - * - */ - @Test - public void test_minimal() throws IOException { - testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal.fsa"))); - } - - /** - * - */ - @Test - public void test_minimal2() throws IOException { - testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal2.fsa"))); - } - - /** - * - */ - @Test - public void test_en_tst() throws IOException { - testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("en_tst.dict"))); - } - - private void testBuiltIn(FSA fsa) throws IOException { - final ArrayList sequences = new ArrayList(); - - sequences.clear(); - for (ByteBuffer bb : fsa) { - sequences.add(Arrays.copyOf(bb.array(), bb.remaining())); - } - - Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING); - - final byte[][] in = sequences.toArray(new byte[sequences.size()][]); - FSA root = FSABuilder.build(in); - - // Check if the DFSA is correct first. 
- FSATestUtils.checkCorrect(in, root); - - // Check serialization. - checkSerialization(in, root); - } - - /** */ - private void checkSerialization(byte[][] input, FSA root) - throws IOException { - checkSerialization0(createSerializer(), input, root); - if (createSerializer().getFlags().contains(FSAFlags.NUMBERS)) { - checkSerialization0(createSerializer().withNumbers(), input, root); - } - } - - /** */ - private void checkSerialization0(FSASerializer serializer, - final byte[][] in, FSA root) throws IOException { - final byte[] fsaData = serializer.serialize(root, - new ByteArrayOutputStream()).toByteArray(); - - FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); - checkCorrect(in, fsa); - } - - /** - * Check if the FSA is correct with respect to the given input. - */ - protected void checkCorrect(byte[][] input, FSA fsa) { - // (1) All input sequences are in the right language. - HashSet rl = new HashSet(); - for (ByteBuffer bb : fsa) { - byte[] array = bb.array(); - int length = bb.remaining(); - rl.add(ByteBuffer.wrap(Arrays.copyOf(array, length))); - } - - HashSet uniqueInput = new HashSet(); - for (byte[] sequence : input) { - uniqueInput.add(ByteBuffer.wrap(sequence)); - } - - for (ByteBuffer sequence : uniqueInput) { - Assert.assertTrue("Not present in the right language: " - + BufferUtils.toString(sequence), rl.remove(sequence)); - } - - // (2) No other sequence _other_ than the input is in the right - // language. 
- Assert.assertEquals(0, rl.size()); - } - - @Test - public void testAutomatonWithNodeNumbers() throws IOException { - Assume.assumeTrue(createSerializer().getFlags().contains(FSAFlags.NUMBERS)); - - byte[][] input = new byte[][] { - { 'a' }, - { 'a', 'b', 'a' }, - { 'a', 'c' }, - { 'b' }, - { 'b', 'a' }, - { 'c' }, }; - - Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); - FSA s = FSABuilder.build(input); - - final byte[] fsaData = - createSerializer() - .withNumbers() - .serialize(s, new ByteArrayOutputStream()).toByteArray(); - - FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); - - // Ensure we have the NUMBERS flag set. - assertTrue(fsa.getFlags().contains(NUMBERS)); - - // Get all numbers from nodes. - byte[] buffer = new byte[128]; - final ArrayList result = new ArrayList(); - FSA5Test.walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result); - - Collections.sort(result); - assertEquals( - Arrays.asList("0 a", "1 aba", "2 ac", "3 b", "4 ba", "5 c"), - result); - } - - /** - * - */ - protected abstract FSASerializer createSerializer(); -} diff --git a/src-test/morfologik/fsa/abc-numbers.fsa b/src-test/morfologik/fsa/abc-numbers.fsa deleted file mode 100644 index d97091d..0000000 Binary files a/src-test/morfologik/fsa/abc-numbers.fsa and /dev/null differ diff --git a/src-test/morfologik/fsa/abc.fsa b/src-test/morfologik/fsa/abc.fsa deleted file mode 100644 index 68c0b96..0000000 Binary files a/src-test/morfologik/fsa/abc.fsa and /dev/null differ diff --git a/src-test/morfologik/fsa/abc.in b/src-test/morfologik/fsa/abc.in deleted file mode 100644 index 7bb8744..0000000 --- a/src-test/morfologik/fsa/abc.in +++ /dev/null @@ -1,6 +0,0 @@ -a -aba -ac -b -ba -c diff --git a/src-test/morfologik/fsa/en_tst.dict b/src-test/morfologik/fsa/en_tst.dict deleted file mode 100644 index 09cc22b..0000000 Binary files a/src-test/morfologik/fsa/en_tst.dict and /dev/null differ diff --git a/src-test/morfologik/fsa/minimal.fsa b/src-test/morfologik/fsa/minimal.fsa deleted file 
mode 100644 index 9d667b7..0000000 Binary files a/src-test/morfologik/fsa/minimal.fsa and /dev/null differ diff --git a/src-test/morfologik/fsa/minimal.in b/src-test/morfologik/fsa/minimal.in deleted file mode 100644 index 7ae8d81..0000000 --- a/src-test/morfologik/fsa/minimal.in +++ /dev/null @@ -1,3 +0,0 @@ -+NP -n+N -n+NP diff --git a/src-test/morfologik/fsa/minimal2.fsa b/src-test/morfologik/fsa/minimal2.fsa deleted file mode 100644 index e81f6d0..0000000 Binary files a/src-test/morfologik/fsa/minimal2.fsa and /dev/null differ diff --git a/src-test/morfologik/fsa/minimal2.in b/src-test/morfologik/fsa/minimal2.in deleted file mode 100644 index d28708d..0000000 --- a/src-test/morfologik/fsa/minimal2.in +++ /dev/null @@ -1,24 +0,0 @@ -3-D+A+JJ -3-D+A+NN -4-F+A+NN -4-H+A+JJ -z+A+NN -z-axis+A+NN -zB+A+NN -zZt+A+NNP -za-zen+A+NN -zabaglione+A+NN -zabagliones+B+NNS -zabajone+A+NN -zabajones+B+NNS -zabaione+A+NN -zabaiones+B+NNS -zabra+A+NN -zabras+B+NNS -zack+A+NN -zacaton+A+NN -zacatons+B+NNS -zacatun+A+NN -zaddik+A+NN -zaddiks+B+NNS -zaffar+A+NN \ No newline at end of file diff --git a/src-test/morfologik/stemming/DictionaryLookupTest.java b/src-test/morfologik/stemming/DictionaryLookupTest.java deleted file mode 100644 index e5c5204..0000000 --- a/src-test/morfologik/stemming/DictionaryLookupTest.java +++ /dev/null @@ -1,250 +0,0 @@ -package morfologik.stemming; - -import static org.junit.Assert.*; - -import java.io.IOException; -import java.net.URL; -import java.nio.ByteBuffer; -import java.util.*; - -import org.junit.Test; - -/* - * - */ -public class DictionaryLookupTest { - /* */ - @Test - public void testPrefixDictionaries() throws IOException { - final URL url = this.getClass().getResource("test-prefix.dict"); - final IStemmer s = new DictionaryLookup(Dictionary.read(url)); - - assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, - stem(s, "Rzeczypospolitej")); - assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, - stem(s, 
"Rzecząpospolitą")); - - // This word is not in the dictionary. - assertNoStemFor(s, "martygalski"); - } - - /* */ - @Test - public void testInfixDictionaries() throws IOException { - final URL url = this.getClass().getResource("test-infix.dict"); - final IStemmer s = new DictionaryLookup(Dictionary.read(url)); - - assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, - stem(s, "Rzeczypospolitej")); - assertArrayEquals(new String[] { "Rzeczycki", "adj:pl:nom:m" }, stem(s, - "Rzeczyccy")); - assertArrayEquals(new String[] { "Rzeczpospolita", "subst:irreg" }, - stem(s, "Rzecząpospolitą")); - - // This word is not in the dictionary. - assertNoStemFor(s, "martygalski"); - } - - /* */ - @Test - public void testWordDataIterator() throws IOException { - final URL url = this.getClass().getResource("test-infix.dict"); - final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); - - final HashSet entries = new HashSet(); - for (WordData wd : s) { - entries.add(wd.getWord() + " " + wd.getStem() + " " + wd.getTag()); - } - - // Make sure a sample of the entries is present. - assertTrue(entries.contains("Rzekunia Rzekuń subst:sg:gen:m")); - assertTrue(entries - .contains("Rzeczkowskie Rzeczkowski adj:sg:nom.acc.voc:n+adj:pl:acc.nom.voc:f.n")); - assertTrue(entries - .contains("Rzecząpospolitą Rzeczpospolita subst:irreg")); - assertTrue(entries - .contains("Rzeczypospolita Rzeczpospolita subst:irreg")); - assertTrue(entries - .contains("Rzeczypospolitych Rzeczpospolita subst:irreg")); - assertTrue(entries - .contains("Rzeczyckiej Rzeczycki adj:sg:gen.dat.loc:f")); - } - - /* */ - @Test - public void testWordDataCloning() throws IOException { - final URL url = this.getClass().getResource("test-infix.dict"); - final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); - - ArrayList words = new ArrayList(); - for (WordData wd : s) { - WordData clone = wd.clone(); - words.add(clone); - } - - // Reiterate and verify that we have the same entries. 
- final DictionaryLookup s2 = new DictionaryLookup(Dictionary.read(url)); - int i = 0; - for (WordData wd : s2) { - WordData clone = words.get(i++); - assertEqualSequences(clone.getStem(), wd.getStem()); - assertEqualSequences(clone.getTag(), wd.getTag()); - assertEqualSequences(clone.getWord(), wd.getWord()); - assertEqualSequences(clone.wordCharSequence, wd.wordCharSequence); - } - - // Check collections contract. - final HashSet entries = new HashSet(); - try { - entries.add(words.get(0)); - fail(); - } catch (RuntimeException e) { - // Expected. - } - } - - private void assertEqualSequences(CharSequence s1, CharSequence s2) { - assertEquals(s1.toString(), s2.toString()); - } - - /* */ - @Test - public void testWordDataFields() throws IOException { - final IStemmer s = new PolishStemmer(); - - final String word = "liga"; - final List response = s.lookup(word); - assertEquals(2, response.size()); - - final HashSet stems = new HashSet(); - final HashSet tags = new HashSet(); - for (WordData wd : response) { - stems.add(wd.getStem().toString()); - tags.add(wd.getTag().toString()); - assertSame(word, wd.getWord()); - } - assertTrue(stems.contains("ligać")); - assertTrue(stems.contains("liga")); - assertTrue(tags.contains("subst:sg:nom:f")); - assertTrue(tags.contains("verb:fin:sg:ter:imperf")); - - // Repeat to make sure we get the same values consistently. - for (WordData wd : response) { - stems.contains(wd.getStem().toString()); - tags.contains(wd.getTag().toString()); - } - - // Run the same consistency check for the returned buffers. - final ByteBuffer temp = ByteBuffer.allocate(100); - for (WordData wd : response) { - // Buffer should be copied. - final ByteBuffer copy = wd.getStemBytes(null); - final String stem = new String(copy.array(), copy.arrayOffset() - + copy.position(), copy.remaining(), "iso-8859-2"); - // The buffer should be present in stems set. - assertTrue(stem, stems.contains(stem)); - // Buffer large enough to hold the contents. 
- temp.clear(); - assertSame(temp, wd.getStemBytes(temp)); - // The copy and the clone should be identical. - assertEquals(0, copy.compareTo(temp)); - } - - for (WordData wd : response) { - // Buffer should be copied. - final ByteBuffer copy = wd.getTagBytes(null); - final String tag = new String(copy.array(), copy.arrayOffset() - + copy.position(), copy.remaining(), "iso-8859-2"); - // The buffer should be present in tags set. - assertTrue(tag, tags.contains(tag)); - // Buffer large enough to hold the contents. - temp.clear(); - assertSame(temp, wd.getTagBytes(temp)); - // The copy and the clone should be identical. - assertEquals(0, copy.compareTo(temp)); - } - - for (WordData wd : response) { - // Buffer should be copied. - final ByteBuffer copy = wd.getWordBytes(null); - assertNotNull(copy); - assertEquals(0, copy.compareTo(ByteBuffer.wrap(word - .getBytes("iso-8859-2")))); - } - } - - /* */ - @Test - public void testMultibyteEncodingUTF8() throws IOException { - final URL url = this.getClass() - .getResource("test-diacritics-utf8.dict"); - final IStemmer s = new DictionaryLookup(Dictionary.read(url)); - - assertArrayEquals(new String[] { "merge", "001" }, stem(s, "mergeam")); - assertArrayEquals(new String[] { "merge", "002" }, - stem(s, "merseserăm")); - } - - /* */ - @Test - public void testSynthesis() throws IOException { - final URL url = this.getClass().getResource("test-synth.dict"); - final IStemmer s = new DictionaryLookup(Dictionary.read(url)); - - assertArrayEquals(new String[] { "miała", null }, stem(s, - "mieć|verb:praet:sg:ter:f:?perf")); - assertArrayEquals(new String[] { "a", null }, stem(s, "a|conj")); - assertArrayEquals(new String[] {}, stem(s, "dziecko|subst:sg:dat:n")); - - // This word is not in the dictionary. 
- assertNoStemFor(s, "martygalski"); - } - - /* */ - @Test - public void testInputWithSeparators() throws IOException { - final URL url = this.getClass().getResource("test-separators.dict"); - final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); - - /* - * Attemp to reconstruct input sequences using WordData iterator. - */ - ArrayList sequences = new ArrayList(); - for (WordData wd : s) { - sequences.add("" + wd.getWord() + " " + wd.getStem() + " " - + wd.getTag()); - } - Collections.sort(sequences); - - assertEquals("token1 null null", sequences.get(0)); - assertEquals("token2 null null", sequences.get(1)); - assertEquals("token3 null +", sequences.get(2)); - assertEquals("token4 token2 null", sequences.get(3)); - assertEquals("token5 token2 null", sequences.get(4)); - assertEquals("token6 token2 +", sequences.get(5)); - assertEquals("token7 token2 token3+", sequences.get(6)); - assertEquals("token8 token2 token3++", sequences.get(7)); - } - - /* */ - public static String[] stem(IStemmer s, String word) { - ArrayList result = new ArrayList(); - for (WordData wd : s.lookup(word)) { - result.add(asString(wd.getStem())); - result.add(asString(wd.getTag())); - } - return result.toArray(new String[result.size()]); - } - - /* */ - public static String asString(CharSequence s) { - if (s == null) - return null; - return s.toString(); - } - - /* */ - public static void assertNoStemFor(IStemmer s, String word) { - assertArrayEquals(new String[] {}, stem(s, word)); - } -} diff --git a/src-test/morfologik/stemming/PerformanceTest.java b/src-test/morfologik/stemming/PerformanceTest.java deleted file mode 100644 index 3676d7d..0000000 --- a/src-test/morfologik/stemming/PerformanceTest.java +++ /dev/null @@ -1,73 +0,0 @@ -package morfologik.stemming; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.nio.ByteBuffer; - -import org.junit.BeforeClass; -import org.junit.Test; - -import 
com.carrotsearch.junitbenchmarks.AbstractBenchmark; -import com.carrotsearch.junitbenchmarks.BenchmarkOptions; - -/** - * Simple performance micro-benchmarks. - */ -@BenchmarkOptions(callgc = false, warmupRounds = 5, benchmarkRounds = 10) -public class PerformanceTest extends AbstractBenchmark { - /* Guard against escape analysis and HotSpot opts. */ - public volatile int guard; - - /* Test data. */ - static final int sequences = 100000; - static final String[] testWords = new String[sequences]; - static final PolishStemmer stemmer = new PolishStemmer(); - - /** - * Prepare test data. - */ - @BeforeClass - public static void prepare() throws UnsupportedEncodingException - { - final Dictionary dict = Dictionary.getForLanguage("pl"); - int i = 0; - for (ByteBuffer sequence : dict.fsa) { - testWords[i] = new String(sequence.array(), 0, - sequence.remaining(), dict.metadata.encoding); - testWords[i] = testWords[i].substring(0, testWords[i] - .indexOf(dict.metadata.separator)); - i++; - - if (i == testWords.length) - break; - } - } - - @Test - public void traversal_100000() throws IOException { - final Dictionary dict = Dictionary.getForLanguage("pl"); - - int max = sequences; - int guard = 0; - for (ByteBuffer sequence : dict.fsa) { - guard += sequence.remaining(); - if (--max == 0) - break; - } - - this.guard = guard; - } - - @Test - public void stemming_100000() throws IOException { - int guard = 0; - for (String word : testWords) { - for (WordData dta : stemmer.lookup(word)) - { - guard += dta.getStem().length(); - guard += dta.getTag().length(); - } - } - this.guard = guard; - } -} diff --git a/src-test/morfologik/stemming/PolishStemmerTest.java b/src-test/morfologik/stemming/PolishStemmerTest.java deleted file mode 100644 index b3230ac..0000000 --- a/src-test/morfologik/stemming/PolishStemmerTest.java +++ /dev/null @@ -1,54 +0,0 @@ -package morfologik.stemming; - -import static morfologik.stemming.DictionaryLookupTest.assertNoStemFor; -import static 
morfologik.stemming.DictionaryLookupTest.stem; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; - -import java.io.IOException; -import java.util.HashSet; -import java.util.TreeSet; - -import org.junit.Ignore; -import org.junit.Test; - -/* - * - */ -public class PolishStemmerTest { - /* */ - @Test - public void testLexemes() throws IOException { - PolishStemmer s = new PolishStemmer(); - - assertEquals("żywotopisarstwo", stem(s, "żywotopisarstwie")[0]); - assertEquals("abradować", stem(s, "abradowałoby")[0]); - - assertArrayEquals(new String[] { "żywotopisarstwo", "subst:sg:loc:n" }, - stem(s, "żywotopisarstwie")); - assertArrayEquals(new String[] { "bazia", "subst:pl:inst:f" }, stem(s, - "baziami")); - - // This word is not in the dictionary. - assertNoStemFor(s, "martygalski"); - } - - /* */ - @Test - @Ignore - public void listUniqueTags() throws IOException { - HashSet forms = new HashSet(); - for (WordData wd : new PolishStemmer()) { - final CharSequence chs = wd.getTag(); - if (chs == null) { - System.err.println("Missing tag for: " + wd.getWord()); - continue; - } - forms.add(chs.toString()); - } - - for (String s : new TreeSet(forms)) { - System.out.println(s); - } - } -} diff --git a/src-test/morfologik/stemming/StringDecoderBenchmarkTest.java b/src-test/morfologik/stemming/StringDecoderBenchmarkTest.java deleted file mode 100644 index e8c6c17..0000000 --- a/src-test/morfologik/stemming/StringDecoderBenchmarkTest.java +++ /dev/null @@ -1,62 +0,0 @@ -package morfologik.stemming; - -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.CharsetEncoder; - -import org.junit.Ignore; -import org.junit.Test; - -import com.carrotsearch.junitbenchmarks.AbstractBenchmark; -import com.carrotsearch.junitbenchmarks.BenchmarkOptions; - -@BenchmarkOptions(callgc = false, warmupRounds = 5, benchmarkRounds = 20) -@Ignore -public class StringDecoderBenchmarkTest 
extends AbstractBenchmark { - /* Guard against escape analysis and HotSpot opts. */ - public volatile int guard; - - private final int sequences = 1000000; - - final String input = "dbaoidbhoei"; - final CharBuffer chars = CharBuffer.allocate(100); - final ByteBuffer bytes = ByteBuffer.allocate(100); - final CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder(); - - /** - * This is a simple comparison of performance converting a string to bytes - * using String.getBytes and CharsetEncoder (which String.getBytes uses - * internally in SUN's JDK). - */ - @Test - public void stringGetBytes() throws Exception { - int guard = 0; - for (int i = 0; i < sequences; i++) { - guard += input.getBytes("UTF-8").length; - } - this.guard = guard; - } - - @Test - public void charsetEncoder() throws Exception { - int guard = 0; - for (int i = 0; i < sequences; i++) { - chars.clear(); - for (int j = 0; j < input.length(); j++) { - chars.put(input.charAt(j)); - } - chars.flip(); - - bytes.clear(); - chars.mark(); - encoder.encode(chars, bytes, true); - bytes.flip(); - chars.reset(); - - guard += chars.remaining(); - } - - this.guard = guard; - } -} diff --git a/src-test/morfologik/stemming/test-diacritics-utf8.dict b/src-test/morfologik/stemming/test-diacritics-utf8.dict deleted file mode 100644 index 2a62f21..0000000 Binary files a/src-test/morfologik/stemming/test-diacritics-utf8.dict and /dev/null differ diff --git a/src-test/morfologik/stemming/test-diacritics-utf8.info b/src-test/morfologik/stemming/test-diacritics-utf8.info deleted file mode 100644 index ec3a98e..0000000 --- a/src-test/morfologik/stemming/test-diacritics-utf8.info +++ /dev/null @@ -1,10 +0,0 @@ -# -# Dictionary properties. 
-# - -fsa.dict.separator=+ -fsa.dict.encoding=UTF-8 - -fsa.dict.uses-prefixes=false -fsa.dict.uses-infixes=false - diff --git a/src-test/morfologik/stemming/test-infix.dict b/src-test/morfologik/stemming/test-infix.dict deleted file mode 100644 index cc91f70..0000000 Binary files a/src-test/morfologik/stemming/test-infix.dict and /dev/null differ diff --git a/src-test/morfologik/stemming/test-infix.info b/src-test/morfologik/stemming/test-infix.info deleted file mode 100644 index 30e68ba..0000000 --- a/src-test/morfologik/stemming/test-infix.info +++ /dev/null @@ -1,9 +0,0 @@ -# -# Dictionary properties. -# - -fsa.dict.separator=+ -fsa.dict.encoding=iso-8859-2 - -fsa.dict.uses-prefixes=true -fsa.dict.uses-infixes=true \ No newline at end of file diff --git a/src-test/morfologik/stemming/test-prefix.dict b/src-test/morfologik/stemming/test-prefix.dict deleted file mode 100644 index d0bed4c..0000000 Binary files a/src-test/morfologik/stemming/test-prefix.dict and /dev/null differ diff --git a/src-test/morfologik/stemming/test-prefix.info b/src-test/morfologik/stemming/test-prefix.info deleted file mode 100644 index 59b5376..0000000 --- a/src-test/morfologik/stemming/test-prefix.info +++ /dev/null @@ -1,9 +0,0 @@ -# -# Dictionary properties. -# - -fsa.dict.separator=+ -fsa.dict.encoding=iso-8859-2 - -fsa.dict.uses-prefixes=true -fsa.dict.uses-infixes=false \ No newline at end of file diff --git a/src-test/morfologik/stemming/test-separators.dict b/src-test/morfologik/stemming/test-separators.dict deleted file mode 100644 index a71b9e7..0000000 Binary files a/src-test/morfologik/stemming/test-separators.dict and /dev/null differ diff --git a/src-test/morfologik/stemming/test-separators.info b/src-test/morfologik/stemming/test-separators.info deleted file mode 100644 index c4a260f..0000000 --- a/src-test/morfologik/stemming/test-separators.info +++ /dev/null @@ -1,9 +0,0 @@ -# -# Dictionary properties. 
-# - -fsa.dict.separator=+ -fsa.dict.encoding=iso8859-1 - -fsa.dict.uses-prefixes=false -fsa.dict.uses-infixes=false \ No newline at end of file diff --git a/src-test/morfologik/stemming/test-separators.txt b/src-test/morfologik/stemming/test-separators.txt deleted file mode 100644 index cd77945..0000000 --- a/src-test/morfologik/stemming/test-separators.txt +++ /dev/null @@ -1,8 +0,0 @@ -token1+ -token2++ -token3+++ -token4+token2 -token5+token2+ -token6+token2++ -token7+token2+token3+ -token8+token2+token3++ \ No newline at end of file diff --git a/src-test/morfologik/stemming/test-synth.dict b/src-test/morfologik/stemming/test-synth.dict deleted file mode 100644 index 6890253..0000000 Binary files a/src-test/morfologik/stemming/test-synth.dict and /dev/null differ diff --git a/src-test/morfologik/stemming/test-synth.info b/src-test/morfologik/stemming/test-synth.info deleted file mode 100644 index ffce33e..0000000 --- a/src-test/morfologik/stemming/test-synth.info +++ /dev/null @@ -1,6 +0,0 @@ -# -# Dictionary properties. 
-# - -fsa.dict.separator=+ -fsa.dict.encoding=iso-8859-2 \ No newline at end of file diff --git a/src-test/morfologik/tools/LauncherTest.java b/src-test/morfologik/tools/LauncherTest.java deleted file mode 100644 index da94df7..0000000 --- a/src-test/morfologik/tools/LauncherTest.java +++ /dev/null @@ -1,26 +0,0 @@ -package morfologik.tools; - -import java.util.Map; - -import morfologik.tools.Launcher.ToolInfo; - -import org.junit.Assert; -import org.junit.Test; - -/* - * - */ -public class LauncherTest { - /* */ - @Test - public void testTools() throws Exception { - for (Map.Entry e : Launcher.initTools().entrySet()) { - try { - e.getValue().invoke(new String[] {}); - } catch (Throwable t) { - Assert.fail("Unable to launch " + e.getKey() + ": " - + t.getMessage()); - } - } - } -} diff --git a/src-test/morfologik/tools/MorphEncoderTest.java b/src-test/morfologik/tools/MorphEncoderTest.java deleted file mode 100644 index a9b8f0d..0000000 --- a/src-test/morfologik/tools/MorphEncoderTest.java +++ /dev/null @@ -1,125 +0,0 @@ -package morfologik.tools; - -import static org.junit.Assert.assertEquals; - -import java.io.UnsupportedEncodingException; - -import morfologik.tools.MorphEncoder; - -import org.junit.Before; -import org.junit.Test; - -import static morfologik.tools.MorphEncoder.*; - -/* - * - */ -public class MorphEncoderTest { - private MorphEncoder encoder; - - @Before - public void setUp() { - encoder = new MorphEncoder(); - } - - @Test - public void testCommonPrefix() { - assertEquals(3, commonPrefix("abc".getBytes(), "abcd".getBytes())); - assertEquals(0, commonPrefix("abc".getBytes(), "cba".getBytes())); - } - - @Test - public void testStandardEncode() throws UnsupportedEncodingException { - assertEquals("abc+Ad+tag", - asString(encoder.standardEncode( - "abc".getBytes("UTF-8"), - "abcd".getBytes("UTF-8"), - "tag".getBytes("UTF-8")), "UTF-8")); - - assertEquals("abc+Dxyz+tag", encoder.standardEncodeUTF8("abc", "xyz", "tag")); - assertEquals("abc+Bć+tag", 
encoder.standardEncodeUTF8("abc", "abć", "tag")); - } - - @Test - public void testSeparatorChange() throws UnsupportedEncodingException { - assertEquals("abc+Ad+tag", - asString(encoder.standardEncode( - "abc".getBytes("UTF-8"), - "abcd".getBytes("UTF-8"), - "tag".getBytes("UTF-8")), "UTF-8")); - - encoder = new MorphEncoder((byte) '_'); - assertEquals("abc_Ad_tag", - asString(encoder.standardEncode( - "abc".getBytes("UTF-8"), - "abcd".getBytes("UTF-8"), - "tag".getBytes("UTF-8")), "UTF-8")); - - encoder = new MorphEncoder((byte) '\t'); - assertEquals("abc\tAd\ttag", - asString(encoder.standardEncode( - "abc".getBytes("UTF-8"), - "abcd".getBytes("UTF-8"), - "tag".getBytes("UTF-8")), "UTF-8")); - } - - @Test - public void testPrefixEncode() throws UnsupportedEncodingException { - assertEquals("abc+AAd+tag", asString( - encoder.prefixEncode( - "abc".getBytes("UTF-8"), - "abcd".getBytes("UTF-8"), - "tag".getBytes("UTF-8")), "UTF-8")); - - assertEquals("abcd+AB+tag", asString( - encoder.prefixEncode( - "abcd".getBytes("UTF-8"), - "abc".getBytes("UTF-8"), - "tag".getBytes("UTF-8")), "US-ASCII")); - - assertEquals("abc+ADxyz+tag", - encoder.prefixEncodeUTF8("abc", "xyz", "tag")); - assertEquals("abc+ABć+tag", - encoder.prefixEncodeUTF8("abc", "abć", "tag")); - assertEquals("postmodernizm+AAu+xyz", - encoder.prefixEncodeUTF8("postmodernizm", "postmodernizmu", "xyz")); - assertEquals("postmodernizmu+AB+xyz", - encoder.prefixEncodeUTF8("postmodernizmu", "postmodernizm", "xyz")); - assertEquals("nieduży+DA+adj", - encoder.prefixEncodeUTF8("nieduży", "duży", "adj")); - assertEquals("postmodernizm+ANmodernizm+xyz", - encoder.prefixEncodeUTF8("postmodernizm", "modernizm", "xyz")); - } - - @Test - public void testInfixEncode() throws UnsupportedEncodingException { - assertEquals("abc+AAAd+tag", encoder.infixEncodeUTF8("abc", "abcd", "tag")); - assertEquals("abcd+AAB+tag", encoder.infixEncodeUTF8("abcd", "abc", "tag")); - assertEquals("abc+AADxyz+tag", 
encoder.infixEncodeUTF8("abc", "xyz", "tag")); - assertEquals("abc+AABć+tag", encoder.infixEncodeUTF8("abc", "abć", "tag")); - assertEquals("postmodernizm+AAAu+xyz", - encoder.infixEncodeUTF8("postmodernizm", "postmodernizmu", "xyz")); - assertEquals("postmodernizmu+AAB+xyz", - encoder.infixEncodeUTF8("postmodernizmu", "postmodernizm", "xyz")); - assertEquals("nieduży+ADA+adj", - encoder.infixEncodeUTF8("nieduży", "duży", "adj")); - - // real infix cases - assertEquals("kcal+ABA+xyz", encoder.infixEncodeUTF8("kcal", "cal", "xyz")); - assertEquals("aillent+BBCr+xyz", encoder.infixEncodeUTF8("aillent", "aller", "xyz")); - assertEquals("laquelle+AAHequel+D f s", encoder.infixEncodeUTF8("laquelle", "lequel", "D f s")); - assertEquals("ccal+ABA+test", encoder.infixEncodeUTF8("ccal", "cal", "test")); - } - - @Test - public void testUTF8Boundary() throws UnsupportedEncodingException { - assertEquals("passagère+Eer+tag", encoder.standardEncodeUTF8("passagère", "passager", "tag")); - assertEquals("passagère+AAEer+tag", encoder.infixEncodeUTF8("passagère", "passager", "tag")); - assertEquals("passagère+AEer+tag", encoder.prefixEncodeUTF8("passagère", "passager", "tag")); - } - - @Test - public void testAsString() throws UnsupportedEncodingException { - assertEquals("passagère", asString("passagère".getBytes("UTF-8"), "UTF-8")); - } -} diff --git a/src-test/morfologik/tools/MorphEncodingToolTest.java b/src-test/morfologik/tools/MorphEncodingToolTest.java deleted file mode 100644 index c3d2559..0000000 --- a/src-test/morfologik/tools/MorphEncodingToolTest.java +++ /dev/null @@ -1,110 +0,0 @@ -package morfologik.tools; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; - -import org.junit.Assert; -import org.junit.Test; - -/* - * - */ -public class MorphEncodingToolTest { - - @Test - public void testTool() 
throws Exception { - // Create a simple plain text file. - File input = File.createTempFile("input", "in"); - File output = File.createTempFile("output", "fsa.txt"); - input.deleteOnExit(); - output.deleteOnExit(); - - // Populate the file with data. - PrintWriter w = new PrintWriter(new OutputStreamWriter( - new FileOutputStream(input), "UTF-8")); - w.println("passagère\tpassager\ttag"); - w.println("nieduży\tduży\ttest"); - w.println("abcd\tabc\txyz"); - w.close(); - - // suffix - MorphEncodingTool.main(new String[] { - "--input", input.getAbsolutePath(), - "--output", output.getAbsolutePath(), - "-suf" }); - - BufferedReader testOutput = new BufferedReader(new InputStreamReader( - new FileInputStream(output.getAbsolutePath()), "UTF-8")); - Assert.assertEquals("passagère+Eer+tag", testOutput.readLine()); - Assert.assertEquals("nieduży+Iduży+test", testOutput.readLine()); - Assert.assertEquals("abcd+B+xyz", testOutput.readLine()); - - testOutput.close(); - - // prefix - MorphEncodingTool.main(new String[] { - "--input", input.getAbsolutePath(), - "--output", output.getAbsolutePath(), - "-pre" }); - - testOutput = new BufferedReader(new InputStreamReader( - new FileInputStream(output.getAbsolutePath()), "UTF-8")); - Assert.assertEquals("passagère+AEer+tag", testOutput.readLine()); - Assert.assertEquals("nieduży+DA+test", testOutput.readLine()); - Assert.assertEquals("abcd+AB+xyz", testOutput.readLine()); - - testOutput.close(); - - // infix - MorphEncodingTool.main(new String[] { - "--input", input.getAbsolutePath(), - "--output", output.getAbsolutePath(), - "-inf" }); - - testOutput = new BufferedReader(new InputStreamReader( - new FileInputStream(output.getAbsolutePath()), "UTF-8")); - Assert.assertEquals("passagère+AAEer+tag", testOutput.readLine()); - Assert.assertEquals("nieduży+ADA+test", testOutput.readLine()); - Assert.assertEquals("abcd+AAB+xyz", testOutput.readLine()); - - testOutput.close(); - - } - - /* */ - @Test - public void testStemmingFile() throws 
Exception { - // Create a simple plain text file. - File input = File.createTempFile("input", "in"); - File output = File.createTempFile("output", "fsa.txt"); - input.deleteOnExit(); - output.deleteOnExit(); - - // Populate the file with data. - - // stemming only - PrintWriter w = new PrintWriter(new OutputStreamWriter( - new FileOutputStream(input), "UTF-8")); - w.println("passagère\tpassager"); - w.println("nieduży\tduży"); - w.println("abcd\tabc"); - w.close(); - - MorphEncodingTool.main(new String[] { "--input", - input.getAbsolutePath(), "--output", output.getAbsolutePath(), - "-suf" }); - - BufferedReader testOutput = new BufferedReader(new InputStreamReader( - new FileInputStream(output.getAbsolutePath()), "UTF-8")); - Assert.assertEquals("passagère+Eer+", testOutput.readLine()); - Assert.assertEquals("nieduży+Iduży+", testOutput.readLine()); - Assert.assertEquals("abcd+B+", testOutput.readLine()); - - testOutput.close(); - } -} diff --git a/src-test/morfologik/tools/Text2FSA5Test.java b/src-test/morfologik/tools/Text2FSA5Test.java deleted file mode 100644 index 573c5da..0000000 --- a/src-test/morfologik/tools/Text2FSA5Test.java +++ /dev/null @@ -1,37 +0,0 @@ -package morfologik.tools; - -import java.io.*; - -import morfologik.fsa.*; - -import org.junit.Assert; -import org.junit.Test; - -/* - * - */ -public class Text2FSA5Test { - @Test - public void testTool() throws Exception { - // Create a simple plain text file. - File input = File.createTempFile("input", "in"); - File output = File.createTempFile("output", "fsa"); - input.deleteOnExit(); - output.deleteOnExit(); - - // Populate the file with data. 
- PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(input), "UTF-8")); - w.println("b"); - w.println("cab"); - w.println("ab"); - w.close(); - - FSABuildTool.main(new String [] { - "--input", input.getAbsolutePath(), - "--output", output.getAbsolutePath() - }); - - FSA5 fsa = FSA.read(new FileInputStream(output)); - Assert.assertEquals(3, new FSAInfo(fsa).finalStatesCount); - } -} diff --git a/src-test/morfologik/util/MinMax.java b/src-test/morfologik/util/MinMax.java deleted file mode 100644 index 4af6118..0000000 --- a/src-test/morfologik/util/MinMax.java +++ /dev/null @@ -1,21 +0,0 @@ -package morfologik.util; - -/** - * Minimum/maximum and range. - */ -public final class MinMax -{ - public final int min; - public final int max; - - public MinMax(int min, int max) - { - this.min = Math.min(min, max); - this.max = Math.max(min, max); - } - - public int range() - { - return max - min; - } -} \ No newline at end of file diff --git a/src/morfologik/dictionaries/pl.LICENSE b/src/morfologik/dictionaries/pl.LICENSE deleted file mode 100644 index 8529618..0000000 --- a/src/morfologik/dictionaries/pl.LICENSE +++ /dev/null @@ -1,8 +0,0 @@ -LICENCE - -The dictionary comes from Morfologik project. Morfologik uses data from -Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and -is licenced on the terms of (inter alia) LGPL and Creative Commons -ShareAlike. The part-of-speech tags were added in Morfologik project and -are not found in the data from sjp.pl. The tagset is similar to IPI PAN -tagset. 
diff --git a/src/morfologik/dictionaries/pl.dict b/src/morfologik/dictionaries/pl.dict deleted file mode 100644 index 1ebd28a..0000000 Binary files a/src/morfologik/dictionaries/pl.dict and /dev/null differ diff --git a/src/morfologik/dictionaries/pl.info b/src/morfologik/dictionaries/pl.info deleted file mode 100644 index 0c933c9..0000000 --- a/src/morfologik/dictionaries/pl.info +++ /dev/null @@ -1,13 +0,0 @@ -# -# Dictionary metadata. -# - -fsa.dict.author=morfologik.blogspot.com -fsa.dict.created=19.11.2010 -fsa.dict.license=LGPL or Creative Commons ShareAlike license (pick any suitable). http://morfologik.blogspot.com - -fsa.dict.separator=+ -fsa.dict.encoding=iso-8859-2 - -fsa.dict.uses-prefixes=true -fsa.dict.uses-infixes=true diff --git a/src/morfologik/fsa/CFSA.java b/src/morfologik/fsa/CFSA.java deleted file mode 100644 index 695664d..0000000 --- a/src/morfologik/fsa/CFSA.java +++ /dev/null @@ -1,364 +0,0 @@ -package morfologik.fsa; - -import static morfologik.fsa.FSAFlags.*; -import static morfologik.util.FileUtils.readFully; - -import java.io.*; -import java.util.*; - -/** - * CFSA (Compact Finite State Automaton) binary format implementation. This is a - * slightly reorganized version of {@link FSA5} offering smaller automata size - * at some (minor) performance penalty. - * - *

Note: Serialize to {@link CFSA2} for new code.

- * - *

The encoding of automaton body is as follows.

- * - *
- * ---- FSA header (standard)
- * Byte                            Description 
- *       +-+-+-+-+-+-+-+-+\
- *     0 | | | | | | | | | +------ '\'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     1 | | | | | | | | | +------ 'f'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     2 | | | | | | | | | +------ 's'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     3 | | | | | | | | | +------ 'a'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     4 | | | | | | | | | +------ version (fixed 0xc5)
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     5 | | | | | | | | | +------ filler character
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     6 | | | | | | | | | +------ annot character
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     7 |C|C|C|C|G|G|G|G| +------ C - node data size (ctl), G - address size (gotoLength)
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *  8-32 | | | | | | | | | +------ labels mapped for type (1) of arc encoding. 
- *       : : : : : : : : : |
- *       +-+-+-+-+-+-+-+-+/
- * 
- * ---- Start of a node; only if automaton was compiled with NUMBERS option.
- * 
- * Byte
- *        +-+-+-+-+-+-+-+-+\
- *      0 | | | | | | | | | \  LSB
- *        +-+-+-+-+-+-+-+-+  +
- *      1 | | | | | | | | |  |      number of strings recognized
- *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
- *        : : : : : : : : :  |      from this node.
- *        +-+-+-+-+-+-+-+-+  +
- *  ctl-1 | | | | | | | | | /  MSB
- *        +-+-+-+-+-+-+-+-+/
- *        
- * ---- A vector of node's arcs. Conditional format, depending on flags.
- * 
- * 1) NEXT bit set, mapped arc label. 
- * 
- *                +--------------- arc's label mapped in M bits if M's field value > 0
- *                | +------------- node pointed to is next
- *                | | +----------- the last arc of the node
- *         _______| | | +--------- the arc is final
- *        /       | | | |
- *       +-+-+-+-+-+-+-+-+\
- *     0 |M|M|M|M|M|1|L|F| +------ flags + (M) index of the mapped label.
- *       +-+-+-+-+-+-+-+-+/
- * 
- * 2) NEXT bit set, label separate.
- * 
- *                +--------------- arc's label stored separately (M's field is zero).
- *                | +------------- node pointed to is next
- *                | | +----------- the last arc of the node
- *                | | | +--------- the arc is final
- *                | | | |
- *       +-+-+-+-+-+-+-+-+\
- *     0 |0|0|0|0|0|1|L|F| +------ flags
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     1 | | | | | | | | | +------ label
- *       +-+-+-+-+-+-+-+-+/
- * 
- * 3) NEXT bit not set. Full arc.
- * 
- *                  +------------- node pointed to is next
- *                  | +----------- the last arc of the node
- *                  | | +--------- the arc is final
- *                  | | |
- *       +-+-+-+-+-+-+-+-+\
- *     0 |A|A|A|A|A|0|L|F| +------ flags + (A) address field, lower bits
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     1 | | | | | | | | | +------ label
- *       +-+-+-+-+-+-+-+-+/
- *       : : : : : : : : :       
- *       +-+-+-+-+-+-+-+-+\
- * gtl-1 |A|A|A|A|A|A|A|A| +------ address, continuation (MSB)
- *       +-+-+-+-+-+-+-+-+/
- * 
- */ -public final class CFSA extends FSA { - /** - * Automaton header version value. - */ - public static final byte VERSION = (byte) 0xC5; - - /** - * Bitmask indicating that an arc corresponds to the last character of a - * sequence available when building the automaton. - */ - public static final int BIT_FINAL_ARC = 1 << 0; - - /** - * Bitmask indicating that an arc is the last one of the node's list and the - * following one belongs to another node. - */ - public static final int BIT_LAST_ARC = 1 << 1; - - /** - * Bitmask indicating that the target node of this arc follows it in the - * compressed automaton structure (no goto field). - */ - public static final int BIT_TARGET_NEXT = 1 << 2; - - /** - * An array of bytes with the internal representation of the automaton. - * Please see the documentation of this class for more information on how - * this structure is organized. - */ - public byte[] arcs; - - /** - * The length of the node header structure (if the automaton was compiled with - * NUMBERS option). Otherwise zero. - */ - public final int nodeDataLength; - - /** - * Flags for this automaton version. - */ - private final Set flags; - - /** - * Number of bytes each address takes in full, expanded form (goto length). - */ - public final int gtl; - - /** - * Label mapping for arcs of type (1) (see class documentation). The array - * is indexed by mapped label's value and contains the original label. - */ - public final byte[] labelMapping; - - /** - * Creates a new automaton, reading it from a file in FSA format, version 5. - */ - public CFSA(InputStream fsaStream) throws IOException { - // Read the header first. - final FSAHeader header = FSAHeader.read(fsaStream); - - // Ensure we have the correct version. - if (header.version != VERSION) { - throw new IOException("This class can read FSA version 5 only: " + header.version); - } - - /* - * Determine if the automaton was compiled with NUMBERS. If so, modify - * ctl and goto fields accordingly. 
- */ - flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); - if ((header.gtl & 0xf0) != 0) { - this.nodeDataLength = (header.gtl >>> 4) & 0x0f; - this.gtl = header.gtl & 0x0f; - flags.add(NUMBERS); - } else { - this.nodeDataLength = 0; - this.gtl = header.gtl & 0x0f; - } - - /* - * Read mapping dictionary. - */ - labelMapping = new byte[1 << 5]; - readFully(fsaStream, labelMapping); - - /* - * Read arcs' data. - */ - arcs = readFully(fsaStream); - } - - /** - * Returns the start node of this automaton. May return 0 if - * the start node is also an end node. - */ - @Override - public int getRootNode() { - // Skip dummy node marking terminating state. - final int epsilonNode = skipArc(getFirstArc(0)); - - // And follow the epsilon node's first (and only) arc. - return getDestinationNodeOffset(getFirstArc(epsilonNode)); - } - - /** - * {@inheritDoc} - */ - @Override - public final int getFirstArc(int node) { - return nodeDataLength + node; - } - - /** - * {@inheritDoc} - */ - @Override - public final int getNextArc(int arc) { - if (isArcLast(arc)) - return 0; - else - return skipArc(arc); - } - - /** - * {@inheritDoc} - */ - @Override - public int getArc(int node, byte label) { - for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { - if (getArcLabel(arc) == label) - return arc; - } - - // An arc labeled with "label" not found. 
- return 0; - } - - /** - * {@inheritDoc} - */ - @Override - public int getEndNode(int arc) { - final int nodeOffset = getDestinationNodeOffset(arc); - if (0 == nodeOffset) { - throw new RuntimeException("This is a terminal arc [" + arc + "]"); - } - return nodeOffset; - } - - /** - * {@inheritDoc} - */ - @Override - public byte getArcLabel(int arc) { - if (isNextSet(arc) && isLabelCompressed(arc)) { - return this.labelMapping[(arcs[arc] >>> 3) & 0x1f]; - } else { - return arcs[arc + 1]; - } - } - - /** - * {@inheritDoc} - */ - @Override - public int getRightLanguageCount(int node) { - assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS."; - return FSA5.decodeFromBytes(arcs, node, nodeDataLength); - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isArcFinal(int arc) { - return (arcs[arc] & BIT_FINAL_ARC) != 0; - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isArcTerminal(int arc) { - return (0 == getDestinationNodeOffset(arc)); - } - - /** - * Returns true if this arc has NEXT bit set. - * - * @see #BIT_LAST_ARC - */ - public boolean isArcLast(int arc) { - return (arcs[arc] & BIT_LAST_ARC) != 0; - } - - /** - * @see #BIT_TARGET_NEXT - */ - public boolean isNextSet(int arc) { - return (arcs[arc] & BIT_TARGET_NEXT) != 0; - } - - /** - * Returns true if the label is compressed inside flags byte. - */ - public boolean isLabelCompressed(int arc) { - assert isNextSet(arc) : "Only applicable to arcs with NEXT bit."; - return (arcs[arc] & (-1 << 3)) != 0; - } - - /** - * {@inheritDoc} - * - *

For this automaton version, an additional {@link FSAFlags#NUMBERS} flag - * may be set to indicate the automaton contains extra fields for each node.

- */ - public Set getFlags() { - return Collections.unmodifiableSet(flags); - } - - /** - * Returns the address of the node pointed to by this arc. - */ - final int getDestinationNodeOffset(int arc) { - if (isNextSet(arc)) { - /* The destination node follows this arc in the array. */ - return skipArc(arc); - } else { - /* - * The destination node address has to be extracted from the arc's - * goto field. - */ - int r = 0; - for (int i = gtl; --i >= 1;) { - r = r << 8 | (arcs[arc + 1 + i] & 0xff); - } - r = r << 8 | (arcs[arc] & 0xff); - return r >>> 3; - } - } - - /** - * Read the arc's layout and skip as many bytes, as needed, to skip it. - */ - private int skipArc(int offset) { - if (isNextSet(offset)) { - if (isLabelCompressed(offset)) { - offset++; - } else { - offset += 1 + 1; - } - } else { - offset += 1 + gtl; - } - return offset; - } -} \ No newline at end of file diff --git a/src/morfologik/fsa/CFSA2.java b/src/morfologik/fsa/CFSA2.java deleted file mode 100644 index 6955da4..0000000 --- a/src/morfologik/fsa/CFSA2.java +++ /dev/null @@ -1,404 +0,0 @@ -package morfologik.fsa; - -import static morfologik.util.FileUtils.readFully; - -import java.io.IOException; -import java.io.InputStream; -import java.util.EnumSet; -import java.util.Set; - -import morfologik.util.FileUtils; - -/** - * CFSA (Compact Finite State Automaton) binary format implementation, version 2: - *
    - *
  • {@link #BIT_TARGET_NEXT} applicable on all arcs, not necessarily the last one.
  • - *
  • v-coded goto field
  • - *
  • v-coded perfect hashing numbers, if any
  • - *
  • 31 most frequent labels integrated with flags byte
  • - *
- * - *

The encoding of automaton body is as follows.

- * - *
- * ---- CFSA header
- * Byte                            Description 
- *       +-+-+-+-+-+-+-+-+\
- *     0 | | | | | | | | | +------ '\'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     1 | | | | | | | | | +------ 'f'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     2 | | | | | | | | | +------ 's'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     3 | | | | | | | | | +------ 'a'
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     4 | | | | | | | | | +------ version (fixed 0xc6)
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     5 | | | | | | | | | +----\
- *       +-+-+-+-+-+-+-+-+/      \ flags [MSB first]
- *       +-+-+-+-+-+-+-+-+\      /
- *     6 | | | | | | | | | +----/
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     7 | | | | | | | | | +------ label lookup table size
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *  8-32 | | | | | | | | | +------ label value lookup table 
- *       : : : : : : : : : |
- *       +-+-+-+-+-+-+-+-+/
- * 
- * ---- Start of a node; only if automaton was compiled with NUMBERS option.
- * 
- * Byte
- *        +-+-+-+-+-+-+-+-+\
- *      0 | | | | | | | | | \  
- *        +-+-+-+-+-+-+-+-+  +
- *      1 | | | | | | | | |  |      number of strings recognized
- *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
- *        : : : : : : : : :  |      from this node. v-coding
- *        +-+-+-+-+-+-+-+-+  +
- *        | | | | | | | | | /  
- *        +-+-+-+-+-+-+-+-+/
- *
- * ---- A vector of this node's arcs. An arc's layout depends on the combination of flags.
- * 
- * 1) NEXT bit set, mapped arc label. 
- * 
- *        +----------------------- node pointed to is next
- *        | +--------------------- the last arc of the node
- *        | | +------------------- this arc leads to a final state (acceptor)
- *        | | |  _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows
- *        | | | / | | | |
- *       +-+-+-+-+-+-+-+-+\
- *     0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label.
- *       +-+-+-+-+-+-+-+-+/
- *       +-+-+-+-+-+-+-+-+\
- *     1 | | | | | | | | | +------ optional label if M == 0
- *       +-+-+-+-+-+-+-+-+/
- *       : : : : : : : : :
- *       +-+-+-+-+-+-+-+-+\
- *       |A|A|A|A|A|A|A|A| +------ v-coded goto address
- *       +-+-+-+-+-+-+-+-+/
- * 
- */ -public final class CFSA2 extends FSA { - /** - * Automaton header version value. - */ - public static final byte VERSION = (byte) 0xc6; - - /** - * The target node of this arc follows the last arc of the current state - * (no goto field). - */ - public static final int BIT_TARGET_NEXT = 1 << 7; - - /** - * The arc is the last one from the current node's arcs list. - */ - public static final int BIT_LAST_ARC = 1 << 6; - - /** - * The arc corresponds to the last character of a sequence - * available when building the automaton (acceptor transition). - */ - public static final int BIT_FINAL_ARC = 1 << 5; - - /** - * The count of bits assigned to storing an indexed label. - */ - static final int LABEL_INDEX_BITS = 5; - - /** - * Masks only the M bits of a flag byte. - */ - static final int LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1; - - /** - * Maximum size of the labels index. - */ - static final int LABEL_INDEX_SIZE = (1 << LABEL_INDEX_BITS) - 1; - - /** - * An array of bytes with the internal representation of the automaton. - * Please see the documentation of this class for more information on how - * this structure is organized. - */ - public byte[] arcs; - - /** - * Flags for this automaton version. - */ - private final EnumSet flags; - - /** - * Label mapping for M-indexed labels. - */ - public final byte[] labelMapping; - - /** - * If true states are prepended with numbers. - */ - private final boolean hasNumbers; - - /** - * Epsilon node's offset. - */ - private final int epsilon = 0; - - /** - * Reads an automaton from a byte stream. - */ - public CFSA2(InputStream in) throws IOException { - // Read the header first. - if (FSAHeader.FSA_MAGIC != FileUtils.readInt(in)) - throw new IOException("Invalid file header magic bytes."); - - // Ensure we have the correct version. - final int version = FileUtils.readByte(in); - if (version != VERSION) { - throw new IOException("This class can only read FSA version: " + VERSION); - } - - // Read flags. 
- short flagBits = FileUtils.readShort(in); - flags = EnumSet.noneOf(FSAFlags.class); - for (FSAFlags f : FSAFlags.values()) { - if (FSAFlags.isSet(flagBits, f)) - flags.add(f); - } - - if (flagBits != FSAFlags.asShort(flags)) - throw new IOException("Unrecognized flags remained: 0x" + Integer.toHexString(flagBits)); - - this.hasNumbers = flags.contains(FSAFlags.NUMBERS); - - /* - * Read mapping dictionary. - */ - int labelMappingSize = FileUtils.readByte(in) & 0xff; - labelMapping = new byte[labelMappingSize]; - readFully(in, labelMapping); - - /* - * Read arcs' data. - */ - arcs = readFully(in); - } - - /** - * {@inheritDoc} - */ - @Override - public int getRootNode() { - // Skip dummy node marking terminating state. - return getDestinationNodeOffset(getFirstArc(epsilon)); - } - - /** - * {@inheritDoc} - */ - @Override - public final int getFirstArc(int node) { - if (hasNumbers) { - return skipVInt(node); - } else { - return node; - } - } - - /** - * {@inheritDoc} - */ - @Override - public final int getNextArc(int arc) { - if (isArcLast(arc)) - return 0; - else - return skipArc(arc); - } - - /** - * {@inheritDoc} - */ - @Override - public int getArc(int node, byte label) { - for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { - if (getArcLabel(arc) == label) - return arc; - } - - // An arc labeled with "label" not found. 
- return 0; - } - - /** - * {@inheritDoc} - */ - @Override - public int getEndNode(int arc) { - final int nodeOffset = getDestinationNodeOffset(arc); - assert nodeOffset != 0 : "Can't follow a terminal arc: " + arc; - assert nodeOffset < arcs.length : "Node out of bounds."; - return nodeOffset; - } - - /** - * {@inheritDoc} - */ - @Override - public byte getArcLabel(int arc) { - int index = arcs[arc] & LABEL_INDEX_MASK; - if (index > 0) { - return this.labelMapping[index]; - } else { - return arcs[arc + 1]; - } - } - - /** - * {@inheritDoc} - */ - @Override - public int getRightLanguageCount(int node) { - assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS."; - return readVInt(arcs, node); - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isArcFinal(int arc) { - return (arcs[arc] & BIT_FINAL_ARC) != 0; - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isArcTerminal(int arc) { - return (0 == getDestinationNodeOffset(arc)); - } - - /** - * Returns true if this arc has NEXT bit set. - * - * @see #BIT_LAST_ARC - */ - public boolean isArcLast(int arc) { - return (arcs[arc] & BIT_LAST_ARC) != 0; - } - - /** - * @see #BIT_TARGET_NEXT - */ - public boolean isNextSet(int arc) { - return (arcs[arc] & BIT_TARGET_NEXT) != 0; - } - - /** - * {@inheritDoc} - */ - public Set getFlags() { - return flags; - } - - /** - * Returns the address of the node pointed to by this arc. - */ - final int getDestinationNodeOffset(int arc) { - if (isNextSet(arc)) { - /* Follow until the last arc of this state. */ - while (!isArcLast(arc)) { - arc = getNextArc(arc); - } - - /* And return the byte right after it. */ - return skipArc(arc); - } else { - /* - * The destination node address is v-coded. v-code starts either - * at the next byte (label indexed) or after the next byte (label explicit). - */ - return readVInt(arcs, arc + ((arcs[arc] & LABEL_INDEX_MASK) == 0 ? 
2 : 1)); - } - } - - /** - * Read the arc's layout and skip as many bytes, as needed, to skip it. - */ - private int skipArc(int offset) { - int flag = arcs[offset++]; - - // Explicit label? - if ((flag & LABEL_INDEX_MASK) == 0) { - offset++; - } - - // Explicit goto? - if ((flag & BIT_TARGET_NEXT) == 0) { - offset = skipVInt(offset); - } - - assert offset < this.arcs.length; - return offset; - } - - /** - * Read a v-int. - */ - static int readVInt(byte[] array, int offset) { - byte b = array[offset]; - int value = b & 0x7F; - - for (int shift = 7; b < 0; shift += 7) { - b = array[++offset]; - value |= (b & 0x7F) << shift; - } - - return value; - } - - /** - * Write a v-int to a byte array. - */ - static int writeVInt(byte[] array, int offset, int value) { - assert value >= 0 : "Can't v-code negative ints."; - - while (value > 0x7F) { - array[offset++] = (byte) (0x80 | (value & 0x7F)); - value >>= 7; - } - array[offset++] = (byte) value; - - return offset; - } - - /** - * Return the byte-length of a v-coded int. - */ - static int vIntLength(int value) { - assert value >= 0 : "Can't v-code negative ints."; - - int bytes; - for (bytes = 1; value >= 0x80; bytes++) { - value >>= 7; - } - - return bytes; - } - - /** - * Skip a v-int. 
- */ - private int skipVInt(int offset) { - while (arcs[offset++] < 0); - return offset; - } -} \ No newline at end of file diff --git a/src/morfologik/fsa/CFSA2Serializer.java b/src/morfologik/fsa/CFSA2Serializer.java deleted file mode 100644 index 11c7b13..0000000 --- a/src/morfologik/fsa/CFSA2Serializer.java +++ /dev/null @@ -1,536 +0,0 @@ -package morfologik.fsa; - -import static morfologik.fsa.CFSA2.BIT_FINAL_ARC; -import static morfologik.fsa.CFSA2.BIT_LAST_ARC; -import static morfologik.fsa.CFSA2.BIT_TARGET_NEXT; -import static morfologik.fsa.FSAFlags.FLEXIBLE; -import static morfologik.fsa.FSAFlags.NEXTBIT; -import static morfologik.fsa.FSAFlags.NUMBERS; -import static morfologik.fsa.FSAFlags.STOPBIT; - -import java.io.IOException; -import java.io.OutputStream; -import java.util.ArrayDeque; -import java.util.Comparator; -import java.util.EnumSet; -import java.util.PriorityQueue; -import java.util.Set; - -import morfologik.fsa.FSAUtils.IntIntHolder; -import morfologik.tools.IMessageLogger; -import morfologik.util.FileUtils; - -import com.carrotsearch.hppc.BitSet; -import com.carrotsearch.hppc.BoundedProportionalArraySizingStrategy; -import com.carrotsearch.hppc.IntArrayList; -import com.carrotsearch.hppc.IntIntOpenHashMap; -import com.carrotsearch.hppc.IntStack; -import com.carrotsearch.hppc.cursors.IntCursor; -import com.carrotsearch.hppc.cursors.IntIntCursor; - -/** - * Serializes in-memory {@link FSA} graphs to {@link CFSA2}. - * - *

- * It is possible to serialize the automaton with numbers required for perfect - * hashing. See {@link #withNumbers()} method. - *

- * - * @see CFSA2 - * @see FSA#read(java.io.InputStream) - */ -public final class CFSA2Serializer implements FSASerializer { - /** - * Supported flags. - */ - private final static EnumSet flags = EnumSet.of(NUMBERS, FLEXIBLE, STOPBIT, NEXTBIT); - - /** - * No-state id. - */ - private final static int NO_STATE = -1; - - /** - * true if we should serialize with numbers. - * - * @see #withNumbers() - */ - private boolean withNumbers; - - /** - * A hash map of [state, offset] pairs. - */ - private IntIntOpenHashMap offsets = new IntIntOpenHashMap(); - - /** - * A hash map of [state, right-language-count] pairs. - */ - private IntIntOpenHashMap numbers = new IntIntOpenHashMap(); - - /** - * Scratch array for serializing vints. - */ - private final byte [] scratch = new byte [5]; - - /** - * The most frequent labels for integrating with the flags field. - */ - private byte [] labelsIndex; - - /** - * Inverted index of labels to be integrated with flags field. A label - * at index i has the index or zero (no integration). - */ - private int [] labelsInvIndex; - - /** - * Logger for progress. - */ - private IMessageLogger logger = new NullMessageLogger(); - - /** - * Serialize the automaton with the number of right-language sequences in - * each node. This is required to implement perfect hashing. The numbering - * also preserves the order of input sequences. - * - * @return Returns the same object for easier call chaining. - */ - public CFSA2Serializer withNumbers() { - withNumbers = true; - return this; - } - - /** - * Serializes any {@link FSA} to {@link CFSA2} stream. - * - * @see #withNumbers - * @return Returns os for chaining. - */ - @Override - public T serialize(final FSA fsa, T os) throws IOException { - /* - * Calculate the most frequent labels and build indexed labels dictionary. - */ - computeLabelsIndex(fsa); - - /* - * Calculate the number of bytes required for the node data, if - * serializing with numbers. 
- */ - if (withNumbers) { - this.numbers = FSAUtils.rightLanguageForAllStates(fsa); - } - - /* - * Linearize all the states, optimizing their layout. - */ - IntArrayList linearized = linearize(fsa); - - /* - * Emit the header. - */ - FileUtils.writeInt(os, FSAHeader.FSA_MAGIC); - os.write(CFSA2.VERSION); - - EnumSet fsaFlags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); - if (withNumbers) fsaFlags.add(NUMBERS); - FileUtils.writeShort(os, FSAFlags.asShort(fsaFlags)); - - /* - * Emit labels index. - */ - os.write(labelsIndex.length); - os.write(labelsIndex); - - /* - * Emit the automaton. - */ - int size = emitNodes(fsa, os, linearized); - assert size == 0 : "Size changed in the final pass?"; - - return os; - } - - /** - * Compute a set of labels to be integrated with the flags field. - */ - private void computeLabelsIndex(final FSA fsa) { - // Compute labels count. - final int [] countByValue = new int [256]; - - fsa.visitAllStates(new StateVisitor() { - public boolean accept(int state) { - for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) - countByValue[fsa.getArcLabel(arc) & 0xff]++; - return true; - } - }); - - // Order by descending frequency of counts and increasing label value. 
- Comparator comparator = new Comparator() { - public int compare(IntIntHolder o1, IntIntHolder o2) { - int countDiff = o2.b - o1.b; - if (countDiff == 0) { - countDiff = o1.a - o2.a; - } - return countDiff; - } - }; - - PriorityQueue labelAndCount = new PriorityQueue(countByValue.length, comparator); - for (int label = 0; label < countByValue.length; label++) { - if (countByValue[label] > 0) { - labelAndCount.add(new IntIntHolder(label, countByValue[label])); - } - } - - labelsIndex = new byte [1 + Math.min(labelAndCount.size(), CFSA2.LABEL_INDEX_SIZE)]; - labelsInvIndex = new int [256]; - for (int i = labelsIndex.length - 1; i > 0 && !labelAndCount.isEmpty(); i--) { - IntIntHolder p = labelAndCount.remove(); - labelsInvIndex[p.a] = i; - labelsIndex[i] = (byte) p.a; - } - } - - /** - * Return supported flags. - */ - @Override - public Set getFlags() { - return flags; - } - - /** - * Linearization of states. - */ - private IntArrayList linearize(final FSA fsa) throws IOException { - /* - * Compute the states with most inlinks. These should be placed as close to the - * start of the automaton, as possible so that v-coded addresses are tiny. - */ - final IntIntOpenHashMap inlinkCount = computeInlinkCount(fsa); - - /* - * An array of ordered states for serialization. - */ - final IntArrayList linearized = new IntArrayList(0, - new BoundedProportionalArraySizingStrategy(1000, 10000, 1.5f)); - - /* - * Determine which states should be linearized first (at fixed positions) so as to - * minimize the place occupied by goto fields. - */ - int maxStates = Integer.MAX_VALUE; - int minInlinkCount = 2; - ArrayDeque statesQueue = computeFirstStates(inlinkCount, maxStates, minInlinkCount); - IntArrayList states = new IntArrayList(); - while (!statesQueue.isEmpty()) - states.add(statesQueue.pop()); - - /* - * Compute initial addresses, without node rearrangements. 
- */ - int serializedSize = linearizeAndCalculateOffsets(fsa, new IntArrayList(), linearized, offsets); - - /* - * Probe for better node arrangements by selecting between [lower, upper] - * nodes from the potential candidate nodes list. - */ - IntArrayList sublist = new IntArrayList(); - sublist.buffer = states.buffer; - sublist.elementsCount = states.elementsCount; - - /* - * Probe the initial region a little bit, looking for optimal cut. It can't be binary search - * because the result isn't monotonic. - */ - logger.startPart("Compacting"); - logger.log("Initial output size", serializedSize); - int cutAt = 0; - for (int cut = Math.min(25, states.size()); cut <= Math.min(150, states.size()); cut += 25) { - sublist.elementsCount = cut; - int newSize = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets); - logger.log("Moved " + sublist.size() + " states, output size", newSize); - if (newSize >= serializedSize) { - break; - } - cutAt = cut; - } - - /* - * Cut at the calculated point and repeat linearization. - */ - sublist.elementsCount = cutAt; - int size = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets); - - logger.log("Will move " + sublist.size() + " states, final size", size); - logger.endPart(); - - return linearized; - } - - /** - * Linearize all states, putting states in front of the automaton and - * calculating stable state offsets. - */ - private int linearizeAndCalculateOffsets(FSA fsa, IntArrayList states, - IntArrayList linearized, IntIntOpenHashMap offsets) throws IOException - { - final BitSet visited = new BitSet(); - final IntStack nodes = new IntStack(); - linearized.clear(); - - /* - * Linearize states with most inlinks first. - */ - for (int i = 0; i < states.size(); i++) { - linearizeState(fsa, nodes, linearized, visited, states.get(i)); - } - - /* - * Linearize the remaining states by chaining them one after another, in depth-order. 
- */ - nodes.push(fsa.getRootNode()); - while (!nodes.isEmpty()) { - final int node = nodes.pop(); - if (visited.get(node)) - continue; - - linearizeState(fsa, nodes, linearized, visited, node); - } - - /* - * Calculate new state offsets. This is iterative. We start with - * maximum potential offsets and recalculate until converged. - */ - int MAX_OFFSET = Integer.MAX_VALUE; - for (IntCursor c : linearized) { - offsets.put(c.value, MAX_OFFSET); - } - - int i, j = 0; - while ((i = emitNodes(fsa, null, linearized)) > 0) { - j = i; - } - return j; - } - - /** - * Add a state to linearized list. - */ - private void linearizeState(final FSA fsa, - IntStack nodes, - IntArrayList linearized, - BitSet visited, int node) - { - linearized.add(node); - visited.set(node); - for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { - if (!fsa.isArcTerminal(arc)) { - final int target = fsa.getEndNode(arc); - if (!visited.get(target)) - nodes.push(target); - } - } - } - - /** - * Compute the set of states that should be linearized first to minimize other - * states goto length. - */ - private ArrayDeque computeFirstStates(IntIntOpenHashMap inlinkCount, - int maxStates, - int minInlinkCount) - { - Comparator comparator = new Comparator() { - public int compare(IntIntHolder o1, IntIntHolder o2) { - int v = o1.a - o2.a; - return v == 0 ? 
(o1.b - o2.b) : v; - } - }; - - PriorityQueue stateInlink = new PriorityQueue(1, comparator); - IntIntHolder scratch = new IntIntHolder(); - for (IntIntCursor c : inlinkCount) { - if (c.value > minInlinkCount) { - scratch.a = c.value; - scratch.b = c.key; - - if (stateInlink.size() < maxStates || comparator.compare(scratch, stateInlink.peek()) > 0) { - stateInlink.add(new IntIntHolder(c.value, c.key)); - if (stateInlink.size() > maxStates) stateInlink.remove(); - } - } - } - - ArrayDeque states = new ArrayDeque(); - while (!stateInlink.isEmpty()) { - IntIntHolder i = stateInlink.remove(); - states.addFirst(i.b); - } - return states; - } - - /** - * Compute in-link count for each state. - */ - private IntIntOpenHashMap computeInlinkCount(final FSA fsa) { - IntIntOpenHashMap inlinkCount = new IntIntOpenHashMap(); - BitSet visited = new BitSet(); - IntStack nodes = new IntStack(); - nodes.push(fsa.getRootNode()); - - while (!nodes.isEmpty()) { - final int node = nodes.pop(); - if (visited.get(node)) - continue; - - visited.set(node); - - for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { - if (!fsa.isArcTerminal(arc)) { - final int target = fsa.getEndNode(arc); - inlinkCount.putOrAdd(target, 1, 1); - if (!visited.get(target)) - nodes.push(target); - } - } - } - - return inlinkCount; - } - - /** - * Update arc offsets assuming the given goto length. - */ - private int emitNodes(FSA fsa, OutputStream os, IntArrayList linearized) throws IOException { - int offset = 0; - - // Add epsilon state. - offset += emitNodeData(os, 0); - if (fsa.getRootNode() != 0) - offset += emitArc(os, BIT_LAST_ARC, (byte) '^', offsets.get(fsa.getRootNode())); - else - offset += emitArc(os, BIT_LAST_ARC, (byte) '^', 0); - - boolean offsetsChanged = false; - final int max = linearized.size(); - for (IntCursor c : linearized) { - final int state = c.value; - final int nextState = c.index + 1 < max ? 
linearized.get(c.index + 1) : NO_STATE; - - if (os == null) { - offsetsChanged |= (offsets.get(state) != offset); - offsets.put(state, offset); - } else { - assert offsets.get(state) == offset : state + " " + offsets.get(state) + " " + offset; - } - - offset += emitNodeData(os, withNumbers ? numbers.get(state) : 0); - offset += emitNodeArcs(fsa, os, state, nextState); - } - - return offsetsChanged ? offset : 0; - } - - /** - * Emit all arcs of a single node. - */ - private int emitNodeArcs(FSA fsa, OutputStream os, - final int state, final int nextState) throws IOException { - int offset = 0; - for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { - int targetOffset; - final int target; - - if (fsa.isArcTerminal(arc)) { - target = 0; - targetOffset = 0; - } else { - target = fsa.getEndNode(arc); - targetOffset = offsets.get(target); - } - - int flags = 0; - - if (fsa.isArcFinal(arc)) { - flags |= BIT_FINAL_ARC; - } - - if (fsa.getNextArc(arc) == 0) { - flags |= BIT_LAST_ARC; - } - - if (targetOffset != 0 && target == nextState) { - flags |= BIT_TARGET_NEXT; - targetOffset = 0; - } - - offset += emitArc(os, flags, fsa.getArcLabel(arc), targetOffset); - } - - return offset; - } - - /** */ - private int emitArc(OutputStream os, int flags, byte label, int targetOffset) - throws IOException - { - int length = 0; - - int labelIndex = labelsInvIndex[label & 0xff]; - if (labelIndex > 0) { - if (os != null) os.write(flags | labelIndex); - length++; - } else { - if (os != null) { - os.write(flags); - os.write(label); - } - length += 2; - } - - if ((flags & BIT_TARGET_NEXT) == 0) { - int len = CFSA2.writeVInt(scratch, 0, targetOffset); - if (os != null) { - os.write(scratch, 0, len); - } - length += len; - } - - return length; - } - - /** */ - private int emitNodeData(OutputStream os, int number) throws IOException { - int size = 0; - - if (withNumbers) { - size = CFSA2.writeVInt(scratch, 0, number); - if (os != null) { - os.write(scratch, 0, size); - 
} - } - - return size; - } - - /** */ - @Override - public CFSA2Serializer withFiller(byte filler) { - throw new UnsupportedOperationException("CFSA2 does not support filler. Use .info file."); - } - - /** */ - @Override - public CFSA2Serializer withAnnotationSeparator(byte annotationSeparator) { - throw new UnsupportedOperationException("CFSA2 does not support separator. Use .info file."); - } - - @Override - public CFSA2Serializer withLogger(IMessageLogger logger) { - this.logger = logger; - return this; - } -} diff --git a/src/morfologik/fsa/ConstantArcSizeFSA.java b/src/morfologik/fsa/ConstantArcSizeFSA.java deleted file mode 100644 index 2f6d412..0000000 --- a/src/morfologik/fsa/ConstantArcSizeFSA.java +++ /dev/null @@ -1,134 +0,0 @@ -package morfologik.fsa; - -import java.util.Collections; -import java.util.Set; - -/** - * An FSA with constant-size arc representation produced directly - * by {@link FSABuilder}. - * - * @see FSABuilder - */ -public final class ConstantArcSizeFSA extends FSA { - /** Size of the target address field (constant for the builder). */ - public final static int TARGET_ADDRESS_SIZE = 4; - - /** Size of the flags field (constant for the builder). */ - public final static int FLAGS_SIZE = 1; - - /** Size of the label field (constant for the builder). */ - public final static int LABEL_SIZE = 1; - - /** - * Size of a single arc structure. - */ - public final static int ARC_SIZE = FLAGS_SIZE + LABEL_SIZE + TARGET_ADDRESS_SIZE; - - /** Offset of the flags field inside an arc. */ - public final static int FLAGS_OFFSET = 0; - - /** Offset of the label field inside an arc. */ - public final static int LABEL_OFFSET = FLAGS_SIZE; - - /** Offset of the address field inside an arc. */ - public final static int ADDRESS_OFFSET = LABEL_OFFSET + LABEL_SIZE; - - /** A dummy address of the terminal state. */ - final static int TERMINAL_STATE = 0; - - /** - * An arc flag indicating the target node of an arc corresponds to a final - * state. 
- */ - public final static int BIT_ARC_FINAL = 1 << 1; - - /** An arc flag indicating the arc is last within its state. */ - public final static int BIT_ARC_LAST = 1 << 0; - - /** - * An epsilon state. The first and only arc of this state points either - * to the root or to the terminal state, indicating an empty automaton. - */ - private final int epsilon; - - /** - * FSA data, serialized as a byte array. - */ - private final byte[] data; - - /** - * @param data FSA data. There must be no trailing bytes after the last state. - */ - ConstantArcSizeFSA(byte[] data, int epsilon) { - assert epsilon == 0 : "Epsilon is not zero?"; - - this.epsilon = epsilon; - this.data = data; - } - - @Override - public int getRootNode() { - return getEndNode(getFirstArc(epsilon)); - } - - @Override - public int getFirstArc(int node) { - return node; - } - - @Override - public int getArc(int node, byte label) { - for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { - if (getArcLabel(arc) == label) - return arc; - } - return 0; - } - - @Override - public int getNextArc(int arc) { - if (isArcLast(arc)) - return 0; - return arc + ARC_SIZE; - } - - @Override - public byte getArcLabel(int arc) { - return data[arc + LABEL_OFFSET]; - } - - /** - * Fills the target state address of an arc. 
- */ - private int getArcTarget(int arc) { - arc += ADDRESS_OFFSET; - return (data[arc]) << 24 | - (data[arc + 1] & 0xff) << 16 | - (data[arc + 2] & 0xff) << 8 | - (data[arc + 3] & 0xff); - } - - @Override - public boolean isArcFinal(int arc) { - return (data[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0; - } - - @Override - public boolean isArcTerminal(int arc) { - return getArcTarget(arc) == 0; - } - - private boolean isArcLast(int arc) { - return (data[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0; - } - - @Override - public int getEndNode(int arc) { - return getArcTarget(arc); - } - - @Override - public Set getFlags() { - return Collections.emptySet(); - } -} diff --git a/src/morfologik/fsa/FSA.java b/src/morfologik/fsa/FSA.java deleted file mode 100644 index 28b44a2..0000000 --- a/src/morfologik/fsa/FSA.java +++ /dev/null @@ -1,270 +0,0 @@ -package morfologik.fsa; - -import java.io.*; -import java.nio.ByteBuffer; -import java.util.*; - -/** - * This is a top abstract class for handling finite state automata. These - * automata are arc-based, a design described in Jan Daciuk's Incremental - * Construction of Finite-State Automata and Transducers, and Their Use in the - * Natural Language Processing (PhD thesis, Technical University of Gdansk). - * - *

- * Concrete subclasses (implementations) provide varying tradeoffs and features: - * traversal speed vs. memory size, for example. - *

- * - * @see FSABuilder - */ -public abstract class FSA implements Iterable { - /** - * @return Returns the identifier of the root node of this automaton. - * Returns 0 if the start node is also the end node (the automaton - * is empty). - */ - public abstract int getRootNode(); - - /** - * @return Returns the identifier of the first arc leaving node - * or 0 if the node has no outgoing arcs. - */ - public abstract int getFirstArc(int node); - - /** - * @return Returns the identifier of the next arc after arc and - * leaving node. Zero is returned if no more arcs are - * available for the node. - */ - public abstract int getNextArc(int arc); - - /** - * @return Returns the identifier of an arc leaving node and - * labeled with label. An identifier equal to 0 means - * the node has no outgoing arc labeled label. - */ - public abstract int getArc(int node, byte label); - - /** - * Return the label associated with a given arc. - */ - public abstract byte getArcLabel(int arc); - - /** - * Returns true if the destination node at the end of this - * arc corresponds to an input sequence created when building - * this automaton. - */ - public abstract boolean isArcFinal(int arc); - - /** - * Returns true if this arc does not have a - * terminating node (@link {@link #getEndNode(int)} will throw an - * exception). Implies {@link #isArcFinal(int)}. - */ - public abstract boolean isArcTerminal(int arc); - - /** - * Return the end node pointed to by a given arc. Terminal arcs - * (those that point to a terminal state) have no end node representation - * and throw a runtime exception. - */ - public abstract int getEndNode(int arc); - - /** - * Returns a set of flags for this FSA instance. - */ - public abstract Set getFlags(); - - /** - * Calculates the number of arcs of a given node. Unless really required, - * use the following idiom for looping through all arcs: - *
-     * for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
-     * }
-     * 
- */ - public int getArcCount(int node) { - int count = 0; - for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { - count++; - } - return count; - } - - /** - * @return Returns the number of sequences reachable from the given state if - * the automaton was compiled with {@link FSAFlags#NUMBERS}. The size of - * the right language of the state, in other words. - * - * @throws UnsupportedOperationException If the automaton was not compiled with - * {@link FSAFlags#NUMBERS}. The value can then be computed by manual count - * of {@link #getSequences(int)}. - */ - public int getRightLanguageCount(int node) { - throw new UnsupportedOperationException("Automaton not compiled with " + FSAFlags.NUMBERS); - } - - /** - * Returns an iterator over all binary sequences starting at the given FSA - * state (node) and ending in final nodes. This corresponds to a set of - * suffixes of a given prefix from all sequences stored in the automaton. - * - *

- * The returned iterator is a {@link ByteBuffer} whose contents changes on - * each call to {@link Iterator#next()}. The keep the contents between calls - * to {@link Iterator#next()}, one must copy the buffer to some other - * location. - *

- * - *

- * Important. It is guaranteed that the returned byte buffer is - * backed by a byte array and that the content of the byte buffer starts at - * the array's index 0. - *

- * - * @see Iterable - */ - public Iterable getSequences(final int node) { - if (node == 0) { - return Collections. emptyList(); - } - - return new Iterable() { - public Iterator iterator() { - return new FSAFinalStatesIterator(FSA.this, node); - } - }; - } - - /** - * An alias of calling {@link #iterator} directly ({@link FSA} is also - * {@link Iterable}). - */ - public final Iterable getSequences() { - return getSequences(getRootNode()); - } - - /** - * Returns an iterator over all binary sequences starting from the initial - * FSA state (node) and ending in final nodes. The returned iterator is a - * {@link ByteBuffer} whose contents changes on each call to - * {@link Iterator#next()}. The keep the contents between calls to - * {@link Iterator#next()}, one must copy the buffer to some other location. - * - *

- * Important. It is guaranteed that the returned byte buffer is - * backed by a byte array and that the content of the byte buffer starts at - * the array's index 0. - *

- * - * @see Iterable - */ - public final Iterator iterator() { - return getSequences().iterator(); - } - - /** - * Visit all states. The order of visiting is undefined. This method may be faster - * than traversing the automaton in post or preorder since it can scan states - * linearly. Returning false from {@link StateVisitor#accept(int)} - * immediately terminates the traversal. - */ - public T visitAllStates(T v) { - return visitInPostOrder(v); - } - - /** - * Same as {@link #visitInPostOrder(StateVisitor, int)}, - * starting from root automaton node. - */ - public T visitInPostOrder(T v) { - return visitInPostOrder(v, getRootNode()); - } - - /** - * Visits all states reachable from node in postorder. - * Returning false from {@link StateVisitor#accept(int)} - * immediately terminates the traversal. - */ - public T visitInPostOrder(T v, int node) { - visitInPostOrder(v, node, new BitSet()); - return v; - } - - /** Private recursion. */ - private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) { - if (visited.get(node)) - return true; - visited.set(node); - - for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { - if (!isArcTerminal(arc)) { - if (!visitInPostOrder(v, getEndNode(arc), visited)) - return false; - } - } - - return v.accept(node); - } - - /** - * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from root automaton node. - */ - public T visitInPreOrder(T v) { - return visitInPreOrder(v, getRootNode()); - } - - /** - * Visits all states in preorder. Returning false from {@link StateVisitor#accept(int)} - * skips traversal of all sub-states of a given state. - */ - public T visitInPreOrder(T v, int node) { - visitInPreOrder(v, node, new BitSet()); - return v; - } - - /** Private recursion. 
*/ - private void visitInPreOrder(StateVisitor v, int node, BitSet visited) { - if (visited.get(node)) - return; - visited.set(node); - - if (v.accept(node)) { - for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { - if (!isArcTerminal(arc)) { - visitInPreOrder(v, getEndNode(arc), visited); - } - } - } - } - - /** - * A factory for reading automata in any of the supported versions. If - * possible, explicit constructors should be used. - * - * @see FSA5#FSA5(InputStream) - */ - @SuppressWarnings("unchecked") - public static T read(InputStream in) throws IOException { - if (!in.markSupported()) { - in = new BufferedInputStream(in, Math.max(FSAHeader.MAX_HEADER_LENGTH + 1, 1024)); - } - - in.mark(FSAHeader.MAX_HEADER_LENGTH); - FSAHeader header = FSAHeader.read(in); - in.reset(); - - if (header.version == FSA5.VERSION) - return (T) new FSA5(in); - - if (header.version == CFSA.VERSION) - return (T) new CFSA(in); - - if (header.version == CFSA2.VERSION) - return (T) new CFSA2(in); - - throw new IOException("Unsupported automaton version: " - + header.version); - } -} \ No newline at end of file diff --git a/src/morfologik/fsa/FSA5.java b/src/morfologik/fsa/FSA5.java deleted file mode 100644 index d43f4d8..0000000 --- a/src/morfologik/fsa/FSA5.java +++ /dev/null @@ -1,323 +0,0 @@ -package morfologik.fsa; - -import static morfologik.fsa.FSAFlags.FLEXIBLE; -import static morfologik.fsa.FSAFlags.NEXTBIT; -import static morfologik.fsa.FSAFlags.NUMBERS; -import static morfologik.fsa.FSAFlags.STOPBIT; -import static morfologik.util.FileUtils.readFully; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Collections; -import java.util.EnumSet; -import java.util.Set; - -/** - * FSA binary format implementation for version 5. - * - *

- * Version 5 indicates the dictionary was built with these flags: - * {@link FSAFlags#FLEXIBLE}, {@link FSAFlags#STOPBIT} and - * {@link FSAFlags#NEXTBIT}. The internal representation of the FSA must - * therefore follow this description (please note this format describes only a - * single transition (arc), not the entire dictionary file). - * - *

- * ---- this node header present only if automaton was compiled with NUMBERS option.
- * Byte
- *        +-+-+-+-+-+-+-+-+\
- *      0 | | | | | | | | | \  LSB
- *        +-+-+-+-+-+-+-+-+  +
- *      1 | | | | | | | | |  |      number of strings recognized
- *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
- *        : : : : : : : : :  |      from this node.
- *        +-+-+-+-+-+-+-+-+  +
- *  ctl-1 | | | | | | | | | /  MSB
- *        +-+-+-+-+-+-+-+-+/
- *        
- * ---- remaining part of the node
- * 
- * Byte
- *       +-+-+-+-+-+-+-+-+\
- *     0 | | | | | | | | | +------ label
- *       +-+-+-+-+-+-+-+-+/
- * 
- *                  +------------- node pointed to is next
- *                  | +----------- the last arc of the node
- *                  | | +--------- the arc is final
- *                  | | |
- *             +-----------+
- *             |    | | |  |
- *         ___+___  | | |  |
- *        /       \ | | |  |
- *       MSB           LSB |
- *        7 6 5 4 3 2 1 0  |
- *       +-+-+-+-+-+-+-+-+ |
- *     1 | | | | | | | | | \ \
- *       +-+-+-+-+-+-+-+-+  \ \  LSB
- *       +-+-+-+-+-+-+-+-+     +
- *     2 | | | | | | | | |     |
- *       +-+-+-+-+-+-+-+-+     |
- *     3 | | | | | | | | |     +----- target node address (in bytes)
- *       +-+-+-+-+-+-+-+-+     |      (not present except for the byte
- *       : : : : : : : : :     |       with flags if the node pointed to
- *       +-+-+-+-+-+-+-+-+     +       is next)
- *   gtl | | | | | | | | |    /  MSB
- *       +-+-+-+-+-+-+-+-+   /
- * gtl+1                           (gtl = gotoLength)
- * 
- */ -public final class FSA5 extends FSA { - /** - * Default filler byte. - */ - public final static byte DEFAULT_FILLER = '_'; - - /** - * Default annotation byte. - */ - public final static byte DEFAULT_ANNOTATION = '+'; - - /** - * Automaton version as in the file header. - */ - public static final byte VERSION = 5; - - /** - * Bit indicating that an arc corresponds to the last character of a - * sequence available when building the automaton. - */ - public static final int BIT_FINAL_ARC = 1 << 0; - - /** - * Bit indicating that an arc is the last one of the node's list and the - * following one belongs to another node. - */ - public static final int BIT_LAST_ARC = 1 << 1; - - /** - * Bit indicating that the target node of this arc follows it in the - * compressed automaton structure (no goto field). - */ - public static final int BIT_TARGET_NEXT = 1 << 2; - - /** - * An offset in the arc structure, where the address and flags field begins. - * In version 5 of FSA automata, this value is constant (1, skip label). - */ - public final static int ADDRESS_OFFSET = 1; - - /** - * An array of bytes with the internal representation of the automaton. - * Please see the documentation of this class for more information on how - * this structure is organized. - */ - public final byte[] arcs; - - /** - * The length of the node header structure (if the automaton was compiled with - * NUMBERS option). Otherwise zero. - */ - public final int nodeDataLength; - - /** - * Flags for this automaton version. - */ - private final Set flags; - - /** - * Number of bytes each address takes in full, expanded form (goto length). - */ - public final int gtl; - - /** Filler character. */ - public final byte filler; - - /** Annotation character. */ - public final byte annotation; - - /** - * Read and wrap a binary automaton in FSA version 5. - */ - public FSA5(InputStream fsaStream) throws IOException { - // Read the header first. 
- final FSAHeader header = FSAHeader.read(fsaStream); - - // Ensure we have version 5. - if (header.version != VERSION) { - throw new IOException("This class can read FSA version 5 only: " + header.version); - } - - /* - * Determine if the automaton was compiled with NUMBERS. If so, modify - * ctl and goto fields accordingly. - */ - flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); - if ((header.gtl & 0xf0) != 0) { - flags.add(NUMBERS); - } - - this.nodeDataLength = (header.gtl >>> 4) & 0x0f; - this.gtl = header.gtl & 0x0f; - - this.filler = header.filler; - this.annotation = header.annotation; - - arcs = readFully(fsaStream); - } - - /** - * Returns the start node of this automaton. - */ - @Override - public int getRootNode() { - // Skip dummy node marking terminating state. - final int epsilonNode = skipArc(getFirstArc(0)); - - // And follow the epsilon node's first (and only) arc. - return getDestinationNodeOffset(getFirstArc(epsilonNode)); - } - - /** - * {@inheritDoc} - */ - @Override - public final int getFirstArc(int node) { - return nodeDataLength + node; - } - - /** - * {@inheritDoc} - */ - @Override - public final int getNextArc(int arc) { - if (isArcLast(arc)) - return 0; - else - return skipArc(arc); - } - - /** - * {@inheritDoc} - */ - @Override - public int getArc(int node, byte label) { - for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { - if (getArcLabel(arc) == label) - return arc; - } - - // An arc labeled with "label" not found. 
- return 0; - } - - /** - * {@inheritDoc} - */ - @Override - public int getEndNode(int arc) { - final int nodeOffset = getDestinationNodeOffset(arc); - assert nodeOffset != 0 : "No target node for terminal arcs."; - return nodeOffset; - } - - /** - * {@inheritDoc} - */ - @Override - public byte getArcLabel(int arc) { - return arcs[arc]; - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isArcFinal(int arc) { - return (arcs[arc + ADDRESS_OFFSET] & BIT_FINAL_ARC) != 0; - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isArcTerminal(int arc) { - return (0 == getDestinationNodeOffset(arc)); - } - - /** - * Returns the number encoded at the given node. The number equals the count - * of the set of suffixes reachable from node (called its right - * language). - */ - @Override - public int getRightLanguageCount(int node) { - assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS."; - return decodeFromBytes(arcs, node, nodeDataLength); - } - - /** - * {@inheritDoc} - * - *

For this automaton version, an additional {@link FSAFlags#NUMBERS} flag - * may be set to indicate the automaton contains extra fields for each node.

- */ - @Override - public Set getFlags() { - return Collections.unmodifiableSet(flags); - } - - /** - * Returns true if this arc has LAST bit set. - * - * @see #BIT_LAST_ARC - */ - public boolean isArcLast(int arc) { - return (arcs[arc + ADDRESS_OFFSET] & BIT_LAST_ARC) != 0; - } - - /** - * @see #BIT_TARGET_NEXT - */ - public boolean isNextSet(int arc) { - return (arcs[arc + ADDRESS_OFFSET] & BIT_TARGET_NEXT) != 0; - } - - /** - * Returns an n-byte integer encoded in byte-packed representation. - */ - static final int decodeFromBytes( - final byte[] arcs, final int start, final int n) - { - int r = 0; - for (int i = n; --i >= 0;) { - r = r << 8 | (arcs[start + i] & 0xff); - } - return r; - } - - /** - * Returns the address of the node pointed to by this arc. - */ - final int getDestinationNodeOffset(int arc) { - if (isNextSet(arc)) { - /* The destination node follows this arc in the array. */ - return skipArc(arc); - } else { - /* - * The destination node address has to be extracted from the arc's - * goto field. - */ - return decodeFromBytes(arcs, arc + ADDRESS_OFFSET, gtl) >>> 3; - } - } - - /** - * Read the arc's layout and skip as many bytes, as needed. - */ - private int skipArc(int offset) { - return offset + (isNextSet(offset) - ? 
1 + 1 /* label + flags */ - : 1 + gtl /* label + flags/address */); - } -} \ No newline at end of file diff --git a/src/morfologik/fsa/FSA5Serializer.java b/src/morfologik/fsa/FSA5Serializer.java deleted file mode 100644 index be017a4..0000000 --- a/src/morfologik/fsa/FSA5Serializer.java +++ /dev/null @@ -1,334 +0,0 @@ -package morfologik.fsa; - -import java.io.IOException; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.util.*; - -import morfologik.tools.IMessageLogger; - -import com.carrotsearch.hppc.*; -import com.carrotsearch.hppc.BitSet; - -import static morfologik.fsa.FSAFlags.*; - -/** - * Serializes in-memory {@link FSA} graphs to a binary format compatible with - * Jan Daciuk's fsa's package FSA5 format. - * - *

- * It is possible to serialize the automaton with numbers required for perfect - * hashing. See {@link #withNumbers()} method. - *

- * - * @see FSA5 - * @see FSA#read(java.io.InputStream) - */ -public final class FSA5Serializer implements FSASerializer { - /** - * Maximum number of bytes for a serialized arc. - */ - private final static int MAX_ARC_SIZE = 1 + 5; - - /** - * Maximum number of bytes for per-node data. - */ - private final static int MAX_NODE_DATA_SIZE = 16; - - /** - * Number of bytes for the arc's flags header (arc representation without - * the goto address). - */ - private final static int SIZEOF_FLAGS = 1; - - /** - * Supported flags. - */ - private final static EnumSet flags = EnumSet.of(NUMBERS, SEPARATORS, FLEXIBLE, STOPBIT, NEXTBIT); - - /** - * @see FSA5#filler - */ - public byte fillerByte = FSA5.DEFAULT_FILLER; - - /** - * @see FSA5#annotation - */ - public byte annotationByte = FSA5.DEFAULT_ANNOTATION; - - /** - * true if we should serialize with numbers. - * - * @see #withNumbers() - */ - private boolean withNumbers; - - /** - * A hash map of [state, offset] pairs. - */ - private IntIntOpenHashMap offsets = new IntIntOpenHashMap(); - - /** - * A hash map of [state, right-language-count] pairs. - */ - private IntIntOpenHashMap numbers = new IntIntOpenHashMap(); - - /** - * Serialize the automaton with the number of right-language sequences in - * each node. This is required to implement perfect hashing. The numbering - * also preserves the order of input sequences. - * - * @return Returns the same object for easier call chaining. 
- */ - public FSA5Serializer withNumbers() { - withNumbers = true; - return this; - } - - /** - * {@inheritDoc} - */ - @Override - public FSA5Serializer withFiller(byte filler) { - this.fillerByte = filler; - return this; - } - - /** - * {@inheritDoc} - */ - @Override - public FSA5Serializer withAnnotationSeparator(byte annotationSeparator) { - this.annotationByte = annotationSeparator; - return this; - } - - /** - * {@inheritDoc} - */ - @Override - public FSASerializer withLogger(IMessageLogger logger) { - return this; - } - - /** - * Serialize root state s to an output stream in - * FSA5 format. - * - * @see #withNumbers - * @return Returns os for chaining. - */ - @Override - public T serialize(final FSA fsa, T os) - throws IOException { - - // Prepare space for arc offsets and linearize all the states. - int[] linearized = linearize(fsa); - - /* - * Calculate the number of bytes required for the node data, if - * serializing with numbers. - */ - int nodeDataLength = 0; - if (withNumbers) { - this.numbers = FSAUtils.rightLanguageForAllStates(fsa); - int maxNumber = numbers.get(fsa.getRootNode()); - while (maxNumber > 0) { - nodeDataLength++; - maxNumber >>>= 8; - } - } - - // Calculate minimal goto length. - int gtl = 1; - while (true) { - // First pass: calculate offsets of states. - if (!emitArcs(fsa, null, linearized, gtl, nodeDataLength)) { - gtl++; - continue; - } - - // Second pass: check if goto overflows anywhere. - if (emitArcs(fsa, null, linearized, gtl, nodeDataLength)) - break; - - gtl++; - } - - /* - * Emit the header. - */ - os.write(new byte[] { '\\', 'f', 's', 'a' }); - os.write(FSA5.VERSION); - os.write(fillerByte); - os.write(annotationByte); - os.write((nodeDataLength << 4) | gtl); - - /* - * Emit the automaton. - */ - boolean gtlUnchanged = emitArcs(fsa, os, linearized, gtl, nodeDataLength); - assert gtlUnchanged : "gtl changed in the final pass."; - - return os; - } - - /** - * Return supported flags. 
- */ - @Override - public Set getFlags() { - return flags; - } - - /** - * Linearization of states. - */ - private int[] linearize(final FSA fsa) { - int[] linearized = new int[0]; - int last = 0; - - BitSet visited = new BitSet(); - IntStack nodes = new IntStack(); - nodes.push(fsa.getRootNode()); - - while (!nodes.isEmpty()) { - final int node = nodes.pop(); - if (visited.get(node)) - continue; - - if (last >= linearized.length) { - linearized = Arrays.copyOf(linearized, linearized.length + 100000); - } - - visited.set(node); - linearized[last++] = node; - - for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { - if (!fsa.isArcTerminal(arc)) { - int target = fsa.getEndNode(arc); - if (!visited.get(target)) - nodes.push(target); - } - } - } - - return Arrays.copyOf(linearized, last); - } - - /** - * Update arc offsets assuming the given goto length. - */ - private boolean emitArcs(FSA fsa, OutputStream os, int[] linearized, - int gtl, int nodeDataLength) throws IOException { - final ByteBuffer bb = ByteBuffer.allocate(Math.max(MAX_NODE_DATA_SIZE, - MAX_ARC_SIZE)); - - int offset = 0; - - // Add dummy terminal state. - offset += emitNodeData(bb, os, nodeDataLength, 0); - offset += emitArc(bb, os, gtl, 0, (byte) 0, 0); - - // Add epsilon state. - offset += emitNodeData(bb, os, nodeDataLength, 0); - if (fsa.getRootNode() != 0) - offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC | FSA5.BIT_TARGET_NEXT, (byte) '^', 0); - else - offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC , (byte) '^', 0); - - int maxStates = linearized.length; - for (int j = 0; j < maxStates; j++) { - final int s = linearized[j]; - - if (os == null) { - offsets.put(s, offset); - } else { - assert offsets.get(s) == offset : s + " " + offsets.get(s) + " " + offset; - } - - offset += emitNodeData(bb, os, nodeDataLength, withNumbers ? 
numbers.get(s) : 0); - - for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { - int targetOffset; - final int target; - if (fsa.isArcTerminal(arc)) { - targetOffset = 0; - target = 0; - } else { - target = fsa.getEndNode(arc); - targetOffset = offsets.get(target); - } - - int flags = 0; - if (fsa.isArcFinal(arc)) { - flags |= FSA5.BIT_FINAL_ARC; - } - - if (fsa.getNextArc(arc) == 0) { - flags |= FSA5.BIT_LAST_ARC; - - if (j + 1 < maxStates && target == linearized[j + 1] - && targetOffset != 0) { - flags |= FSA5.BIT_TARGET_NEXT; - targetOffset = 0; - } - } - - int bytes = emitArc(bb, os, gtl, flags, fsa.getArcLabel(arc), targetOffset); - if (bytes < 0) - // gtl too small. interrupt eagerly. - return false; - - offset += bytes; - } - } - - return true; - } - - /** */ - private int emitArc(ByteBuffer bb, OutputStream os, int gtl, int flags, byte label, int targetOffset) - throws IOException - { - int arcBytes = (flags & FSA5.BIT_TARGET_NEXT) != 0 ? SIZEOF_FLAGS : gtl; - - flags |= (targetOffset << 3); - bb.put(label); - for (int b = 0; b < arcBytes; b++) { - bb.put((byte) flags); - flags >>>= 8; - } - - if (flags != 0) { - // gtl too small. interrupt eagerly. 
- return -1; - } - - bb.flip(); - int bytes = bb.remaining(); - if (os != null) { - os.write(bb.array(), bb.position(), bb.remaining()); - } - bb.clear(); - - return bytes; - } - - /** */ - private int emitNodeData(ByteBuffer bb, OutputStream os, - int nodeDataLength, int number) throws IOException { - if (nodeDataLength > 0 && os != null) { - for (int i = 0; i < nodeDataLength; i++) { - bb.put((byte) number); - number >>>= 8; - } - - bb.flip(); - os.write(bb.array(), bb.position(), bb.remaining()); - bb.clear(); - } - - return nodeDataLength; - } -} diff --git a/src/morfologik/fsa/FSABuilder.java b/src/morfologik/fsa/FSABuilder.java deleted file mode 100644 index 0cf7cc0..0000000 --- a/src/morfologik/fsa/FSABuilder.java +++ /dev/null @@ -1,486 +0,0 @@ -package morfologik.fsa; - -import java.util.*; - -import morfologik.util.Arrays; - -import static morfologik.fsa.ConstantArcSizeFSA.*; - -/** - * Fast, memory-conservative finite state automaton builder, returning a - * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction - * speed and memory consumption). - */ -public final class FSABuilder { - /** - * Debug and information constants. - * - * @see FSABuilder#getInfo() - */ - public enum InfoEntry { - SERIALIZATION_BUFFER_SIZE("Serialization buffer size"), - SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"), - CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"), - MAX_ACTIVE_PATH_LENGTH("Max active path"), - STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"), - STATE_REGISTRY_SIZE("Registry hash entries"), - ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)"); - - private final String stringified; - - InfoEntry(String stringified) { - this.stringified = stringified; - } - - @Override - public String toString() { - return stringified; - } - } - - /** A megabyte. */ - private final static int MB = 1024 * 1024; - - /** - * Internal serialized FSA buffer expand ratio. 
- */ - private final static int BUFFER_GROWTH_SIZE = 5 * MB; - - /** - * Maximum number of labels from a single state. - */ - private final static int MAX_LABELS = 256; - - /** - * Comparator comparing full byte arrays consistently with - * {@link #compare(byte[], int, int, byte[], int, int)}. - */ - public static final Comparator LEXICAL_ORDERING = new Comparator() { - public int compare(byte[] o1, byte[] o2) { - return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length); - } - }; - - /** - * Internal serialized FSA buffer expand ratio. - */ - private final int bufferGrowthSize; - - /** - * Holds serialized and mutable states. Each state is a sequential list of - * arcs, the last arc is marked with {@link #BIT_ARC_LAST}. - */ - private byte[] serialized = new byte[0]; - - /** - * Number of bytes already taken in {@link #serialized}. Start from 1 to - * keep 0 a sentinel value (for the hash set and final state). - */ - private int size; - - /** - * States on the "active path" (still mutable). Values are addresses of each - * state's first arc. - */ - private int[] activePath = new int[0]; - - /** - * Current length of the active path. - */ - private int activePathLen; - - /** - * The next offset at which an arc will be added to the given state on - * {@link #activePath}. - */ - private int[] nextArcOffset = new int[0]; - - /** - * Root state. If negative, the automaton has been built already and cannot be extended. - */ - private int root; - - /** - * An epsilon state. The first and only arc of this state points either - * to the root or to the terminal state, indicating an empty automaton. - */ - private int epsilon; - - /** - * Hash set of state addresses in {@link #serialized}, hashed by - * {@link #hash(int, int)}. Zero reserved for an unoccupied slot. - */ - private int[] hashSet = new int[2]; - - /** - * Number of entries currently stored in {@link #hashSet}. 
- */ - private int hashSize = 0; - - /** - * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions only. - */ - private byte [] previous; - - /** - * Information about the automaton and its compilation. - */ - private TreeMap info; - - /** - * {@link #previous} sequence's length, used in assertions only. - */ - private int previousLength; - - /** */ - public FSABuilder() { - this(BUFFER_GROWTH_SIZE); - } - - /** */ - public FSABuilder(int bufferGrowthSize) { - this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS); - - // Allocate epsilon state. - epsilon = allocateState(1); - serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST; - - // Allocate root, with an initial empty set of output arcs. - expandActivePath(1); - root = activePath[0]; - } - - /** - * Add a single sequence of bytes to the FSA. The input must be lexicographically greater - * than any previously added sequence. - */ - public void add(byte[] sequence, int start, int len) { - assert serialized != null : "Automaton already built."; - assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 : - "Input must be sorted: " - + Arrays.toString(previous, 0, previousLength) + " >= " - + Arrays.toString(sequence, start, len); - assert setPrevious(sequence, start, len); - - // Determine common prefix length. - final int commonPrefix = commonPrefix(sequence, start, len); - - // Make room for extra states on active path, if needed. - expandActivePath(len); - - // Freeze all the states after the common prefix. - for (int i = activePathLen - 1; i > commonPrefix; i--) { - final int frozenState = freezeState(i); - setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState); - nextArcOffset[i] = activePath[i]; - } - - // Create arcs to new suffix states. 
- for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) { - final int p = nextArcOffset[i - 1]; - - serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0); - serialized[p + LABEL_OFFSET] = sequence[j++]; - setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]); - - nextArcOffset[i - 1] = p + ARC_SIZE; - } - - // Save last sequence's length so that we don't need to calculate it again. - this.activePathLen = len; - } - - /** Number of serialization buffer reallocations. */ - private int serializationBufferReallocations; - - /** - * Complete the automaton. - */ - public FSA complete() { - add(new byte[0], 0, 0); - - if (nextArcOffset[0] - activePath[0] == 0) { - // An empty FSA. - setArcTarget(epsilon, TERMINAL_STATE); - } else { - // An automaton with at least a single arc from root. - root = freezeState(0); - setArcTarget(epsilon, root); - } - - info = new TreeMap(); - info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length); - info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations); - info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size); - info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length); - info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length); - info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize); - info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB, - (this.serialized.length + this.hashSet.length * 4) / (double) MB); - - final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon); - this.serialized = null; - this.hashSet = null; - return fsa; - } - - /** - * Build a minimal, deterministic automaton from a sorted list of byte sequences. - */ - public static FSA build(byte[][] input) { - final FSABuilder builder = new FSABuilder(); - - for (byte [] chs : input) - builder.add(chs, 0, chs.length); - - return builder.complete(); - } - - /** - * Build a minimal, deterministic automaton from an iterable list of byte sequences. 
- */ - public static FSA build(Iterable input) { - final FSABuilder builder = new FSABuilder(); - - for (byte [] chs : input) - builder.add(chs, 0, chs.length); - - return builder.complete(); - } - - /** - * Return various statistics concerning the FSA and its compilation. - */ - public Map getInfo() { - return info; - } - - /** Is this arc the state's last? */ - private boolean isArcLast(int arc) { - return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0; - } - - /** Is this arc final? */ - private boolean isArcFinal(int arc) { - return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0; - } - - /** Get label's arc. */ - private byte getArcLabel(int arc) { - return serialized[arc + LABEL_OFFSET]; - } - - /** - * Fills the target state address of an arc. - */ - private void setArcTarget(int arc, int state) { - arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE; - for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) { - serialized[--arc] = (byte) state; - state >>>= 8; - } - } - - /** - * Returns the address of an arc. - */ - private int getArcTarget(int arc) { - arc += ADDRESS_OFFSET; - return (serialized[arc]) << 24 | - (serialized[arc + 1] & 0xff) << 16 | - (serialized[arc + 2] & 0xff) << 8 | - (serialized[arc + 3] & 0xff); - } - - /** - * @return The number of common prefix characters with the previous - * sequence. - */ - private int commonPrefix(byte[] sequence, int start, int len) { - // Empty root state case. - final int max = Math.min(len, activePathLen); - int i; - for (i = 0; i < max; i++) { - final int lastArc = nextArcOffset[i] - ARC_SIZE; - if (sequence[start++] != getArcLabel(lastArc)) { - break; - } - } - - return i; - } - - /** - * Freeze a state: try to find an equivalent state in the interned states - * dictionary first, if found, return it, otherwise, serialize the mutable - * state at activePathIndex and return it. 
- */ - private int freezeState(final int activePathIndex) { - final int start = activePath[activePathIndex]; - final int end = nextArcOffset[activePathIndex]; - final int len = end - start; - - // Set the last arc flag on the current active path's state. - serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST; - - // Try to locate a state with an identical content in the hash set. - final int bucketMask = (hashSet.length - 1); - int slot = hash(start, len) & bucketMask; - for (int i = 0;;) { - int state = hashSet[slot]; - if (state == 0) { - state = hashSet[slot] = serialize(activePathIndex); - if (++hashSize > hashSet.length / 2) - expandAndRehash(); - return state; - } else if (equivalent(state, start, len)) { - return state; - } - - slot = (slot + (++i)) & bucketMask; - } - } - - /** - * Reallocate and rehash the hash set. - */ - private void expandAndRehash() { - final int[] newHashSet = new int[hashSet.length * 2]; - final int bucketMask = (newHashSet.length - 1); - - for (int j = 0; j < hashSet.length; j++) { - final int state = hashSet[j]; - if (state > 0) { - int slot = hash(state, stateLength(state)) & bucketMask; - for (int i = 0; newHashSet[slot] > 0;) { - slot = (slot + (++i)) & bucketMask; - } - newHashSet[slot] = state; - } - } - this.hashSet = newHashSet; - } - - /** - * The total length of the serialized state data (all arcs). - */ - private int stateLength(int state) { - int arc = state; - while (!isArcLast(arc)) { - arc += ARC_SIZE; - } - return arc - state + ARC_SIZE; - } - - /** Return true if two regions in {@link #serialized} are identical. */ - private boolean equivalent(int start1, int start2, int len) { - if (start1 + len > size || start2 + len > size) - return false; - - while (len-- > 0) - if (serialized[start1++] != serialized[start2++]) - return false; - - return true; - } - - /** - * Serialize a given state on the active path. 
- */ - private int serialize(final int activePathIndex) { - expandBuffers(); - - final int newState = size; - final int start = activePath[activePathIndex]; - final int len = nextArcOffset[activePathIndex] - start; - System.arraycopy(serialized, start, serialized, newState, len); - - size += len; - return newState; - } - - /** - * Hash code of a fragment of {@link #serialized} array. - */ - private int hash(int start, int byteCount) { - assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?"; - - int h = 0; - for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) { - h = 17 * h + getArcLabel(start); - h = 17 * h + getArcTarget(start); - if (isArcFinal(start)) h += 17; - } - - return h; - } - - /** - * Append a new mutable state to the active path. - */ - private void expandActivePath(int size) { - if (activePath.length < size) { - final int p = activePath.length; - activePath = java.util.Arrays.copyOf(activePath, size); - nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size); - - for (int i = p; i < size; i++) { - nextArcOffset[i] = activePath[i] = - allocateState(/* assume max labels count */ MAX_LABELS); - } - } - } - - /** - * Expand internal buffers for the next state. - */ - private void expandBuffers() { - if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) { - serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize); - serializationBufferReallocations++; - } - } - - /** - * Allocate space for a state with the given number of outgoing labels. - * - * @return state offset - */ - private int allocateState(int labels) { - expandBuffers(); - final int state = size; - size += labels * ARC_SIZE; - return state; - } - - /** - * Copy current into an internal buffer. 
- */ - private boolean setPrevious(byte [] sequence, int start, int length) { - if (previous == null || previous.length < length) { - previous = new byte [length]; - } - - System.arraycopy(sequence, start, previous, 0, length); - previousLength = length; - return true; - } - - /** - * Lexicographic order of input sequences. By default, consistent with the "C" sort - * (absolute value of bytes, 0-255). - */ - public static int compare(byte [] s1, int start1, int lens1, - byte [] s2, int start2, int lens2) { - final int max = Math.min(lens1, lens2); - - for (int i = 0; i < max; i++) { - final byte c1 = s1[start1++]; - final byte c2 = s2[start2++]; - if (c1 != c2) - return (c1 & 0xff) - (c2 & 0xff); - } - - return lens1 - lens2; - } -} diff --git a/src/morfologik/fsa/FSAFinalStatesIterator.java b/src/morfologik/fsa/FSAFinalStatesIterator.java deleted file mode 100644 index 9e381f4..0000000 --- a/src/morfologik/fsa/FSAFinalStatesIterator.java +++ /dev/null @@ -1,154 +0,0 @@ -package morfologik.fsa; - -import java.nio.ByteBuffer; -import java.util.*; - -/** - * An iterator that traverses the right language of a given node (all sequences - * reachable from a given node). - */ -public final class FSAFinalStatesIterator implements Iterator { - /** - * Default expected depth of the recursion stack (estimated longest sequence - * in the automaton). Buffers expand by the same value if exceeded. - */ - private final static int EXPECTED_MAX_STATES = 15; - - /** The FSA to which this iterator belongs. */ - private final FSA fsa; - - /** An internal cache for the next element in the FSA */ - private ByteBuffer nextElement; - - /** - * A buffer for the current sequence of bytes from the current node to the - * root. - */ - private byte[] buffer = new byte[EXPECTED_MAX_STATES]; - - /** Reusable byte buffer wrapper around {@link #buffer}. */ - private ByteBuffer bufferWrapper = ByteBuffer.wrap(buffer); - - /** An arc stack for DFS when processing the automaton. 
*/ - private int[] arcs = new int[EXPECTED_MAX_STATES]; - - /** Current processing depth in {@link #arcs}. */ - private int position; - - /** - * Create an instance of the iterator for a given node. - */ - public FSAFinalStatesIterator(FSA fsa, int node) { - this.fsa = fsa; - - if (fsa.getFirstArc(node) != 0) { - restartFrom(node); - } - } - - /** - * Restart walking from node. Allows iterator reuse. - */ - public void restartFrom(int node) { - position = 0; - bufferWrapper.clear(); - nextElement = null; - - pushNode(node); - } - - /** Returns true if there are still elements in this iterator. */ - @Override - public boolean hasNext() { - if (nextElement == null) { - nextElement = advance(); - } - - return nextElement != null; - } - - /** - * @return Returns a {@link ByteBuffer} with the sequence corresponding to - * the next final state in the automaton. - */ - @Override - public ByteBuffer next() { - if (nextElement != null) { - final ByteBuffer cache = nextElement; - nextElement = null; - return cache; - } else { - final ByteBuffer cache = advance(); - if (cache == null) { - throw new NoSuchElementException(); - } - return cache; - } - } - - /** - * Advances to the next available final state. - */ - private final ByteBuffer advance() { - if (position == 0) { - return null; - } - - while (position > 0) { - final int lastIndex = position - 1; - final int arc = arcs[lastIndex]; - - if (arc == 0) { - // Remove the current node from the queue. - position--; - continue; - } - - // Go to the next arc, but leave it on the stack - // so that we keep the recursion depth level accurate. - arcs[lastIndex] = fsa.getNextArc(arc); - - // Expand buffer if needed. 
- final int bufferLength = this.buffer.length; - if (lastIndex >= bufferLength) { - this.buffer = Arrays.copyOf(buffer, bufferLength - + EXPECTED_MAX_STATES); - this.bufferWrapper = ByteBuffer.wrap(buffer); - } - buffer[lastIndex] = fsa.getArcLabel(arc); - - if (!fsa.isArcTerminal(arc)) { - // Recursively descend into the arc's node. - pushNode(fsa.getEndNode(arc)); - } - - if (fsa.isArcFinal(arc)) { - bufferWrapper.clear(); - bufferWrapper.limit(lastIndex + 1); - return bufferWrapper; - } - } - - return null; - } - - /** - * Not implemented in this iterator. - */ - @Override - public void remove() { - throw new UnsupportedOperationException("Read-only iterator."); - } - - /** - * Descends to a given node, adds its arcs to the stack to be traversed. - */ - private void pushNode(int node) { - // Expand buffers if needed. - if (position == arcs.length) { - arcs = Arrays.copyOf(arcs, arcs.length + EXPECTED_MAX_STATES); - } - - arcs[position++] = fsa.getFirstArc(node); - } -} \ No newline at end of file diff --git a/src/morfologik/fsa/FSAFlags.java b/src/morfologik/fsa/FSAFlags.java deleted file mode 100644 index 7b9a730..0000000 --- a/src/morfologik/fsa/FSAFlags.java +++ /dev/null @@ -1,64 +0,0 @@ -package morfologik.fsa; - -import java.util.Set; - -/** - * FSA automaton flags. Where applicable, flags follow Daciuk's fsa package. - */ -public enum FSAFlags { - /** Daciuk: flexible FSA encoding. */ - FLEXIBLE(1 << 0), - - /** Daciuk: stop bit in use. */ - STOPBIT(1 << 1), - - /** Daciuk: next bit in use. */ - NEXTBIT(1 << 2), - - /** Daciuk: tails compression. */ - TAILS(1 << 3), - - /* - * These flags are outside of byte range (never occur in Daciuk's FSA). - */ - - /** - * The FSA contains right-language count numbers on states. - * - * @see FSA#getRightLanguageCount(int) - */ - NUMBERS(1 << 8), - - /** - * The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package - * compatibility). 
- */ - SEPARATORS(1 << 9); - - /** - * Bit mask for the corresponding flag. - */ - public final int bits; - - /** */ - private FSAFlags(int bits) { - this.bits = bits; - } - - /** - * Returns true if the corresponding flag is set in the bit set. - */ - public static boolean isSet(int flags, FSAFlags flag) { - return (flags & flag.bits) != 0; - } - - /** - * Returns the set of flags encoded in a single short. - */ - public static short asShort(Set flags) { - short value = 0; - for (FSAFlags f : flags) - value |= f.bits; - return value; - } -} \ No newline at end of file diff --git a/src/morfologik/fsa/FSAHeader.java b/src/morfologik/fsa/FSAHeader.java deleted file mode 100644 index 76fd6ff..0000000 --- a/src/morfologik/fsa/FSAHeader.java +++ /dev/null @@ -1,52 +0,0 @@ -package morfologik.fsa; - -import java.io.IOException; -import java.io.InputStream; - -import morfologik.util.FileUtils; -import static morfologik.util.FileUtils.*; - -/** - * Standard FSA file header, as described in fsa package documentation. - */ -final class FSAHeader { - /** - * FSA magic (4 bytes). - */ - public final static int FSA_MAGIC = ('\\' << 24) | ('f' << 16) | ('s' << 8) | ('a'); - - /** - * Maximum length of the header block. - */ - public static final int MAX_HEADER_LENGTH = 4 + 8; - - /** FSA version number. */ - public byte version; - - /** Filler character. */ - public byte filler; - - /** Annotation character. */ - public byte annotation; - - /** Goto field (may be a compound, depending on the automaton version). */ - public byte gtl; - - /** - * Read FSA header from a stream, consuming its bytes. - * - * @throws IOException If the stream ends prematurely or if it contains invalid data. 
- */ - public static FSAHeader read(InputStream in) throws IOException { - if (FSA_MAGIC != FileUtils.readInt(in)) - throw new IOException("Invalid file header magic bytes."); - - final FSAHeader h = new FSAHeader(); - h.version = readByte(in); - h.filler = readByte(in); - h.annotation = readByte(in); - h.gtl = readByte(in); - - return h; - } -} diff --git a/src/morfologik/fsa/FSAInfo.java b/src/morfologik/fsa/FSAInfo.java deleted file mode 100644 index dc5cb27..0000000 --- a/src/morfologik/fsa/FSAInfo.java +++ /dev/null @@ -1,157 +0,0 @@ -package morfologik.fsa; - -import java.util.BitSet; -import java.util.HashMap; - -/** - * Compute additional information about an FSA: number of arcs, nodes, etc. - */ -public final class FSAInfo { - /** - * Computes the exact number of states and nodes by recursively traversing - * the FSA. - */ - private static class NodeVisitor { - final BitSet visitedArcs = new BitSet(); - final BitSet visitedNodes = new BitSet(); - - int nodes; - int arcs; - int totalArcs; - - private final FSA fsa; - - NodeVisitor(FSA fsa) { - this.fsa = fsa; - } - - public void visitNode(final int node) { - if (visitedNodes.get(node)) { - return; - } - visitedNodes.set(node); - - nodes++; - for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa - .getNextArc(arc)) { - if (!visitedArcs.get(arc)) { - arcs++; - } - totalArcs++; - visitedArcs.set(arc); - - if (!fsa.isArcTerminal(arc)) { - visitNode(fsa.getEndNode(arc)); - } - } - } - } - - /** - * Computes the exact number of final states. 
- */ - private static class FinalStateVisitor { - final HashMap visitedNodes - = new HashMap(); - - private final FSA fsa; - - FinalStateVisitor(FSA fsa) { - this.fsa = fsa; - } - - public int visitNode(int node) { - Integer cached = visitedNodes.get(node); - if (cached != null) - return cached; - - int fromHere = 0; - for (int arc = fsa.getFirstArc(node); - arc != 0; arc = fsa.getNextArc(arc)) - { - if (fsa.isArcFinal(arc)) - fromHere++; - - if (!fsa.isArcTerminal(arc)) { - fromHere += visitNode(fsa.getEndNode(arc)); - } - } - visitedNodes.put(node, fromHere); - return fromHere; - } - } - - /** - * Number of nodes in the automaton. - */ - public final int nodeCount; - - /** - * Number of arcs in the automaton, excluding an arcs from the zero node - * (initial) and an arc from the start node to the root node. - */ - public final int arcsCount; - - /** - * Total number of arcs, counting arcs that physically overlap due to - * merging. - */ - public final int arcsCountTotal; - - /** - * Number of final states (number of input sequences stored in the automaton). - */ - public final int finalStatesCount; - - /** - * Arcs size (in serialized form). 
- */ - public final int size; - - /* - * - */ - public FSAInfo(FSA fsa) { - final NodeVisitor w = new NodeVisitor(fsa); - int root = fsa.getRootNode(); - if (root > 0) { - w.visitNode(root); - } - - this.nodeCount = 1 + w.nodes; - this.arcsCount = 1 + w.arcs; - this.arcsCountTotal = 1 + w.totalArcs; - - final FinalStateVisitor fsv = new FinalStateVisitor(fsa); - this.finalStatesCount = fsv.visitNode(fsa.getRootNode()); - - if (fsa instanceof FSA5) { - this.size = ((FSA5) fsa).arcs.length; - } else { - this.size = 0; - } - } - - /* - * - */ - public FSAInfo(int nodeCount, int arcsCount, int arcsCountTotal, int finalStatesCount) { - this.nodeCount = nodeCount; - this.arcsCount = arcsCount; - this.arcsCountTotal = arcsCountTotal; - this.finalStatesCount = finalStatesCount; - this.size = 0; - } - - /* - * - */ - @Override - public String toString() { - return "Nodes: " + nodeCount - + ", arcs visited: " + arcsCount - + ", arcs total: " + arcsCountTotal - + ", final states: " + finalStatesCount - + ", size: " + size; - } -} diff --git a/src/morfologik/fsa/FSASerializer.java b/src/morfologik/fsa/FSASerializer.java deleted file mode 100644 index 414640e..0000000 --- a/src/morfologik/fsa/FSASerializer.java +++ /dev/null @@ -1,45 +0,0 @@ -package morfologik.fsa; - -import java.io.IOException; -import java.io.OutputStream; -import java.util.Set; - -import morfologik.tools.IMessageLogger; - -/** - * All FSA serializers to binary formats will implement this interface. - */ -public interface FSASerializer { - /** - * Serialize a finite state automaton to an output stream. - */ - public T serialize(FSA fsa, T os) throws IOException; - - /** - * Returns the set of flags supported by the serializer (and the output automaton). - */ - public Set getFlags(); - - /** - * Log extra messages during construction. - */ - public FSASerializer withLogger(IMessageLogger logger); - - /** - * Supports built-in filler separator. 
Only if {@link #getFlags()} returns - * {@link FSAFlags#SEPARATORS}. - */ - public FSASerializer withFiller(byte filler); - - /** - * Supports built-in annotation separator. Only if {@link #getFlags()} returns - * {@link FSAFlags#SEPARATORS}. - */ - public FSASerializer withAnnotationSeparator(byte annotationSeparator); - - /** - * Supports built-in right language count on nodes, speeding up perfect hash counts. - * Only if {@link #getFlags()} returns {@link FSAFlags#NUMBERS}. - */ - public FSASerializer withNumbers(); -} diff --git a/src/morfologik/fsa/FSATraversal.java b/src/morfologik/fsa/FSATraversal.java deleted file mode 100644 index 9e59003..0000000 --- a/src/morfologik/fsa/FSATraversal.java +++ /dev/null @@ -1,169 +0,0 @@ -package morfologik.fsa; - -import static morfologik.fsa.MatchResult.*; - -/** - * This class implements some common matching and scanning operations on a - * generic FSA. - */ -public final class FSATraversal { - /** - * Target automaton. - */ - private final FSA fsa; - - /** - * Traversals of the given FSA. - */ - public FSATraversal(FSA fsa) { - this.fsa = fsa; - } - - /** - * Calculate perfect hash for a given input sequence of bytes. The perfect hash requires - * that {@link FSA} is built with {@link FSAFlags#NUMBERS} and corresponds to the sequential - * order of input sequences used at automaton construction time. - * - * @param start Start index in the sequence array. - * @param length Length of the byte sequence, must be at least 1. - * - * @return Returns a unique integer assigned to the input sequence in the automaton (reflecting - * the number of that sequence in the input used to build the automaton). Returns a negative - * integer if the input sequence was not part of the input from which the automaton was created. - * The type of mismatch is a constant defined in {@link MatchResult}. 
- */ - public int perfectHash(byte[] sequence, int start, int length, int node) { - assert fsa.getFlags().contains(FSAFlags.NUMBERS) : "FSA not built with NUMBERS option."; - assert length > 0 : "Must be a non-empty sequence."; - - int hash = 0; - final int end = start + length - 1; - - int seqIndex = start; - byte label = sequence[seqIndex]; - - // Seek through the current node's labels, looking for 'label', update hash. - for (int arc = fsa.getFirstArc(node); arc != 0;) { - if (fsa.getArcLabel(arc) == label) { - if (fsa.isArcFinal(arc)) { - if (seqIndex == end) - return hash; - - hash++; - } - - if (fsa.isArcTerminal(arc)) { - /* The automaton contains a prefix of the input sequence. */ - return AUTOMATON_HAS_PREFIX; - } - - // The sequence is a prefix of one of the sequences stored in the automaton. - if (seqIndex == end) { - return SEQUENCE_IS_A_PREFIX; - } - - // Make a transition along the arc, go the target node's first arc. - arc = fsa.getFirstArc(fsa.getEndNode(arc)); - label = sequence[++seqIndex]; - continue; - } else { - if (fsa.isArcFinal(arc)) - hash++; - if (!fsa.isArcTerminal(arc)) - hash += fsa.getRightLanguageCount(fsa.getEndNode(arc)); - } - - arc = fsa.getNextArc(arc); - } - - // Labels of this node ended without a match on the sequence. - // Perfect hash does not exist. - return NO_MATCH; - } - - /** - * @see #perfectHash(byte[], int, int, int) - */ - public int perfectHash(byte[] sequence) { - return perfectHash(sequence, 0, sequence.length, fsa.getRootNode()); - } - - /** - * Same as {@link #match(byte[], int, int, int)}, but allows passing - * a reusable {@link MatchResult} object so that no intermediate garbage is - * produced. - * - * @return The same object as result, but with reset internal - * type and other fields. 
- */ - public MatchResult match(MatchResult result, - byte[] sequence, int start, int length, int node) - { - if (node == 0) { - result.reset(NO_MATCH, start, node); - return result; - } - - final FSA fsa = this.fsa; - final int end = start + length; - for (int i = start; i < end; i++) { - final int arc = fsa.getArc(node, sequence[i]); - if (arc != 0) { - if (fsa.isArcFinal(arc) && i + 1 == end) { - /* The automaton has an exact match of the input sequence. */ - result.reset(EXACT_MATCH, i, node); - return result; - } - - if (fsa.isArcTerminal(arc)) { - /* The automaton contains a prefix of the input sequence. */ - result.reset(AUTOMATON_HAS_PREFIX, i + 1, 0); - return result; - } - - // Make a transition along the arc. - node = fsa.getEndNode(arc); - } else { - result.reset(NO_MATCH, i, node); - return result; - } - } - - /* The sequence is a prefix of at least one sequence in the automaton. */ - result.reset(SEQUENCE_IS_A_PREFIX, 0, node); - return result; - } - - /** - * Finds a matching path in the dictionary for a given sequence of labels - * from sequence and starting at node node. - * - * @param sequence - * An array of labels to follow in the FSA. - * @param start - * Starting index in sequence. - * @param length - * How many symbols to consider from sequence? - * @param node - * Start node identifier in the FSA. 
- * - * @see #match(byte [], int) - */ - public MatchResult match(byte[] sequence, int start, int length, int node) { - return match(new MatchResult(), sequence, start, length, node); - } - - /** - * @see #match(byte[], int, int, int) - */ - public MatchResult match(byte[] sequence, int node) { - return match(sequence, 0, sequence.length, node); - } - - /** - * @see #match(byte[], int, int, int) - */ - public MatchResult match(byte[] sequence) { - return match(sequence, fsa.getRootNode()); - } -} \ No newline at end of file diff --git a/src/morfologik/fsa/FSAUtils.java b/src/morfologik/fsa/FSAUtils.java deleted file mode 100644 index cad611e..0000000 --- a/src/morfologik/fsa/FSAUtils.java +++ /dev/null @@ -1,202 +0,0 @@ -package morfologik.fsa; - -import java.io.IOException; -import java.io.StringWriter; -import java.io.Writer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; -import java.util.TreeMap; - -import com.carrotsearch.hppc.IntIntOpenHashMap; - -/** - * Other FSA-related utilities not directly associated with the class hierarchy. - */ -public final class FSAUtils { - public final static class IntIntHolder { - public int a; - public int b; - - public IntIntHolder(int a, int b) { - this.a = a; - this.b = b; - } - - public IntIntHolder() { - } - } - - /** - * Returns the right-language reachable from a given FSA node, formatted - * as an input for the graphviz package (expressed in the dot - * language). - */ - public static String toDot(FSA fsa, int node) { - try { - StringWriter w = new StringWriter(); - toDot(w, fsa, node); - return w.toString(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - /** - * Saves the right-language reachable from a given FSA node, formatted - * as an input for the graphviz package (expressed in the dot - * language), to the given writer. 
- */ - public static void toDot(Writer w, FSA fsa, int node) throws IOException { - w.write("digraph Automaton {\n"); - w.write(" rankdir = LR;\n"); - - final BitSet visited = new BitSet(); - - w.write(" stop [shape=doublecircle,label=\"\"];\n"); - w.write(" initial [shape=plaintext,label=\"\"];\n"); - w.write(" initial -> " + node + "\n\n"); - - visitNode(w, 0, fsa, node, visited); - w.write("}\n"); - } - - private static void visitNode(Writer w, int d, FSA fsa, int s, BitSet visited) throws IOException { - visited.set(s); - w.write(" "); w.write(Integer.toString(s)); - - if (fsa.getFlags().contains(FSAFlags.NUMBERS)) { - int nodeNumber = fsa.getRightLanguageCount(s); - w.write(" [shape=circle,label=\"" + nodeNumber + "\"];\n"); - } else { - w.write(" [shape=circle,label=\"\"];\n"); - } - - for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { - w.write(" "); - w.write(Integer.toString(s)); - w.write(" -> "); - if (fsa.isArcTerminal(arc)) { - w.write("stop"); - } else { - w.write(Integer.toString(fsa.getEndNode(arc))); - } - - final byte label = fsa.getArcLabel(arc); - w.write(" [label=\""); - if (Character.isLetterOrDigit(label)) - w.write((char) label); - else { - w.write("0x"); - w.write(Integer.toHexString(label & 0xFF)); - } - w.write("\""); - if (fsa.isArcFinal(arc)) w.write(" arrowhead=\"tee\""); - if (fsa instanceof FSA5) { - if (((FSA5) fsa).isNextSet(arc)) { - w.write(" color=\"blue\""); - } - } - - w.write("]\n"); - } - - for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { - if (!fsa.isArcTerminal(arc)) { - int endNode = fsa.getEndNode(arc); - if (!visited.get(endNode)) { - visitNode(w, d + 1, fsa, endNode, visited); - } - } - } - } - - /** - * All byte sequences generated as the right language of state. 
- */ - public static ArrayList rightLanguage(FSA fsa, int state) { - final ArrayList rl = new ArrayList(); - final byte [] buffer = new byte [0]; - - descend(fsa, state, buffer, 0, rl); - - return rl; - } - - /** - * Recursive descend and collection of the right language. - */ - private static byte [] descend(FSA fsa, int state, byte [] b, int position, ArrayList rl) { - - if (b.length <= position) { - b = Arrays.copyOf(b, position + 1); - } - - for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { - b[position] = fsa.getArcLabel(arc); - - if (fsa.isArcFinal(arc)) { - rl.add(Arrays.copyOf(b, position + 1)); - } - - if (!fsa.isArcTerminal(arc)) - b = descend(fsa, fsa.getEndNode(arc), b, position + 1, rl); - } - - return b; - } - - /** - * Calculate fan-out ratio. - * @return The returned array: result[outgoing-arcs] - */ - public static TreeMap calculateFanOuts(final FSA fsa, int root) { - final int [] result = new int [256]; - fsa.visitInPreOrder(new StateVisitor() { - public boolean accept(int state) { - int count = 0; - for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) - count++; - result[count]++; - return true; - } - }); - - TreeMap output = new TreeMap(); - - int low = 1; // Omit #0, there is always a single node like that (dummy). - while (low < result.length && result[low] == 0) low++; - - int high = result.length - 1; - while (high >= 0 && result[high] == 0) high--; - - for (int i = low; i <= high; i++) { - output.put(i, result[i]); - } - - return output; - } - - /** - * Calculate the size of right language for each state in an FSA. - */ - public static IntIntOpenHashMap rightLanguageForAllStates(final FSA fsa) { - final IntIntOpenHashMap numbers = new IntIntOpenHashMap(); - - fsa.visitInPostOrder(new StateVisitor() { - public boolean accept(int state) { - int thisNodeNumber = 0; - for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { - thisNodeNumber += - (fsa.isArcFinal(arc) ? 
1 : 0) + - (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc))); - } - numbers.put(state, thisNodeNumber); - - return true; - } - }); - - return numbers; - } -} diff --git a/src/morfologik/fsa/MatchResult.java b/src/morfologik/fsa/MatchResult.java deleted file mode 100644 index 2f5cbd7..0000000 --- a/src/morfologik/fsa/MatchResult.java +++ /dev/null @@ -1,86 +0,0 @@ -package morfologik.fsa; - -/** - * A matching result returned from {@link FSATraversal}. - * - * @see FSATraversal - */ -public final class MatchResult { - /** - * The automaton has exactly one match for the input sequence. - */ - public static final int EXACT_MATCH = 0; - - /** - * The automaton has no match for the input sequence. - */ - public static final int NO_MATCH = -1; - - /** - * The automaton contains a prefix of the input sequence. That is: - * one of the input sequences used to build the automaton is a - * prefix of the input sequence that is shorter than the sequence. - * - *

{@link MatchResult#index} will contain an index of the - * first character of the input sequence not present in the - * dictionary.

- */ - public static final int AUTOMATON_HAS_PREFIX = -3; - - /** - * The sequence is a prefix of at least one sequence in the automaton. - * {@link MatchResult#node} returns the node from which all sequences - * with the given prefix start in the automaton. - */ - public static final int SEQUENCE_IS_A_PREFIX = -4; - - /** - * One of the match kind constants defined in this class. - * - * @see #NO_MATCH - * @see #EXACT_MATCH - * @see #AUTOMATON_HAS_PREFIX - * @see #SEQUENCE_IS_A_PREFIX - */ - public int kind; - - /** - * Input sequence's index, interpretation depends on {@link #kind}. - */ - public int index; - - /** - * Automaton node, interpretation depends on the {@link #kind}. - */ - public int node; - - /* - * - */ - MatchResult(int kind, int index, int node) { - reset(kind, index, node); - } - - /* - * - */ - MatchResult(int kind) { - reset(kind, 0, 0); - } - - /* - * - */ - public MatchResult() { - reset(NO_MATCH, 0, 0); - } - - /* - * - */ - final void reset(int kind, int index, int node) { - this.kind = kind; - this.index = index; - this.node = node; - } -} diff --git a/src/morfologik/fsa/NullMessageLogger.java b/src/morfologik/fsa/NullMessageLogger.java deleted file mode 100644 index 6d326d9..0000000 --- a/src/morfologik/fsa/NullMessageLogger.java +++ /dev/null @@ -1,24 +0,0 @@ -package morfologik.fsa; - -import morfologik.tools.IMessageLogger; - -/* - * Do-nothing logger. - */ -final class NullMessageLogger implements IMessageLogger { - @Override - public void log(String msg) { - } - - @Override - public void startPart(String header) { - } - - @Override - public void endPart() { - } - - @Override - public void log(String header, Object v) { - } -} diff --git a/src/morfologik/fsa/StateVisitor.java b/src/morfologik/fsa/StateVisitor.java deleted file mode 100644 index 8ced239..0000000 --- a/src/morfologik/fsa/StateVisitor.java +++ /dev/null @@ -1,11 +0,0 @@ -package morfologik.fsa; - -/** - * State visitor. 
- * - * @see FSA#visitInPostOrder(StateVisitor) - * @see FSA#visitInPreOrder(StateVisitor) - */ -public interface StateVisitor { - public boolean accept(int state); -} \ No newline at end of file diff --git a/src/morfologik/stemming/ArrayViewList.java b/src/morfologik/stemming/ArrayViewList.java deleted file mode 100644 index 230aef6..0000000 --- a/src/morfologik/stemming/ArrayViewList.java +++ /dev/null @@ -1,111 +0,0 @@ -package morfologik.stemming; - -import java.util.*; - -/** - * A view over a range of an array. - */ -@SuppressWarnings("serial") -final class ArrayViewList extends AbstractList - implements RandomAccess, java.io.Serializable -{ - /** Backing array. */ - private E[] a; - private int start; - private int length; - - /* - * - */ - ArrayViewList(E[] array, int start, int length) { - if (array == null) - throw new IllegalArgumentException(); - wrap(a, start, length); - } - - /* - * - */ - public int size() { - return length; - } - - /* - * - */ - public E get(int index) { - return a[start + index]; - } - - /* - * - */ - public E set(int index, E element) { - throw new UnsupportedOperationException(); - } - - /* - * - */ - public void add(int index, E element) { - throw new UnsupportedOperationException(); - } - - /* - * - */ - public E remove(int index) { - throw new UnsupportedOperationException(); - } - - /* - * - */ - public boolean addAll(int index, Collection c) { - throw new UnsupportedOperationException(); - } - - /* - * - */ - public int indexOf(Object o) { - if (o == null) { - for (int i = start; i < start + length; i++) - if (a[i] == null) - return i - start; - } else { - for (int i = start; i < start + length; i++) - if (o.equals(a[i])) - return i - start; - } - return -1; - } - - public ListIterator listIterator() { - return listIterator(0); - } - - /* - * - */ - public ListIterator listIterator(final int index) { - return Arrays.asList(a).subList(start, start + length).listIterator( - index); - } - - /* - * - */ - public boolean 
contains(Object o) { - return indexOf(o) != -1; - } - - /* - * - */ - void wrap(E[] array, int start, int length) { - this.a = array; - this.start = start; - this.length = length; - } -} diff --git a/src/morfologik/stemming/Dictionary.java b/src/morfologik/stemming/Dictionary.java deleted file mode 100644 index 7441d5e..0000000 --- a/src/morfologik/stemming/Dictionary.java +++ /dev/null @@ -1,169 +0,0 @@ -package morfologik.stemming; - -import java.io.*; -import java.net.URL; -import java.util.*; - -import morfologik.fsa.FSA; -import morfologik.util.FileUtils; -import morfologik.util.ResourceUtils; - -/** - * A dictionary combines {@link FSA} automaton and metadata describing the - * internals of dictionary entries' coding ({@link DictionaryMetadata}. - * - *

- * A dictionary consists of two files: - *

    - *
  • an actual compressed FSA file, - *
  • a metadata file, describing the dictionary. - *
- * Use static methods in this class to read dictionaries and their metadata. - */ -public final class Dictionary { - /** - * Expected metadata file extension. - */ - public final static String METADATA_FILE_EXTENSION = "info"; - - /** - * {@link FSA} automaton with the compiled dictionary data. - */ - public final FSA fsa; - - /** - * Metadata associated with the dictionary. - */ - public final DictionaryMetadata metadata; - - /** - * Default loaded dictionaries. - */ - public static final WeakHashMap defaultDictionaries = new WeakHashMap(); - - /** - * It is strongly recommended to use static methods in this class for - * reading dictionaries. - * - * @param fsa - * An instantiated {@link FSA} instance. - * - * @param metadata - * A map of attributes describing the compression format and - * other settings not contained in the FSA automaton. For an - * explanation of available attributes and their possible values, - * see {@link DictionaryMetadata}. - */ - public Dictionary(FSA fsa, DictionaryMetadata metadata) { - this.fsa = fsa; - this.metadata = metadata; - } - - /** - * Attempts to load a dictionary using the path to the FSA file and the - * expected metadata extension. - */ - public static Dictionary read(File fsaFile) throws IOException { - final File featuresFile = new File(fsaFile.getParent(), - getExpectedFeaturesName(fsaFile.getName())); - - FileUtils.assertExists(featuresFile, true, false); - - return readAndClose(new FileInputStream(fsaFile), new FileInputStream( - featuresFile)); - } - - /** - *

- * Attempts to load a dictionary using the URL to the FSA file and the - * expected metadata extension. - * - *

- * This method can be used to load resource-based dictionaries, but be aware - * of JAR resource-locking issues that arise from resource URLs. - */ - public static Dictionary read(URL fsaURL) throws IOException { - final String fsa = fsaURL.toExternalForm(); - final String features = getExpectedFeaturesName(fsa); - - return readAndClose(ResourceUtils.openInputStream(fsa), ResourceUtils - .openInputStream(features)); - } - - /** - * Attempts to load a dictionary from opened streams of FSA dictionary data - * and associated metadata. - */ - public static Dictionary readAndClose(InputStream fsaData, - InputStream featuresData) throws IOException { - try { - final Properties properties = new Properties(); - properties.load(featuresData); - - final DictionaryMetadata features = DictionaryMetadata - .fromMap(properties); - final FSA fsa = FSA.read(fsaData); - - return new Dictionary(fsa, features); - } finally { - FileUtils.close(fsaData, featuresData); - } - } - - /** - * Returns the expected name of the metadata file, based on the name of the - * FSA dictionary file. The expected name is resolved by truncating any - * suffix of name and appending - * {@link #METADATA_FILE_EXTENSION}. - */ - public static String getExpectedFeaturesName(String name) { - final int dotIndex = name.lastIndexOf('.'); - final String featuresName; - if (dotIndex >= 0) { - featuresName = name.substring(0, dotIndex) + "." - + METADATA_FILE_EXTENSION; - } else { - featuresName = name + "." + METADATA_FILE_EXTENSION; - } - - return featuresName; - } - - /** - * Return a built-in dictionary for a given ISO language code. Dictionaries - * are cached internally for potential reuse. - * - * @throws RuntimeException - * Throws a {@link RuntimeException} if the dictionary is not - * bundled with the library. 
- */ - public static Dictionary getForLanguage(String languageCode) { - if (languageCode == null || "".equals(languageCode)) { - throw new IllegalArgumentException( - "Language code must not be empty."); - } - - synchronized (defaultDictionaries) { - Dictionary dict = defaultDictionaries.get(languageCode); - if (dict != null) - return dict; - - try { - final String dictPath = "morfologik/dictionaries/" + languageCode + ".dict"; - final String metaPath = Dictionary - .getExpectedFeaturesName(dictPath); - - dict = Dictionary.readAndClose( - ResourceUtils.openInputStream(dictPath), - ResourceUtils.openInputStream(metaPath)); - - defaultDictionaries.put(languageCode, dict); - return dict; - } catch (IOException e) { - throw new RuntimeException( - "Default dictionary resource for language '" - + languageCode + "not found.", e); - } - } - } -} diff --git a/src/morfologik/stemming/DictionaryIterator.java b/src/morfologik/stemming/DictionaryIterator.java deleted file mode 100644 index 19b6dad..0000000 --- a/src/morfologik/stemming/DictionaryIterator.java +++ /dev/null @@ -1,144 +0,0 @@ -package morfologik.stemming; - -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.CharsetDecoder; -import java.util.Iterator; - -import morfologik.util.BufferUtils; - -/** - * An iterator over {@link WordData} entries of a {@link Dictionary}. The stems - * can be decoded from compressed format or the compressed form can be - * preserved. 
- */ -public final class DictionaryIterator implements Iterator { - private final CharsetDecoder decoder; - private final Iterator entriesIter; - private final WordData entry; - private final byte separator; - private final DictionaryMetadata dictionaryMetadata; - private final boolean decodeStems; - - private ByteBuffer inflectedBuffer = ByteBuffer.allocate(0); - private CharBuffer inflectedCharBuffer = CharBuffer.allocate(0); - private ByteBuffer temp = ByteBuffer.allocate(0); - - public DictionaryIterator(Dictionary dictionary, CharsetDecoder decoder, - boolean decodeStems) { - this.entriesIter = dictionary.fsa.iterator(); - this.separator = dictionary.metadata.separator; - this.dictionaryMetadata = dictionary.metadata; - this.decoder = decoder; - this.entry = new WordData(decoder); - this.decodeStems = decodeStems; - } - - public boolean hasNext() { - return entriesIter.hasNext(); - } - - public WordData next() { - final ByteBuffer entryBuffer = entriesIter.next(); - entry.reset(); - - /* - * Entries are typically: inflectedcodedBasetag so try to find - * this split. 
- */ - byte[] ba = entryBuffer.array(); - int bbSize = entryBuffer.remaining(); - - int sepPos; - for (sepPos = 0; sepPos < bbSize; sepPos++) { - if (ba[sepPos] == separator) - break; - } - - if (sepPos == bbSize) { - throw new RuntimeException("Invalid dictionary " - + "entry format (missing separator)."); - } - - inflectedBuffer.clear(); - inflectedBuffer = BufferUtils.ensureCapacity(inflectedBuffer, sepPos); - inflectedBuffer.put(ba, 0, sepPos); - inflectedBuffer.flip(); - - inflectedCharBuffer = bytesToChars(inflectedBuffer, inflectedCharBuffer); - entry.wordBuffer = inflectedBuffer; - entry.wordCharSequence = inflectedCharBuffer; - - temp.clear(); - temp = BufferUtils.ensureCapacity(temp, bbSize - sepPos); - sepPos++; - temp.put(ba, sepPos, bbSize - sepPos); - temp.flip(); - - ba = temp.array(); - bbSize = temp.remaining(); - - /* - * Find the next separator byte's position splitting word form and tag. - */ - sepPos = 0; - for (; sepPos < bbSize; sepPos++) { - if (ba[sepPos] == separator) - break; - } - - /* - * Decode the stem into stem buffer. - */ - entry.stemBuffer.clear(); - if (decodeStems) { - entry.stemBuffer = DictionaryLookup.decodeStem(entry.stemBuffer, - ba, sepPos, inflectedBuffer, dictionaryMetadata); - } else { - entry.stemBuffer = BufferUtils.ensureCapacity(entry.stemBuffer, - sepPos); - entry.stemBuffer.put(ba, 0, sepPos); - } - entry.stemBuffer.flip(); - - // Skip separator character, if present. - if (sepPos + 1 <= bbSize) { - sepPos++; - } - - /* - * Decode the tag data. - */ - entry.tagBuffer = BufferUtils.ensureCapacity(entry.tagBuffer, bbSize - - sepPos); - entry.tagBuffer.clear(); - entry.tagBuffer.put(ba, sepPos, bbSize - sepPos); - entry.tagBuffer.flip(); - - return entry; - } - - /** - * Decode the byte buffer, optionally expanding the char buffer. 
- */ - private CharBuffer bytesToChars(ByteBuffer bytes, CharBuffer chars) { - chars.clear(); - final int maxCapacity = (int) (bytes.remaining() * decoder - .maxCharsPerByte()); - if (chars.capacity() <= maxCapacity) { - chars = CharBuffer.allocate(maxCapacity); - } - - bytes.mark(); - decoder.reset(); - decoder.decode(bytes, chars, true); - chars.flip(); - bytes.reset(); - - return chars; - } - - public void remove() { - throw new UnsupportedOperationException(); - } -} diff --git a/src/morfologik/stemming/DictionaryLookup.java b/src/morfologik/stemming/DictionaryLookup.java deleted file mode 100644 index ac90107..0000000 --- a/src/morfologik/stemming/DictionaryLookup.java +++ /dev/null @@ -1,355 +0,0 @@ -package morfologik.stemming; - -import static morfologik.fsa.MatchResult.*; - -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.*; -import java.util.*; - -import morfologik.fsa.*; -import morfologik.util.BufferUtils; - -/** - * This class implements a dictionary lookup over an FSA dictionary. The - * dictionary for this class should be prepared from a text file using Jan - * Daciuk's FSA package (see link below). - * - *

- * Important: finite state automatons in Jan Daciuk's implementation use - * bytes not unicode characters. Therefore objects of this class always - * have to be constructed with an encoding used to convert Java strings to byte - * arrays and the other way around. You can use UTF-8 encoding, as it - * should not conflict with any control sequences and separator characters. - * - * @see FSA package Web - * site - */ -public final class DictionaryLookup implements IStemmer, Iterable { - /** An FSA used for lookups. */ - private final FSATraversal matcher; - - /** An iterator for walking along the final states of {@link #fsa}. */ - private final FSAFinalStatesIterator finalStatesIterator; - - /** FSA's root node. */ - private final int rootNode; - - /** Expand buffers and arrays by this constant. */ - private final static int EXPAND_SIZE = 10; - - /** Private internal array of reusable word data objects. */ - private WordData[] forms = new WordData[0]; - - /** A "view" over an array implementing */ - private ArrayViewList formsList = new ArrayViewList( - forms, 0, forms.length); - - /** - * Features of the compiled dictionary. - * - * @see DictionaryMetadata - */ - private final DictionaryMetadata dictionaryMetadata; - - /** - * Charset encoder for the FSA. - */ - private final CharsetEncoder encoder; - - /** - * Charset decoder for the FSA. - */ - private final CharsetDecoder decoder; - - /** - * The FSA we are using. - */ - private final FSA fsa; - - /** - * Internal reusable buffer for encoding words into byte arrays using - * {@link #encoder}. - */ - private ByteBuffer byteBuffer = ByteBuffer.allocate(0); - - /** - * Internal reusable buffer for encoding words into byte arrays using - * {@link #encoder}. - */ - private CharBuffer charBuffer = CharBuffer.allocate(0); - - /** - * Reusable match result. - */ - private final MatchResult matchResult = new MatchResult(); - - /** - * The {@link Dictionary} this lookup is using. 
- */ - private final Dictionary dictionary; - - /** - *

- * Creates a new object of this class using the given FSA for word lookups - * and encoding for converting characters to bytes. - * - * @throws IllegalArgumentException - * if FSA's root node cannot be acquired (dictionary is empty). - */ - public DictionaryLookup(Dictionary dictionary) - throws IllegalArgumentException { - this.dictionary = dictionary; - this.dictionaryMetadata = dictionary.metadata; - this.rootNode = dictionary.fsa.getRootNode(); - this.fsa = dictionary.fsa; - this.matcher = new FSATraversal(fsa); - this.finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode()); - - if (rootNode == 0) { - throw new IllegalArgumentException( - "Dictionary must have at least the root node."); - } - - if (dictionaryMetadata == null) { - throw new IllegalArgumentException( - "Dictionary metadata must not be null."); - } - - try { - Charset charset = Charset.forName(dictionaryMetadata.encoding); - encoder = charset.newEncoder(); - decoder = charset.newDecoder().onMalformedInput( - CodingErrorAction.REPORT).onUnmappableCharacter( - CodingErrorAction.REPORT); - } catch (UnsupportedCharsetException e) { - throw new RuntimeException( - "FSA's encoding charset is not supported: " - + dictionaryMetadata.encoding); - } - } - - /** - * Searches the automaton for a symbol sequence equal to word, - * followed by a separator. The result is a stem (decompressed accordingly - * to the dictionary's specification) and an optional tag data. - */ - public List lookup(CharSequence word) { - final byte separator = dictionaryMetadata.separator; - - // Encode word characters into bytes in the same encoding as the FSA's. - charBuffer.clear(); - charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length()); - for (int i = 0; i < word.length(); i++) - charBuffer.put(word.charAt(i)); - charBuffer.flip(); - byteBuffer = charsToBytes(charBuffer, byteBuffer); - - // Try to find a partial match in the dictionary. 
- final MatchResult match = matcher.match(matchResult, byteBuffer - .array(), 0, byteBuffer.remaining(), rootNode); - - if (match.kind == SEQUENCE_IS_A_PREFIX) { - /* - * The entire sequence exists in the dictionary. A separator should - * be the next symbol. - */ - final int arc = fsa.getArc(match.node, separator); - - /* - * The situation when the arc points to a final node should NEVER - * happen. After all, we want the word to have SOME base form. - */ - if (arc != 0 && !fsa.isArcFinal(arc)) { - // There is such a word in the dictionary. Return its base forms. - int formsCount = 0; - - finalStatesIterator.restartFrom(fsa.getEndNode(arc)); - while (finalStatesIterator.hasNext()) { - final ByteBuffer bb = finalStatesIterator.next(); - final byte[] ba = bb.array(); - final int bbSize = bb.remaining(); - - if (formsCount >= forms.length) { - forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE); - for (int k = 0; k < forms.length; k++) { - if (forms[k] == null) - forms[k] = new WordData(decoder); - } - } - - /* - * Now, expand the prefix/ suffix 'compression' and store - * the base form. - */ - final WordData wordData = forms[formsCount++]; - wordData.reset(); - - wordData.wordBuffer = byteBuffer; - wordData.wordCharSequence = word; - - /* - * Find the separator byte's position splitting word form - * and tag. - */ - int sepPos; - for (sepPos = 0; sepPos < bbSize; sepPos++) { - if (ba[sepPos] == separator) - break; - } - - /* - * Decode the stem into stem buffer. - */ - wordData.stemBuffer.clear(); - wordData.stemBuffer = decodeStem(wordData.stemBuffer, ba, - sepPos, byteBuffer, dictionaryMetadata); - wordData.stemBuffer.flip(); - - // Skip separator character. - sepPos++; - - /* - * Decode the tag data. 
- */ - wordData.tagBuffer = BufferUtils.ensureCapacity( - wordData.tagBuffer, bbSize - sepPos); - wordData.tagBuffer.clear(); - wordData.tagBuffer.put(ba, sepPos, bbSize - sepPos); - wordData.tagBuffer.flip(); - } - - formsList.wrap(forms, 0, formsCount); - return formsList; - } - } else { - /* - * this case is somewhat confusing: we should have hit the separator - * first... I don't really know how to deal with it at the time - * being. - */ - } - - return Collections.emptyList(); - } - - /** - * Decode the base form of an inflected word and save its decoded form into - * a byte buffer. - * - * @param bb - * The byte buffer to save the result to. A new buffer may be - * allocated if the capacity of bb is not large - * enough to store the result. The buffer is not flipped upon - * return. - * - * @param inflectedBuffer - * Inflected form's bytes (decoded properly). - * - * @param bytes - * Bytes of the encoded base form, starting at 0 index. - * - * @param len - * Length of the encode base form. - * - * @return Returns either bb or a new buffer whose capacity is - * large enough to store the output of the decoded data. - */ - public static ByteBuffer decodeStem(ByteBuffer bb, byte[] bytes, int len, - ByteBuffer inflectedBuffer, DictionaryMetadata metadata) { - bb.clear(); - - // Empty length? Weird, but return an empty buffer. - if (len == 0) { - return bb; - } - - // Determine inflected string's length in bytes, in the same encoding. - final byte[] infBytes = inflectedBuffer.array(); - final int infLen = inflectedBuffer.remaining(); - final int code0 = bytes[0] - 'A'; - - final boolean fsaPrefixes = metadata.usesPrefixes; - final boolean fsaInfixes = metadata.usesInfixes; - - // Increase buffer size, if needed. 
- if (bb.capacity() < infLen + len) { - bb = ByteBuffer.allocate(infLen + len); - } - - if (code0 >= 0) { - if (!fsaPrefixes && !fsaInfixes) { - if (code0 <= infLen) { - bb.put(infBytes, 0, infLen - code0); - bb.put(bytes, 1, len - 1); - return bb; - } - } else if (fsaPrefixes && !fsaInfixes) { - if (len > 1) { - final int stripAtEnd = bytes[1] - 'A' + code0; - if (stripAtEnd <= infLen) { - bb.put(infBytes, code0, infLen - stripAtEnd); - bb.put(bytes, 2, len - 2); - return bb; - } - } - } else if (fsaInfixes) { - // Note: Prefixes are silently assumed here. - if (len > 2) { - final int stripAtBeginning = bytes[1] - 'A' + code0; - final int stripAtEnd = bytes[2] - 'A' + stripAtBeginning; - if (stripAtEnd <= infLen) { - bb.put(infBytes, 0, code0); - bb.put(infBytes, stripAtBeginning, infLen - stripAtEnd); - bb.put(bytes, 3, len - 3); - return bb; - } - } - } - } - - /* - * This is a fallback in case some junk is detected above. Return the - * base form only if this is the case. - */ - bb.clear(); - bb.put(bytes, 0, len); - return bb; - } - - /** - * Encode a character sequence into a byte buffer, optionally expanding - * buffer. - */ - private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) { - bytes.clear(); - final int maxCapacity = (int) (chars.remaining() * encoder - .maxBytesPerChar()); - if (bytes.capacity() <= maxCapacity) { - bytes = ByteBuffer.allocate(maxCapacity); - } - - chars.mark(); - encoder.reset(); - encoder.encode(chars, bytes, true); - bytes.flip(); - chars.reset(); - - return bytes; - } - - /** - * Return an iterator over all {@link WordData} entries available in the - * embedded {@link Dictionary}. - */ - public Iterator iterator() { - return new DictionaryIterator(dictionary, decoder, true); - } - - /** - * @return Return the {@link Dictionary} used by this object. 
- */ - public Dictionary getDictionary() { - return dictionary; - } -} diff --git a/src/morfologik/stemming/DictionaryMetadata.java b/src/morfologik/stemming/DictionaryMetadata.java deleted file mode 100644 index ce5e507..0000000 --- a/src/morfologik/stemming/DictionaryMetadata.java +++ /dev/null @@ -1,122 +0,0 @@ -package morfologik.stemming; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.*; - -/** - * Description of attributes, their types and default values. - * - * @see Dictionary - */ -public final class DictionaryMetadata { - /** - * Attribute name for {@link #separator}. - */ - public final static String ATTR_NAME_SEPARATOR = "fsa.dict.separator"; - - /** - * Attribute name for {@link #encoding}. - */ - public final static String ATTR_NAME_ENCODING = "fsa.dict.encoding"; - - /** - * Attribute name for {@link #usesPrefixes}. - */ - public final static String ATTR_NAME_USES_PREFIXES = "fsa.dict.uses-prefixes"; - - /** - * Attribute name for {@link #usesInfixes}. - */ - public final static String ATTR_NAME_USES_INFIXES = "fsa.dict.uses-infixes"; - - /** - * A separator character between fields (stem, lemma, form). The character - * must be within byte range (FSA uses bytes internally). - */ - public final byte separator; - - /** - * Encoding used for converting bytes to characters and vice versa. - */ - public final String encoding; - - /** - * True if the dictionary was compiled with prefix compression. - */ - public final boolean usesPrefixes; - - /** - * True if the dictionary was compiled with infix compression. - */ - public final boolean usesInfixes; - - /** - * Other meta data not included above. - */ - public final Map metadata; - - /** - * Creates an immutable instance of {@link DictionaryMetadata}. 
- */ - public DictionaryMetadata(char separator, String encoding, - boolean usesPrefixes, boolean usesInfixes, - Map metadata) { - this.encoding = encoding; - this.usesPrefixes = usesPrefixes; - this.usesInfixes = usesInfixes; - - try { - final byte[] separatorBytes = new String(new char[] { separator }) - .getBytes(encoding); - if (separatorBytes.length != 1) { - throw new RuntimeException( - "Separator character '" - + separator - + "' must be a single byte after transformation with encoding: " - + encoding); - } - this.separator = separatorBytes[0]; - } catch (UnsupportedEncodingException e) { - throw new RuntimeException("Encoding not supported on this VM: " - + encoding); - } - - this.metadata = Collections - .unmodifiableMap(new HashMap(metadata)); - } - - /** - * Converts attributes in a {@link Map} to an instance of {@link Dictionary} - * , validating attribute values. - */ - static DictionaryMetadata fromMap(Properties properties) throws IOException { - final String separator = properties.getProperty(ATTR_NAME_SEPARATOR); - if (separator == null || separator.length() != 1) { - throw new IOException("Attribute " + ATTR_NAME_SEPARATOR - + " must be " + "a single character."); - } - - final String encoding = properties.getProperty(ATTR_NAME_ENCODING); - if (encoding == null || encoding.length() == 0) { - throw new IOException("Attribute " + ATTR_NAME_ENCODING - + " must be " + "present and non-empty."); - } - - final boolean usesPrefixes = Boolean.valueOf( - properties.getProperty(ATTR_NAME_USES_PREFIXES, "false")) - .booleanValue(); - - final boolean usesInfixes = Boolean.valueOf( - properties.getProperty(ATTR_NAME_USES_INFIXES, "false")) - .booleanValue(); - - final HashMap metadata = new HashMap(); - for (Map.Entry e : properties.entrySet()) { - metadata.put(e.getKey().toString(), e.getValue().toString()); - } - - return new DictionaryMetadata(separator.charAt(0), encoding, - usesPrefixes, usesInfixes, metadata); - } -} diff --git 
a/src/morfologik/stemming/IStemmer.java b/src/morfologik/stemming/IStemmer.java deleted file mode 100644 index 6e59526..0000000 --- a/src/morfologik/stemming/IStemmer.java +++ /dev/null @@ -1,20 +0,0 @@ -package morfologik.stemming; - -import java.util.List; - -/** - * A generic "stemmer" interface in Morfologik. - */ -public interface IStemmer { - /** - * Returns a list of {@link WordData} entries for a given word. The returned - * list is never null. Depending on the stemmer's - * implementation the {@link WordData} may carry the stem and additional - * information (tag) or just the stem. - *

- * The returned list and any object it contains are not usable after a - * subsequent call to this method. Any data that should be stored in between - * must be copied by the caller. - */ - public List lookup(CharSequence word); -} diff --git a/src/morfologik/stemming/PolishStemmer.java b/src/morfologik/stemming/PolishStemmer.java deleted file mode 100644 index 299f76a..0000000 --- a/src/morfologik/stemming/PolishStemmer.java +++ /dev/null @@ -1,43 +0,0 @@ -package morfologik.stemming; - -import java.util.Iterator; -import java.util.List; - -/** - * A dictionary-based stemmer for the Polish language. This stemmer requires an - * FSA-compiled dictionary to be present in classpath resources. - * - * Objects of this class are not thread safe. - * - * @see morfologik.stemming.DictionaryLookup - */ -public final class PolishStemmer implements IStemmer, Iterable { - /** - * Dictionary lookup delegate. - */ - private final DictionaryLookup delegate; - - /** - * This constructor is initialized with a built-in dictionary or fails with - * a runtime exception if the dictionary is not available. - */ - public PolishStemmer() { - final String languageCode = "pl"; - - this.delegate = new DictionaryLookup(Dictionary.getForLanguage(languageCode)); - } - - /** - * {@inheritDoc} - */ - public List lookup(CharSequence word) { - return delegate.lookup(word); - } - - /** - * Iterates over all dictionary forms stored in this stemmer. - */ - public Iterator iterator() { - return delegate.iterator(); - } -} diff --git a/src/morfologik/stemming/WordData.java b/src/morfologik/stemming/WordData.java deleted file mode 100644 index 4341bc4..0000000 --- a/src/morfologik/stemming/WordData.java +++ /dev/null @@ -1,247 +0,0 @@ -package morfologik.stemming; - -import java.io.UnsupportedEncodingException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.*; - -import morfologik.util.BufferUtils; - -/** - * Stem and tag data associated with a given word. - * - *

- * Important notes: - *

    - *
  • Objects of this class are volatile (their content changes on - * subsequent calls to {@link DictionaryLookup} class. If you need a copy of the - * stem or tag data for a given word, you have to create a custom buffer - * yourself and copy the associated data, perform {@link #clone()} or create - * strings (they are immutable) using {@link #getStem()} and then - * {@link CharSequence#toString()}.
  • - *
  • Objects of this class must not be used in any Java collections. In fact - * both equals and hashCode methods are overridden and throw exceptions to - * prevent accidental damage.
  • - *
- */ -public final class WordData implements Cloneable { - /** - * Error information if somebody puts us in a Java collection. - */ - private static final String COLLECTIONS_ERROR_MESSAGE = "Not suitable for use" - + " in Java collections framework (volatile content). Refer to documentation."; - - /** Character encoding in internal buffers. */ - private final CharsetDecoder decoder; - - /** - * Inflected word form data. - */ - CharSequence wordCharSequence; - - /** - * Character sequence after converting {@link #stemBuffer} using - * {@link #decoder}. - */ - private CharBuffer stemCharSequence; - - /** - * Character sequence after converting {@link #tagBuffer} using - * {@link #decoder}. - */ - private CharBuffer tagCharSequence; - - /** Byte buffer holding the inflected word form data. */ - ByteBuffer wordBuffer; - - /** Byte buffer holding stem data. */ - ByteBuffer stemBuffer; - - /** Byte buffer holding tag data. */ - ByteBuffer tagBuffer; - - /** - * Package scope constructor. - */ - WordData(CharsetDecoder decoder) { - this.decoder = decoder; - - stemBuffer = ByteBuffer.allocate(0); - tagBuffer = ByteBuffer.allocate(0); - stemCharSequence = CharBuffer.allocate(0); - tagCharSequence = CharBuffer.allocate(0); - } - - /** - * A constructor for tests only. - */ - WordData(String stem, String tag, String encoding) { - this(Charset.forName(encoding).newDecoder()); - - try { - if (stem != null) - stemBuffer.put(stem.getBytes(encoding)); - if (tag != null) - tagBuffer.put(tag.getBytes(encoding)); - } catch (UnsupportedEncodingException e) { - throw new RuntimeException(e); - } - } - - /** - * Copy the stem's binary data (no charset decoding) to a custom byte - * buffer. If the buffer is null or not large enough to hold the result, a - * new buffer is allocated. - * - * @param target - * Target byte buffer to copy the stem buffer to or - * null if a new buffer should be allocated. - * - * @return Returns target or the new reallocated buffer. 
- */ - public ByteBuffer getStemBytes(ByteBuffer target) { - target = BufferUtils.ensureCapacity(target, stemBuffer.remaining()); - stemBuffer.mark(); - target.put(stemBuffer); - stemBuffer.reset(); - target.flip(); - return target; - } - - /** - * Copy the tag's binary data (no charset decoding) to a custom byte buffer. - * If the buffer is null or not large enough to hold the result, a new - * buffer is allocated. - * - * @param target - * Target byte buffer to copy the tag buffer to or - * null if a new buffer should be allocated. - * - * @return Returns target or the new reallocated buffer. - */ - public ByteBuffer getTagBytes(ByteBuffer target) { - target = BufferUtils.ensureCapacity(target, tagBuffer.remaining()); - tagBuffer.mark(); - target.put(tagBuffer); - tagBuffer.reset(); - target.flip(); - return target; - } - - /** - * Copy the inflected word's binary data (no charset decoding) to a custom - * byte buffer. If the buffer is null or not large enough to hold the - * result, a new buffer is allocated. - * - * @param target - * Target byte buffer to copy the word buffer to or - * null if a new buffer should be allocated. - * - * @return Returns target or the new reallocated buffer. - */ - public ByteBuffer getWordBytes(ByteBuffer target) { - target = BufferUtils.ensureCapacity(target, wordBuffer.remaining()); - wordBuffer.mark(); - target.put(wordBuffer); - wordBuffer.reset(); - target.flip(); - return target; - } - - /** - * @return Return tag data decoded to a character sequence or - * null if no associated tag data exists. - */ - public CharSequence getTag() { - tagCharSequence = decode(tagBuffer, tagCharSequence); - return tagCharSequence.remaining() == 0 ? null : tagCharSequence; - } - - /** - * @return Return stem data decoded to a character sequence or - * null if no associated stem data exists. - */ - public CharSequence getStem() { - stemCharSequence = decode(stemBuffer, stemCharSequence); - return stemCharSequence.remaining() == 0 ? 
null : stemCharSequence; - } - - /** - * @return Return inflected word form data. Usually the parameter passed to - * {@link DictionaryLookup#lookup(CharSequence)}. - */ - public CharSequence getWord() { - return wordCharSequence; - } - - /* - * - */ - @Override - public boolean equals(Object obj) { - throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE); - } - - /* - * - */ - @Override - public int hashCode() { - throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE); - } - - /** - * Declare a covariant of {@link Object#clone()} that returns a deep copy of - * this object. The content of all internal buffers is copied. - */ - @Override - protected WordData clone() { - final WordData clone = new WordData(this.decoder); - clone.wordCharSequence = cloneCharSequence(wordCharSequence); - clone.wordBuffer = getWordBytes(null); - clone.stemBuffer = getStemBytes(null); - clone.tagBuffer = getTagBytes(null); - return clone; - } - - /** - * Clone char sequences only if not immutable. - */ - private CharSequence cloneCharSequence(CharSequence chs) { - if (chs instanceof String) - return chs; - return chs.toString(); - } - - /** - * Reset internal structures for storing another word's data. - */ - void reset() { - this.wordCharSequence = null; - this.wordBuffer = null; - this.stemCharSequence.clear(); - this.tagCharSequence.clear(); - this.stemBuffer.clear(); - this.tagBuffer.clear(); - } - - /** - * Decode byte buffer, optionally expanding the char buffer to. 
- */ - private CharBuffer decode(ByteBuffer bytes, CharBuffer chars) { - chars.clear(); - final int maxCapacity = (int) (bytes.remaining() * decoder - .maxCharsPerByte()); - if (chars.capacity() <= maxCapacity) { - chars = CharBuffer.allocate(maxCapacity); - } - - bytes.mark(); - decoder.reset(); - decoder.decode(bytes, chars, true); - chars.flip(); - bytes.reset(); - - return chars; - } -} diff --git a/src/morfologik/tools/FSABuildTool.java b/src/morfologik/tools/FSABuildTool.java deleted file mode 100644 index 17ff102..0000000 --- a/src/morfologik/tools/FSABuildTool.java +++ /dev/null @@ -1,486 +0,0 @@ -package morfologik.tools; - -import java.io.*; -import java.util.*; - -import morfologik.fsa.*; - -import org.apache.commons.cli.*; - -import com.carrotsearch.hppc.IntIntOpenHashMap; -import com.carrotsearch.hppc.cursors.IntIntCursor; - -/** - * Convert from plain text input to a serialized FSA in any of the - * available {@link Format}s. - */ -public final class FSABuildTool extends Tool { - /** - * One megabyte. - */ - private final static int MB = 1024 * 1024; - - /** - * The serialization format to use for the binary output. - */ - public enum Format { - FSA5, - CFSA2; - - public FSASerializer getSerializer() { - switch (this) { - case FSA5: - return new FSA5Serializer(); - - case CFSA2: - return new CFSA2Serializer(); - - default: - throw new RuntimeException(); - } - } - } - - /** - * Be more verbose about progress. - */ - private boolean printProgress; - - /** - * Serializer used for emitting the FSA. - */ - private FSASerializer serializer; - - /** - * Output format name. - */ - private Format format; - - /** - * Warn about CR characters in the input (usually not what you want). - */ - private boolean crWarning = false; - - /** - * If true, the input is not buffered and sorted in-memory, but - * must be sorted externally (using the "C" convention: unsigned byte values). 
- */ - private boolean inputSorted; - - /** - * Print additional statistics about the output automaton. - */ - private boolean statistics; - - /** - * The actual construction of the FSA. - */ - private FSABuilder builder = new FSABuilder(); - - /** - * Start time. - */ - private long start = System.currentTimeMillis(); - - private IMessageLogger logger; - - /** - * Gets fed with the lines read from the input. - */ - private static interface LineConsumer { - /** - * Process the buffer, return the same buffer or a new buffer (for - * swapping). - */ - byte[] process(byte[] buffer, int pos); - } - - /** - * To help break out of the anonymous delegate on error. - */ - @SuppressWarnings("serial") - private static class TerminateProgramException extends RuntimeException { - public TerminateProgramException(String msg) { - super(msg); - } - - public synchronized Throwable fillInStackTrace() { - return null; - } - } - - /** - * Command line entry point after parsing arguments. - */ - protected void go(CommandLine line) throws Exception { - String[] args = line.getArgs(); - if (args.length != 0) { - printUsage(); - return; - } - - // Parse the input options. 
- parseOptions(line); - - logger = new WriterMessageLogger(new PrintWriter(System.err)); - this.serializer.withLogger(logger); - - try { - InputStream inputStream = initializeInput(line); - - if (inputSorted) { - logger.log("Assuming input is already sorted"); - } - - final FSA fsa; - if (inputSorted) { - fsa = processSortedInput(inputStream); - } else { - fsa = processUnsortedInput(inputStream); - } - if (crWarning) logger.log("Warning: input contained carriage returns?"); - - if (statistics) { - logger.startPart("Statistics"); - FSAInfo info = new FSAInfo(fsa); - TreeMap fanout = FSAUtils.calculateFanOuts(fsa, fsa.getRootNode()); - logger.endPart(); - - final IntIntOpenHashMap numbers = new IntIntOpenHashMap(); - fsa.visitInPostOrder(new StateVisitor() { - public boolean accept(int state) { - int thisNodeNumber = 0; - for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { - thisNodeNumber += - (fsa.isArcFinal(arc) ? 1 : 0) + - (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc))); - } - numbers.put(state, thisNodeNumber); - return true; - } - }); - - int singleRLC = 0; - for (IntIntCursor c : numbers) { - if (c.value == 1) singleRLC++; - } - - logger.log("Nodes", info.nodeCount); - logger.log("Arcs", info.arcsCount); - logger.log("Tail nodes", singleRLC); - - logger.log("States with the given # of outgoing arcs:"); - for (Map.Entry e : fanout.entrySet()) { - logger.log(" #" + e.getKey(), e.getValue()); - } - - logger.log("FSA builder properties:"); - for (Map.Entry e : builder.getInfo().entrySet()) { - logger.log(e.getKey().toString(), e.getValue()); - } - } - - // Save the result. - logger.startPart("Serializing " + format); - serializer.serialize(fsa, initializeOutput(line)).close(); - logger.endPart(); - } catch (OutOfMemoryError e) { - logger.log("Error: Out of memory. Pass -Xmx1024m argument (or more) to java."); - } - } - - /** - * Process unsorted input (sort and construct FSA). 
- */ - private FSA processUnsortedInput(InputStream inputStream) - throws IOException { - final FSA root; - logger.startPart("Reading input"); - final ArrayList input = readInput(inputStream); - logger.endPart(); - - logger.log("Input sequences", input.size()); - - logger.startPart("Sorting"); - Collections.sort(input, FSABuilder.LEXICAL_ORDERING); - logger.endPart(); - - logger.startPart("Building FSA"); - for (byte [] bb : input) - builder.add(bb, 0, bb.length); - root = builder.complete(); - logger.endPart(); - return root; - } - - /** - * - */ - private FSA processSortedInput(InputStream inputStream) - throws IOException { - - int lines = forAllLines(inputStream, new LineConsumer() { - private byte [] current; - private byte [] previous = null; - private int previousLen; - private int line; - - public byte[] process(byte[] current, int currentLen) { - line++; - - // Verify the order. - if (previous != null) { - if (FSABuilder.compare(previous, 0, previousLen, current, 0, currentLen) > 0) { - logger.log("\n\nERROR: The input is not sorted: \n" + - dumpLine(previous, previousLen) + "\n" + - dumpLine(current, currentLen)); - throw new TerminateProgramException("Input is not sorted."); - } - } - - // Add to the automaton. - builder.add(current, 0, currentLen); - - // Swap buffers. - this.current = previous != null ? previous : new byte [current.length]; - this.previous = current; - this.previousLen = currentLen; - - return this.current; - } - }); - - logger.startPart("Building FSA"); - FSA fsa = builder.complete(); - logger.endPart(); - logger.log("Input sequences", lines); - - return fsa; - } - - /** - * Dump input line, byte-by-byte. 
- */ - protected String dumpLine(byte[] line, int length) { - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < length; i++) { - if (i > 0) builder.append(" "); - builder.append(String.format("%02x", line[i])); - } - builder.append(" | "); - for (int i = 0; i < length; i++) { - if (Character.isLetterOrDigit(line[i])) - builder.append((char) line[i]); - else - builder.append("."); - } - return builder.toString(); - } - - /** - * Parse input options. - */ - private void parseOptions(CommandLine line) { - String opt; - - opt = SharedOptions.outputFormatOption.getOpt(); - if (line.hasOption(opt)) { - String formatValue = line.getOptionValue(opt); - try { - format = Format.valueOf(formatValue.toUpperCase()); - } catch (IllegalArgumentException e) { - throw new TerminateProgramException("Not a valid format: " - + formatValue); - } - } else { - format = Format.FSA5; - } - serializer = format.getSerializer(); - - opt = SharedOptions.fillerCharacterOption.getLongOpt(); - if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) { - String chr = line.getOptionValue(opt); - checkSingleByte(chr); - serializer.withFiller(chr.getBytes()[0]); - } - - opt = SharedOptions.annotationSeparatorCharacterOption.getLongOpt(); - if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.SEPARATORS)) { - String chr = line.getOptionValue(opt); - checkSingleByte(chr); - serializer.withAnnotationSeparator(chr.getBytes()[0]); - } - - opt = SharedOptions.withNumbersOption.getOpt(); - if (line.hasOption(opt) && requiredCapability(opt, FSAFlags.NUMBERS)) { - serializer.withNumbers(); - } - - opt = SharedOptions.progressOption.getLongOpt(); - if (line.hasOption(opt)) { - printProgress = true; - } - - opt = SharedOptions.inputSortedOption.getLongOpt(); - if (line.hasOption(opt)) { - inputSorted = true; - } - - opt = SharedOptions.statistics.getLongOpt(); - if (line.hasOption(opt)) { - statistics = true; - } - } - - private boolean requiredCapability(String opt, 
FSAFlags flag) { - if (!serializer.getFlags().contains(flag)) { - throw new RuntimeException("This serializer does not support option: " + opt); - } - return true; - } - - /** - * Check if the argument is a single byte after conversion using platform-default - * encoding. - */ - public static void checkSingleByte(String chr) { - if (chr.getBytes().length == 1) - return; - - throw new IllegalArgumentException("Filler and annotation characters must be single" + - "-byte values, " + chr + " has " + chr.getBytes().length + " bytes."); - } - - /** - * Read all the input lines, unsorted. - */ - private ArrayList readInput(InputStream is) throws IOException { - final ArrayList result = new ArrayList(); - forAllLines(is, new LineConsumer() { - public byte[] process(byte[] buffer, int pos) { - result.add(java.util.Arrays.copyOf(buffer, pos)); - return buffer; - } - }); - return result; - } - - /** - * Apply line consumer to all non-empty lines. - */ - private int forAllLines(InputStream is, LineConsumer lineConsumer) throws IOException { - int lines = 0; - byte[] buffer = new byte[0]; - int line = 0, b, pos = 0; - while ((b = is.read()) != -1) { - if (b == '\r' && !crWarning) { - crWarning = true; - } - - if (b == '\n') { - if (pos > 0) { - buffer = lineConsumer.process(buffer, pos); - pos = 0; - lines++; - } - - if (printProgress && line++ > 0 && (line % 1000000) == 0) { - logger.log(String.format(Locale.ENGLISH, "%6.2fs, sequences: %d", elapsedTime(), line)); - } - } else { - if (pos >= buffer.length) { - buffer = java.util.Arrays.copyOf(buffer, buffer.length + 10); - } - buffer[pos++] = (byte) b; - } - } - - if (pos > 0) { - lineConsumer.process(buffer, pos); - lines++; - } - - return lines; - } - - private double elapsedTime() { - return (System.currentTimeMillis() - start) / 1000.0d; - } - - @Override - protected void printUsage() { - final HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp(this.getClass().getName(), options, true); - } - - @Override 
- protected void initializeOptions(Options options) { - options.addOption(SharedOptions.inputFileOption); - options.addOption(SharedOptions.outputFileOption); - - options.addOption(SharedOptions.outputFormatOption); - - options.addOption(SharedOptions.fillerCharacterOption); - options.addOption(SharedOptions.annotationSeparatorCharacterOption); - - options.addOption(SharedOptions.withNumbersOption); - options.addOption(SharedOptions.progressOption); - - options.addOption(SharedOptions.inputSortedOption); - - options.addOption(SharedOptions.statistics); - } - - /** - * - */ - private static OutputStream initializeOutput(CommandLine line) - throws IOException, ParseException { - final OutputStream output; - final String opt = SharedOptions.outputFileOption.getOpt(); - if (line.hasOption(opt)) { - // Use output file. - output = new FileOutputStream((File) line.getParsedOptionValue(opt)); - } else { - // Use standard output. - output = System.out; - } - return new BufferedOutputStream(output); - } - - /** - * - */ - private InputStream initializeInput(CommandLine line) - throws IOException, ParseException { - final InputStream input; - final String opt = SharedOptions.inputFileOption.getOpt(); - - if (line.hasOption(opt)) { - // Use input file. - File inputFile = (File) line.getParsedOptionValue(opt); - if (!inputSorted && inputFile.length() > 20 * MB) { - logger.log("WARN: The input file is quite large, avoid\n" + - " in-memory sorting by piping pre-sorted\n" + - " input directly to fsa_build. Linux:\n" + - " export LC_ALL=C && \\\n" + - " sort input | \\\n" + - " java -jar morfologik.jar fsa_build --sorted -o dict.fsa"); - } - - input = new FileInputStream(inputFile); - } else { - // Use standard input. - input = System.in; - } - return new BufferedInputStream(input); - } - - /** - * Command line entry point. 
- */ - public static void main(String[] args) throws Exception { - final FSABuildTool tool = new FSABuildTool(); - tool.go(args); - } -} \ No newline at end of file diff --git a/src/morfologik/tools/FSADumpTool.java b/src/morfologik/tools/FSADumpTool.java deleted file mode 100644 index 281d187..0000000 --- a/src/morfologik/tools/FSADumpTool.java +++ /dev/null @@ -1,286 +0,0 @@ -package morfologik.tools; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.util.Locale; -import java.util.Map; - -import morfologik.fsa.*; -import morfologik.stemming.*; -import morfologik.util.FileUtils; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.Options; - -/** - * This utility will dump the information and contents of a given {@link FSA} - * dictionary. It can dump dictionaries in the raw form (as fed to the - * fsa_build program) or decoding compressed stem forms. - */ -public final class FSADumpTool extends Tool { - /** - * Writer used to print messages and dictionary dump. - */ - private OutputStream os; - - /** - * Print raw data only, no headers. - */ - private boolean dataOnly; - - /** - * Decode from prefix/infix/suffix encodings. - */ - private boolean decode; - - /** - * Dump graphviz DOT file instead of automaton sequences. - */ - private boolean dot; - - /** - * Command line entry point after parsing arguments. - */ - protected void go(CommandLine line) throws Exception { - final File dictionaryFile = (File) line - .getParsedOptionValue(SharedOptions.fsaDictionaryFileOption - .getOpt()); - - dataOnly = line.hasOption(SharedOptions.dataOnly.getOpt()); - decode = line.hasOption(SharedOptions.decode.getOpt()); - dot = line.hasOption(SharedOptions.dot.getLongOpt()); - - FileUtils.assertExists(dictionaryFile, true, false); - - dump(dictionaryFile); - } - - /** - * Dumps the content of a dictionary to a file. 
- */ - private void dump(File dictionaryFile) - throws UnsupportedEncodingException, IOException { - final long start = System.currentTimeMillis(); - - final Dictionary dictionary; - final FSA fsa; - - if (!dictionaryFile.canRead()) { - printWarning("Dictionary file does not exist: " - + dictionaryFile.getAbsolutePath()); - return; - } - - this.os = new BufferedOutputStream(System.out, 1024 * 32); - - if (hasMetadata(dictionaryFile)) { - dictionary = Dictionary.read(dictionaryFile); - fsa = dictionary.fsa; - - final String encoding = dictionary.metadata.encoding; - if (!Charset.isSupported(encoding)) { - printWarning("Dictionary's charset is not supported " - + "on this JVM: " + encoding); - return; - } - } else { - dictionary = null; - fsa = FSA.read(new FileInputStream(dictionaryFile)); - printWarning("Warning: FSA automaton without metadata file."); - } - - printExtra("FSA properties"); - printExtra("--------------------"); - printExtra("FSA implementation : " + fsa.getClass().getName()); - printExtra("Compiled with flags : " + fsa.getFlags().toString()); - - if (!dataOnly) { - final FSAInfo info = new FSAInfo(fsa); - printExtra("Number of arcs : " - + info.arcsCount + "/" + info.arcsCountTotal); - printExtra("Number of nodes : " + info.nodeCount); - printExtra("Number of final st. : " + info.finalStatesCount); - printExtra(""); - } - - // Separator for dumping. 
- char separator = '\t'; - - if (fsa instanceof FSA5) { - printExtra("FSA5 properties"); - printExtra("--------------------"); - printFSA5((FSA5) fsa); - printExtra(""); - } - - if (dictionary != null) { - printExtra("Dictionary metadata"); - printExtra("--------------------"); - printExtra("Encoding : " + dictionary.metadata.encoding); - printExtra("Separator byte : 0x" - + Integer.toHexString(dictionary.metadata.separator) - + " ('" + decodeSeparator(dictionary) + "')"); - printExtra("Uses prefixes : " - + dictionary.metadata.usesPrefixes); - printExtra("Uses infixes : " - + dictionary.metadata.usesInfixes); - printExtra(""); - - printExtra("Dictionary metadata (all keys)"); - printExtra("---------------------------------"); - - for (Map.Entry e : dictionary.metadata.metadata - .entrySet()) { - printExtra(String - .format("%-27s : %s", e.getKey(), e.getValue())); - } - printExtra(""); - } - - int sequences = 0; - if (decode) { - if (dictionary == null) { - printWarning("No dictionary metadata available."); - return; - } - - printExtra("Decoded FSA data (in the encoding above)"); - printExtra("----------------------------------------"); - - final DictionaryLookup dl = new DictionaryLookup(dictionary); - final StringBuilder builder = new StringBuilder(); - final OutputStreamWriter osw = new OutputStreamWriter(os, - dictionary.metadata.encoding); - - CharSequence t; - for (WordData wd : dl) { - builder.setLength(0); - builder.append(wd.getWord()); - builder.append(separator); - - t = wd.getStem(); - if (t == null) - t = ""; - builder.append(t); - builder.append(separator); - - t = wd.getTag(); - if (t == null) - t = ""; - builder.append(t); - builder.append('\n'); - - osw.write(builder.toString()); - sequences++; - } - osw.flush(); - } else { - if (dot) { - Writer w = new OutputStreamWriter(os); - FSAUtils.toDot(w, fsa, fsa.getRootNode()); - w.flush(); - } else { - printExtra("FSA data (raw bytes in the encoding above)"); - 
printExtra("------------------------------------------"); - - for (ByteBuffer bb : fsa) { - os.write(bb.array(), 0, bb.remaining()); - os.write(0x0a); - sequences++; - } - } - } - - printExtra("--------------------"); - - final long millis = Math.max(1, System.currentTimeMillis() - start); - printExtra(String - .format( - Locale.ENGLISH, - "Dictionary dumped in %.3f second(s), %d sequences (%d sequences/sec.).", - millis / 1000.0, sequences, - (int) (sequences / (millis / 1000.0)))); - - os.flush(); - } - - /** - * Print {@link FSA5}-specific stuff. - */ - private void printFSA5(FSA5 fsa) throws IOException { - printExtra("GTL : " + fsa.gtl); - printExtra("Node extra data : " + fsa.nodeDataLength); - printExtra("Annotation separator: " + byteAsChar(fsa.annotation)); - printExtra("Filler character : " + byteAsChar(fsa.filler)); - } - - /** - * Convert a byte to a character, no charset decoding, simple ASCII range mapping. - */ - private char byteAsChar(byte v) { - char chr = (char) (v & 0xff); - if (chr < 127) - return chr; - else - return '?'; - } - - /* - * - */ - private void printExtra(String msg) throws IOException { - if (dataOnly) - return; - os.write(msg.getBytes()); - os.write(0x0a); - } - - /* - * - */ - private void printWarning(String msg) { - System.err.println(msg); - } - - /* - * - */ - private String decodeSeparator(Dictionary dictionary) { - try { - return new String(new byte[] { dictionary.metadata.separator }, - dictionary.metadata.encoding); - } catch (UnsupportedEncodingException e) { - return ""; - } - } - - /** - * Check if there is a metadata file for the given FSA automaton. - */ - private static boolean hasMetadata(File fsaFile) { - final File featuresFile = new File(fsaFile.getParent(), Dictionary - .getExpectedFeaturesName(fsaFile.getName())); - - return featuresFile.canRead(); - } - - /** - * Command line options for the tool. 
- */ - protected void initializeOptions(Options options) { - options.addOption(SharedOptions.fsaDictionaryFileOption); - options.addOption(SharedOptions.dataOnly); - options.addOption(SharedOptions.decode); - options.addOption(SharedOptions.dot); - } - - /** - * Command line entry point. - */ - public static void main(String[] args) throws Exception { - final FSADumpTool fsaDump = new FSADumpTool(); - fsaDump.go(args); - } -} \ No newline at end of file diff --git a/src/morfologik/tools/IMessageLogger.java b/src/morfologik/tools/IMessageLogger.java deleted file mode 100644 index 14f9f00..0000000 --- a/src/morfologik/tools/IMessageLogger.java +++ /dev/null @@ -1,25 +0,0 @@ -package morfologik.tools; - -public interface IMessageLogger { - - /** - * Log progress to the console. - */ - public void log(String msg); - - /** - * Log message header and save current time. - */ - public void startPart(String header); - - /** - * - */ - public void endPart(); - - /** - * Log a two-part message. - */ - public void log(String header, Object v); - -} \ No newline at end of file diff --git a/src/morfologik/tools/InflectionFramesTool.java b/src/morfologik/tools/InflectionFramesTool.java deleted file mode 100644 index 612f62c..0000000 --- a/src/morfologik/tools/InflectionFramesTool.java +++ /dev/null @@ -1,118 +0,0 @@ -package morfologik.tools; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.charset.*; -import java.util.*; -import java.util.Map.Entry; - -import morfologik.stemming.*; -import morfologik.stemming.Dictionary; - -import org.junit.Test; - -/** - * Calculate inflection frames from the Polish dictionary. 
- */ -public class InflectionFramesTool { - public static void main(String[] args) throws IOException { - new InflectionFramesTool().inflectionFrames(); - } - - /* */ - @Test - @SuppressWarnings( { "unused" }) - public void inflectionFrames() throws IOException { - final Dictionary pl = Dictionary.getForLanguage("pl"); - final DictionaryLookup dict = new DictionaryLookup(pl); - - final CharsetDecoder decoder = Charset.forName(pl.metadata.encoding) - .newDecoder().onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - - final HashMap> forms = - new HashMap>(); - - ByteBuffer stemBuffer = ByteBuffer.allocate(0); - ByteBuffer inflBuffer = ByteBuffer.allocate(0); - ByteBuffer stemDecoded = ByteBuffer.allocate(0); - - int limit = Integer.MAX_VALUE; - - final Iterator i = new DictionaryIterator(pl, decoder, false); - while (i.hasNext() && limit-- > 0) { - final WordData wd = i.next(); - - final CharSequence inflected = wd.getWord(); - final CharSequence stemEncoded = wd.getStem(); - final CharSequence tag = wd.getTag(); - if (tag == null) - continue; - - inflBuffer.clear(); - inflBuffer = wd.getWordBytes(inflBuffer); - - stemBuffer.clear(); - stemBuffer = wd.getStemBytes(stemBuffer); - - stemDecoded = DictionaryLookup.decodeStem(stemDecoded, stemBuffer - .array(), stemBuffer.remaining(), inflBuffer, pl.metadata); - stemDecoded.flip(); - - final String stem = decoder.decode(stemDecoded).toString(); - final String form = tag.toString().intern(); - - ArrayList frames = forms.get(stem); - if (frames == null) { - forms.put(stem, frames = new ArrayList()); - } - - if (!frames.contains(form)) { - frames.add(form); - } - } - - // Sort the forms so that we get a unique key. Then iteratively add them - // to another hash (by form this time). 
- final HashMap> frames = - new HashMap>(); - - StringBuilder key = new StringBuilder(); - for (Map.Entry> e : forms.entrySet()) { - Collections.sort(e.getValue()); - - key.setLength(0); - for (String s : e.getValue()) - key.append(s).append(" "); - - final String k = key.toString(); - ArrayList words = frames.get(k); - if (words == null) { - frames.put(k, words = new ArrayList()); - } - words.add(e.getKey()); - - e.setValue(null); - } - - // Print inflection frames. - ArrayList>> entries = - new ArrayList>>(); - - entries.addAll(frames.entrySet()); - Collections.sort(entries, - new Comparator>>() { - public int compare(Entry> o1, - Entry> o2) { - return o2.getValue().size() - o1.getValue().size(); - } - }); - - for (Map.Entry> e : entries) { - System.out.println(String.format("%6d %s %s", - e.getValue().size(), e.getKey(), e.getValue())); - } - - System.out.println("Total frames: " + frames.size()); - } -} diff --git a/src/morfologik/tools/Launcher.java b/src/morfologik/tools/Launcher.java deleted file mode 100644 index 667a6e1..0000000 --- a/src/morfologik/tools/Launcher.java +++ /dev/null @@ -1,159 +0,0 @@ -package morfologik.tools; - -import java.io.IOException; -import java.io.InputStream; -import java.lang.reflect.Method; -import java.net.URL; -import java.util.Enumeration; -import java.util.Iterator; -import java.util.TreeMap; -import java.util.jar.Manifest; - -import morfologik.util.FileUtils; - -/** - * A launcher for other command-line tools. - */ -public final class Launcher { - /** - * Tool description. - */ - final static class ToolInfo { - public final Class clazz; - public final String info; - - public ToolInfo(Class clazz, String info) { - this.clazz = clazz; - this.info = info; - } - - public void invoke(String[] subArgs) throws Exception { - final Method m = clazz.getMethod("main", - new Class[] { String[].class }); - m.invoke(null, new Object[] { subArgs }); - } - } - - /** - * Command line entry point. 
- */ - public static void main(String[] args) throws Exception { - // If so, tools are unavailable and a classpath error has been logged. - final TreeMap tools = initTools(); - - if (tools == null) - { - return; - } - - if (args.length == 0) { - System.out - .println("Provide tool name and its command-line options. " - + "Available tools:"); - for (String key : tools.keySet()) { - final ToolInfo toolInfo = tools.get(key); - System.out.println(String.format(" %-10s - %s", key, - toolInfo.info)); - } - } else { - final String toolName = args[0]; - if (!tools.containsKey(toolName)) { - System.out.println("Unknown tool: " + toolName); - return; - } - - final String[] subArgs = new String[args.length - 1]; - System.arraycopy(args, 1, subArgs, 0, subArgs.length); - - final ToolInfo toolInfo = (ToolInfo) tools.get(toolName); - toolInfo.invoke(subArgs); - } - } - - /** - * Initialize and check tools' availability. - */ - static TreeMap initTools() { - TreeMap tools = new TreeMap(); - - tools.put("fsa_build", new ToolInfo(FSABuildTool.class, - "Create an automaton from plain text files.")); - - tools.put("fsa_dump", new ToolInfo(FSADumpTool.class, - "Dump an FSA dictionary.")); - - tools.put("tab2morph", new ToolInfo(MorphEncodingTool.class, - "Convert tabbed dictionary to fsa encoding format.")); - - tools.put("plstem", new ToolInfo(PolishStemmingTool.class, - "Apply Polish dictionary stemming to the input.")); - - // Prune unavailable tools. - for (Iterator i = tools.values().iterator(); i.hasNext();) { - ToolInfo ti = i.next(); - try { - ti.clazz.newInstance().isAvailable(); - } catch (NoClassDefFoundError e) { - logJarWarning(); - return null; - } catch (Throwable e) { - System.out.println("Tools could not be initialized because" + - " of an exception during initialization: " - + e.getClass().getName() + ", " + e.getMessage()); - return null; - } - } - - return tools; - } - - /** - * Log a warning about missing JAR dependencies. 
- */ - private static void logJarWarning() { - System.out.println("Tools are unavailable, at least one JAR dependency missing."); - - try { - final Class clazz = Launcher.class; - final ClassLoader classLoader = clazz.getClassLoader(); - - final String clazzName = clazz.getName().replace('.', '/') + ".class"; - // Figure out our own class path location. - final URL launcherLocation = classLoader.getResource(clazzName); - if (launcherLocation == null) - return; - - String launcherPrefix = launcherLocation.toString() - .replace(clazzName, ""); - - // Figure our our location's MANIFEST.MF (class loader may be hitting a few). - URL manifestResource = null; - Enumeration manifests = classLoader.getResources("META-INF/MANIFEST.MF"); - while (manifests.hasMoreElements()) - { - URL candidate = manifests.nextElement(); - if (candidate.toString().startsWith(launcherPrefix)) - { - manifestResource = candidate; - break; - } - } - - if (manifestResource == null) - return; - - InputStream stream = null; - try { - stream = manifestResource.openStream(); - Manifest manifest = new Manifest(stream); - - System.out.println("Required JARs: " - + manifest.getMainAttributes().getValue("Class-Path")); - } catch (IOException e) { - FileUtils.close(stream); - } - } catch (IOException e) { - // Ignore. - } - } -} diff --git a/src/morfologik/tools/MorphEncoder.java b/src/morfologik/tools/MorphEncoder.java deleted file mode 100644 index 236c1aa..0000000 --- a/src/morfologik/tools/MorphEncoder.java +++ /dev/null @@ -1,399 +0,0 @@ -package morfologik.tools; - -import java.io.UnsupportedEncodingException; - -import morfologik.fsa.FSA5; - -/** - * A class that converts tabular data to fsa morphological format. Three formats - * are supported: - *
    - *
  • standard, see {@link #standardEncode}
  • - *
  • prefix, see {@link #prefixEncode}
  • - *
  • infix, see {@link #infixEncode}
  • - *
- */ -public final class MorphEncoder { - private final byte annotationSeparator; - - private static final int MAX_PREFIX_LEN = 3; - private static final int MAX_INFIX_LEN = 3; - - private static final String UTF8 = "UTF-8"; - - public MorphEncoder() { - this(FSA5.DEFAULT_ANNOTATION); - } - - public MorphEncoder(byte annotationSeparator) { - this.annotationSeparator = annotationSeparator; - } - - public static int commonPrefix(final byte[] s1, final byte[] s2) { - final int maxLen = Math.min(s1.length, s2.length); - for (int i = 0; i < maxLen; i++) { - if (s1[i] != s2[i]) { - return i; - } - } - return maxLen; - } - - private static byte[] subsequence(final byte[] bytes, final int start) { - final byte[] newArray = new byte[bytes.length - start]; - System.arraycopy(bytes, start, newArray, 0, bytes.length - start); - return newArray; - } - - private static int copyTo(byte[] dst, final int pos, final byte[] src) { - System.arraycopy(src, 0, dst, pos, src.length); - return src.length; - } - - private static int copyTo(byte[] dst, final int pos, final byte src) { - byte[] single = new byte[1]; - single[0] = src; - System.arraycopy(single, 0, dst, pos, 1); - return 1; - } - - /** - * This method converts the wordForm, wordLemma and tag to the form: - * - *
-	 * wordForm + Kending + tags
-	 * 
- * - * where '+' is a separator, K is a character that specifies how many - * characters should be deleted from the end of the inflected form to - * produce the lexeme by concatenating the stripped string with the ending. - * - */ - public byte[] standardEncode(final byte[] wordForm, - final byte[] wordLemma, final byte[] wordTag) { - final int l1 = wordForm.length; - final int prefix = commonPrefix(wordForm, wordLemma); - final int len = wordLemma.length - prefix; - int pos = 0; - // 3 = 2 separators and K character - int arrayLen = l1 + len + 3; - if (wordTag != null) { //wordTag may be empty for stemming - arrayLen += wordTag.length; - } - final byte[] bytes = new byte[arrayLen]; - pos += copyTo(bytes, pos, wordForm); - pos += copyTo(bytes, pos, annotationSeparator); - if (prefix == 0) { - pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff)); - pos += copyTo(bytes, pos, wordLemma); - } else { - pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff)); - pos += copyTo(bytes, pos, subsequence(wordLemma, prefix)); - } - pos += copyTo(bytes, pos, annotationSeparator); - if (wordTag != null) { - pos += copyTo(bytes, pos, wordTag); - } - return bytes; - } - - /** - * This method converts wordform, wordLemma and the tag to the form: - *

- * - *

-	 * inflected_form + LKending + tags
-	 * 
- *

- * where '+' is a separator, L is the number of characters to be deleted - * from the beginning of the word ("A" means none, "B" means one, "C" - 2, - * etc.), K is a character that specifies how many characters should be - * deleted from the end of the inflected form to produce the lexeme by - * concatenating the stripped string with the ending ("A" means none, - * "B' - 1, "C" - 2, and so on). - * - * @param wordForm - * - inflected word form - * @param wordLemma - * - canonical form - * @param wordTag - * - tag - * @return the encoded string - */ - public byte[] prefixEncode(final byte[] wordForm, - final byte[] wordLemma, final byte[] wordTag) { - final int l1 = wordForm.length; - final int prefix = commonPrefix(wordForm, wordLemma); - - // 4 = 2 separators + LK characters - int arrayLen = l1 + wordLemma.length + 4; - if (wordTag != null) { - arrayLen += wordTag.length; - } - final byte[] bytes = new byte[arrayLen]; - int pos = 0; - pos += copyTo(bytes, pos, wordForm); - pos += copyTo(bytes, pos, annotationSeparator); - if (prefix == 0) { - int prefixFound = 0; - int prefix1 = 0; - final int max = Math.min(wordForm.length, MAX_PREFIX_LEN); - for (int i = 1; i <= max; i++) { - prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma); - if (prefix1 > 2) { - prefixFound = i; - break; - } - } - if (prefixFound == 0) { - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff)); - pos += copyTo(bytes, pos, wordLemma); - } else { - pos += copyTo(bytes, pos, (byte) ((prefixFound + 65) & 0xff)); - pos += copyTo(bytes, pos, - (byte) ((l1 - prefixFound - prefix1 + 65) & 0xff)); - pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1)); - } - } else { - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff)); - pos += copyTo(bytes, pos, subsequence(wordLemma, prefix)); - } - pos += copyTo(bytes, pos, annotationSeparator); - if (wordTag != null) { - pos += copyTo(bytes, pos, wordTag); 
- } - final byte[] finalArray = new byte[pos]; - System.arraycopy(bytes, 0, finalArray, 0, pos); - return finalArray; - } - - /** - * This method converts wordform, wordLemma and the tag to the form: - *

-	 * inflected_form + MLKending + tags
-	 * 
- *

- * where '+' is a separator, M is the position of characters to be deleted - * towards the beginning of the inflected form ("A" means from the - * beginning, "B" from the second character, "C" - from the third one, and - * so on), L is the number of characters to be deleted from the position - * specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a - * character that specifies how many characters should be deleted from the - * end of the inflected form to produce the lexeme by concatenating the - * stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so - * on). - * - * @param wordForm - * - inflected word form - * @param wordLemma - * - canonical form - * @param wordTag - * - tag - * @return the encoded string - */ - public byte[] infixEncode(final byte[] wordForm, - final byte[] wordLemma, final byte[] wordTag) { - final int l1 = wordForm.length; - int prefixFound = 0; - int prefix1 = 0; - final int prefix = commonPrefix(wordForm, wordLemma); - final int max = Math.min(l1, MAX_INFIX_LEN); - - // 5 = 2 separators + MLK characters - int arrayLen = l1 + wordLemma.length + 5; - if (wordTag != null) { - arrayLen += wordTag.length; - } - final byte[] bytes = new byte[arrayLen]; - int pos = 0; - pos += copyTo(bytes, pos, wordForm); - pos += copyTo(bytes, pos, annotationSeparator); - if (prefix == 0) { - // we may have a prefix - for (int i = 1; i <= max; i++) { - prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma); - if (prefix1 > 2) { - prefixFound = i; - break; - } - } - if (prefixFound == 0) { - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) ((l1 + 65) & 0xff)); - pos += copyTo(bytes, pos, wordLemma); - } else { - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) ((prefixFound + 65) & 0xff)); - pos += copyTo(bytes, pos, - (byte) ((l1 - prefixFound - prefix1 + 65) & 0xff)); - pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1)); 
- } - } else { // prefix found but we have to check the infix - - for (int i = 1; i <= max; i++) { - prefix1 = commonPrefix(subsequence(wordForm, i), wordLemma); - if (prefix1 > 2) { - prefixFound = i; - break; - } - } - int prefix2 = 0; - int infixFound = 0; - final int max2 = Math.min(l1 - prefix, MAX_INFIX_LEN); - for (int i = 1; i <= max2; i++) { - prefix2 = commonPrefix(subsequence(wordForm, prefix + i), - subsequence(wordLemma, prefix)); - if (prefix2 > 2) { - infixFound = i; - break; - } - } - - if (prefixFound > infixFound) { - if (prefixFound > 0 && (prefix1 > prefix)) { - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, - (byte) ((prefixFound + 65) & 0xff)); - pos += copyTo(bytes, pos, (byte) ((l1 - prefixFound - - prefix1 + 65) & 0xff)); - pos += copyTo(bytes, pos, subsequence(wordLemma, prefix1)); - } else { - // infixFound == 0 && prefixFound == 0 - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, - (byte) ((l1 - prefix + 65) & 0xff)); - pos += copyTo(bytes, pos, subsequence(wordLemma, prefix)); - } - } else if (infixFound > 0 && prefix2 > 0) { - // we have an infix, , and if there seems to be a prefix, - // the infix is longer - pos += copyTo(bytes, pos, (byte) ((prefix + 65) & 0xff)); - pos += copyTo(bytes, pos, (byte) ((infixFound + 65) & 0xff)); - pos += copyTo(bytes, pos, (byte) ((l1 - prefix - prefix2 - - infixFound + 65) & 0xff)); - pos += copyTo(bytes, pos, - subsequence(wordLemma, prefix + prefix2)); - } else { - // we have an infix, and if there seems to be a prefix, - // the infix is longer - // but the common prefix of two words is longer - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) 'A'); - pos += copyTo(bytes, pos, (byte) ((l1 - prefix + 65) & 0xff)); - pos += copyTo(bytes, pos, subsequence(wordLemma, prefix)); - } - - } - pos += copyTo(bytes, pos, annotationSeparator); - if (wordTag != null) { - pos += copyTo(bytes, pos, wordTag); - 
} - final byte[] finalArray = new byte[pos]; - System.arraycopy(bytes, 0, finalArray, 0, pos); - return finalArray; - } - - /** - * Converts a byte array to a given encoding. - * - * @param str - * Byte-array to be converted. - * @return Java String. If decoding is unsuccessful, the string is empty. - */ - protected static String asString(final byte[] str, final String encoding) { - try { - return new String(str, encoding); - } catch (UnsupportedEncodingException e) { - return ""; - } - } - - /** - * A UTF-8 variant of {@link #standardEncode(byte[], byte[], byte[])} This - * method converts the wordForm, wordLemma and tag to the form: - * - *

-	 * wordForm + Kending + tags
-	 * 
- * - * where '+' is a separator, K is a character that specifies how many - * characters should be deleted from the end of the inflected form to - * produce the lexeme by concatenating the stripped string with the ending. - * - * @throws UnsupportedEncodingException - */ - public String standardEncodeUTF8(final String wordForm, - final String wordLemma, final String wordTag) - throws UnsupportedEncodingException { - return asString(standardEncode(wordForm.getBytes(UTF8), wordLemma - .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8); - } - - /** - * A UTF-8 variant of {@link #prefixEncode(byte[], byte[], byte[])} This - * method converts wordform, wordLemma and the tag to the form: - *
-	 * inflected_form + LKending + tags
-	 * 
- *

- * where '+' is a separator, L is the number of characters to be deleted - * from the beginning of the word ("A" means none, "B" means one, "C" - 2, - * etc.), K is a character that specifies how many characters should be - * deleted from the end of the inflected form to produce the lexeme by - * concatenating the stripped string with the ending ("A" means none, - * "B' - 1, "C" - 2, and so on). - * - * @param wordForm - * - inflected word form - * @param wordLemma - * - canonical form - * @param wordTag - * - tag - * @return the encoded string - * @throws UnsupportedEncodingException - */ - public String prefixEncodeUTF8(final String wordForm, - final String wordLemma, final String wordTag) - throws UnsupportedEncodingException { - return asString(prefixEncode(wordForm.getBytes(UTF8), wordLemma - .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8); - } - - /** - * A UTF-8 variant of {@link #infixEncode(byte[], byte[], byte[])}. - * - * This method converts wordform, wordLemma and the tag to the form: - *

-	 * inflected_form + MLKending + tags
-	 * 
- *

- * where '+' is a separator, M is the position of characters to be deleted - * towards the beginning of the inflected form ("A" means from the - * beginning, "B" from the second character, "C" - from the third one, and - * so on), L is the number of characters to be deleted from the position - * specified by M ("A" means none, "B" means one, "C" - 2, etc.), K is a - * character that specifies how many characters should be deleted from the - * end of the inflected form to produce the lexeme by concatenating the - * stripped string with the ending ("A" means none, "B' - 1, "C" - 2, and so - * on). - * - * @param wordForm - * - inflected word form - * @param wordLemma - * - canonical form - * @param wordTag - * - tag - * @return the encoded string - * @throws UnsupportedEncodingException - */ - public String infixEncodeUTF8(final String wordForm, - final String wordLemma, final String wordTag) - throws UnsupportedEncodingException { - return asString(infixEncode(wordForm.getBytes(UTF8), wordLemma - .getBytes(UTF8), wordTag.getBytes(UTF8)), UTF8); - } -} diff --git a/src/morfologik/tools/MorphEncodingTool.java b/src/morfologik/tools/MorphEncodingTool.java deleted file mode 100644 index 223a32b..0000000 --- a/src/morfologik/tools/MorphEncodingTool.java +++ /dev/null @@ -1,213 +0,0 @@ -package morfologik.tools; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; - -import morfologik.fsa.FSA5; - -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.ParseException; - -/** - * This utility converts the dictionary in a text (tabbed) format into - * the format accepted by the fsa building tools. It is meant to replace - * the Perl and AWK scripts from the original FSA package. 
- */ -class MorphEncodingTool extends Tool { - - private boolean prefixes = false; - private boolean infixes = false; - private boolean noWarn = false; - - private MorphEncoder encoder; - - /** - * - */ - protected void go(final CommandLine line) throws Exception { - - noWarn = line.hasOption(SharedOptions.noWarnIfTwoFields.getOpt()); - - infixes = line.hasOption(SharedOptions.infixEncoding.getOpt()); - - if (!infixes) { - prefixes = line.hasOption(SharedOptions.prefixEncoding.getOpt()); - } - - char separator = FSA5.DEFAULT_ANNOTATION; - if (line.hasOption(SharedOptions.annotationSeparatorCharacterOption.getLongOpt())) { - String sep = line.getOptionValue(SharedOptions.annotationSeparatorCharacterOption.getLongOpt()); - - if (sep.length() == 1) { - separator = sep.charAt(0); - } - - FSABuildTool.checkSingleByte(Character.toString(separator)); - } - encoder = new MorphEncoder((byte) separator); - - // Determine input and output streams. - final DataInputStream input = initializeInput(line); - final DataOutputStream output = initializeOutput(line); - - try { - process(input, output); - output.flush(); - - } finally { - input.close(); - output.close(); - } - - } - - /** - * Split fields - * @param line - * byte input buffer - * @param pos - * current offset in the file - * @return - * an array of three byte arrays; if there are less - * than three fields, one of byte arrays is null. If the - * line contains more than three fields, they are ignored. - */ - private static byte[][] splitFields(final byte[] line, final int pos) { - byte[][] outputArray = new byte[3][]; - int i = 0; - int prevPos = 0; - int arrayInd = 0; - while (i < pos) { - if (line[i] == (byte)'\t') { //tab - outputArray[arrayInd] = new byte[i - prevPos]; - System.arraycopy(line, prevPos, outputArray[arrayInd], 0, i - prevPos); - prevPos = i + 1; - arrayInd++; - } - i++; - } - return outputArray; - } - - /** - * Process input stream, writing to output stream. 
- * - */ - protected void process(final DataInputStream input, final DataOutputStream output) - throws IOException { - long lnumber = 0; - try { - int bufPos = 0; - byte[] buf = new byte[0xfffff]; // assumed that line is shorter than - // 64K chars - int dataByte = -1; // not declared within while loop - byte[][] words; - while ((dataByte = input.read()) != -1) { - if (dataByte == (byte) '\n') { - lnumber++; - buf[bufPos++] = 9; - words = splitFields(buf, bufPos); - for (int i = 0; i < words.length; i++) { - if (i < 1 && words[i] == null) { - throw new IllegalArgumentException( - "The input file has less than 2 fields in line: " - + lnumber); - } - if (words[i] == null && !noWarn) { - System.err.println("Line number: " + lnumber + " has less than three fields."); - } - } - - if (infixes) { - output.write(encoder.infixEncode(words[0], words[1], words[2])); - } else if (prefixes) { - output.write(encoder.prefixEncode(words[0], words[1], words[2])); - } else { - output.write(encoder.standardEncode(words[0], words[1], words[2])); - } - - output.writeByte('\n'); // Unix line end only. - bufPos = 0; - } else { - if (dataByte != (byte) '\r') { - buf[bufPos++] = (byte) dataByte; - } - } - } - } finally { - input.close(); - } - } - - /** - * Command line options for the tool. 
- */ - protected void initializeOptions(Options options) { - options.addOption(SharedOptions.inputFileOption); - options.addOption(SharedOptions.outputFileOption); - options.addOption(SharedOptions.standardEncoding); - options.addOption(SharedOptions.prefixEncoding); - options.addOption(SharedOptions.infixEncoding); - options.addOption(SharedOptions.noWarnIfTwoFields); - options.addOption(SharedOptions.annotationSeparatorCharacterOption); - } - - /** - * - */ - private static DataOutputStream initializeOutput(CommandLine line) - throws IOException, ParseException { - final DataOutputStream output; - final String opt = SharedOptions.outputFileOption.getOpt(); - if (line.hasOption(opt)) { - // Use output file. - output = new DataOutputStream( - new BufferedOutputStream( - new FileOutputStream((File) line - .getParsedOptionValue(opt)))); - } else { - // Use standard output. - output = new DataOutputStream( - new BufferedOutputStream( - System.out)); - } - return output; - } - - /** - * - */ - private static DataInputStream initializeInput(CommandLine line) - throws IOException, ParseException { - final DataInputStream input; - final String opt = SharedOptions.inputFileOption.getOpt(); - if (line.hasOption(opt)) { - // Use input file. - input = new DataInputStream ( - new BufferedInputStream( - new FileInputStream((File) line - .getParsedOptionValue(opt)))); - } else { - // Use standard input. - input = new DataInputStream( - new BufferedInputStream( - System.in)); - } - return input; - } - - /** - * Command line entry point. 
- */ - public static void main(String[] args) throws Exception { - final MorphEncodingTool tool = new MorphEncodingTool(); - tool.go(args); - } -} \ No newline at end of file diff --git a/src/morfologik/tools/PolishStemmingTool.java b/src/morfologik/tools/PolishStemmingTool.java deleted file mode 100644 index 0abd897..0000000 --- a/src/morfologik/tools/PolishStemmingTool.java +++ /dev/null @@ -1,191 +0,0 @@ -package morfologik.tools; - -import java.io.*; -import java.text.MessageFormat; -import java.util.List; -import java.util.Locale; - -import morfologik.stemming.*; - -import org.apache.commons.cli.*; - -/** - * This utility parses input text, tokenizes it on whitespace and stems input - * words, writing them to the output in column-based format: - * - *

- * word   stem   form
- * word   stem   form
- * 
- * - * Words for which no stems or forms are available have empty values in each - * respective column. Columns are tab-delimited. - */ -class PolishStemmingTool extends Tool { - /** - * - */ - protected void go(CommandLine line) throws Exception { - // Determine input/ output encoding. - final String inputEncoding = getEncodingOption(line, - SharedOptions.inputEncodingOption.getOpt()); - - final String outputEncoding = getEncodingOption(line, - SharedOptions.outputEncodingOption.getOpt()); - - System.out.println("Input encoding: " + inputEncoding); - System.out.println("Output encoding: " + outputEncoding); - - // Determine input and output streams. - final Reader input = initializeInput(line, inputEncoding); - final Writer output = initializeOutput(line, outputEncoding); - - final long start = System.currentTimeMillis(); - try { - final long count = process(input, output); - - output.flush(); - - final long millis = System.currentTimeMillis() - start; - final double time = millis / 1000.0; - final double wordsPerSec = time > 0 ? (count / time) - : Double.POSITIVE_INFINITY; - System.out - .println(new MessageFormat( - "Processed {0} words in {1,number,#.###} seconds ({2,number,#} words per second).", - Locale.ENGLISH).format(new Object[] { - new Long(count), new Double(millis / 1000.0), - new Double(wordsPerSec) })); - } finally { - input.close(); - output.close(); - } - - } - - /** - * Process input stream, writing to output stream. - * - * @return Returns the number of processed words. 
- */ - protected long process(Reader input, Writer output) throws IOException { - final IStemmer stemmer = new PolishStemmer(); - final StreamTokenizer st = new StreamTokenizer(input); - st.eolIsSignificant(false); - st.wordChars('+', '+'); - - long count = 0; - int token; - while ((token = st.nextToken()) != StreamTokenizer.TT_EOF) { - if (token == StreamTokenizer.TT_WORD) { - final String word = st.sval; - - count++; - final List stems = stemmer.lookup(word); - if (stems.size() == 0) { - output.write(word); - output.write("\t-\t-\n"); - } else { - for (WordData wd : stems) { - output.write(word); - output.write("\t"); - output.write(asString(wd.getStem())); - output.write("\t"); - output.write(asString(wd.getTag())); - output.write("\n"); - } - } - } - } - - return count; - } - - private String asString(CharSequence stem) { - if (stem == null) - return "-"; - return stem.toString(); - } - - /** - * Command line options for the tool. - */ - protected void initializeOptions(Options options) { - options.addOption(SharedOptions.inputFileOption); - options.addOption(SharedOptions.inputEncodingOption); - options.addOption(SharedOptions.outputFileOption); - options.addOption(SharedOptions.outputEncodingOption); - } - - /** - * - */ - private Writer initializeOutput(CommandLine line, String outputEncoding) - throws IOException, ParseException { - final Writer output; - final String opt = SharedOptions.outputFileOption.getOpt(); - if (line.hasOption(opt)) { - // Use output file. - output = new OutputStreamWriter( - new BufferedOutputStream(new FileOutputStream((File) line - .getParsedOptionValue(opt))), outputEncoding); - } else { - // Use standard output. 
- output = new OutputStreamWriter(System.out, outputEncoding); - } - return output; - } - - /** - * - */ - private Reader initializeInput(CommandLine line, String inputEncoding) - throws IOException, ParseException { - final Reader input; - final String opt = SharedOptions.inputFileOption.getOpt(); - - if (line.hasOption(opt)) { - // Use input file. - input = new InputStreamReader( - new BufferedInputStream(new FileInputStream((File) line - .getParsedOptionValue(opt))), inputEncoding); - } else { - // Use standard input. - input = new InputStreamReader(System.in, inputEncoding); - } - return input; - } - - /** - * - */ - private String getEncodingOption(CommandLine line, String opt) { - String encoding = System.getProperty("file.encoding", "iso-8859-1"); - if (line.hasOption(opt)) { - encoding = line.getOptionValue(opt); - } - return encoding; - } - - /* - * Check if the dictionary is available. - */ - @Override - protected boolean isAvailable() { - boolean available = true; - try { - new PolishStemmer(); - } catch (Throwable t) { - available = false; - } - return available; - } - - /** - * Command line entry point. - */ - public static void main(String[] args) throws Exception { - final PolishStemmingTool tool = new PolishStemmingTool(); - tool.go(args); - } -} \ No newline at end of file diff --git a/src/morfologik/tools/SharedOptions.java b/src/morfologik/tools/SharedOptions.java deleted file mode 100644 index e047d7f..0000000 --- a/src/morfologik/tools/SharedOptions.java +++ /dev/null @@ -1,153 +0,0 @@ -package morfologik.tools; - -import java.io.File; -import java.util.Arrays; - -import org.apache.commons.cli.Option; -import org.apache.commons.cli.OptionBuilder; - -/** - * Options shared between tools. 
- */ -@SuppressWarnings("static-access") -final class SharedOptions { - public final static Option fsaDictionaryFileOption = OptionBuilder - .hasArg() - .withArgName("file") - .withDescription("Path to the FSA dictionary.") - .withLongOpt("dictionary") - .withType(File.class) - .isRequired(true) - .create("d"); - - public final static Option decode = OptionBuilder - .withDescription("Decode prefix/ infix/ suffix forms (if available).") - .withLongOpt("decode") - .isRequired(false) - .create("x"); - - public final static Option dataOnly = OptionBuilder - .withDescription("Dump only raw FSA data.") - .withLongOpt("raw-data") - .isRequired(false) - .create("r"); - - public final static Option dot = OptionBuilder - .withDescription("Dump the automaton as graphviz DOT file.") - .withLongOpt("dot") - .isRequired(false) - .create(); - - public final static Option inputEncodingOption = OptionBuilder - .hasArg() - .withArgName("codepage") - .withDescription("Input stream encoding.") - .withLongOpt("input-encoding") - .isRequired(false) - .create("ie"); - - public final static Option outputEncodingOption = OptionBuilder - .hasArg() - .withArgName("codepage") - .withDescription("Output stream encoding.") - .withLongOpt("output-encoding") - .isRequired(false) - .create("oe"); - - public final static Option inputFileOption = OptionBuilder - .hasArg() - .withArgName("file") - .withDescription("Input file. If missing, standard input is used.") - .withLongOpt("input") - .withType(File.class) - .isRequired(false) - .create("i"); - - public final static Option outputFileOption = OptionBuilder - .hasArg() - .withArgName("file") - .withDescription("Output file. If missing, standard output is used.") - .withLongOpt("output") - .withType(File.class) - .isRequired(false) - .create("o"); - - public final static Option outputFormatOption = OptionBuilder - .hasArg() - .withArgName("format") - .withDescription("Name of the binary output format. 
Allowed values: " + Arrays.toString(FSABuildTool.Format.values())) - .withLongOpt("format") - .isRequired(false) - .create("f"); - - public final static Option fillerCharacterOption = OptionBuilder - .hasArg() - .withArgName("char") - .withDescription("Custom filler character") - .isRequired(false) - .withLongOpt("filler") - .create(); - - public final static Option annotationSeparatorCharacterOption = OptionBuilder - .hasArg() - .withArgName("char") - .withDescription("Custom annotation separator character") - .isRequired(false) - .withLongOpt("annotation") - .create(); - - public final static Option withNumbersOption = OptionBuilder - .withDescription("Include numbers required for perfect hashing (larger automaton)") - .isRequired(false) - .withLongOpt("with-numbers") - .create("n"); - - public final static Option progressOption = OptionBuilder - .withDescription("Print more verbose progress information") - .isRequired(false) - .withLongOpt("progress") - .create(); - - public final static Option inputSortedOption = OptionBuilder - .withDescription("Assume the input is already sorted using C-sort (builds FSA directly, no in-memory sorting)") - .isRequired(false) - .withLongOpt("sorted") - .create(); - - public final static Option standardEncoding = OptionBuilder - .withDescription("Encode suffix forms in a standard way") - .withLongOpt("suffix") - .isRequired(false) - .create("suf"); - - public final static Option prefixEncoding = OptionBuilder - .withDescription("Encode suffix forms in a prefix way") - .withLongOpt("prefix") - .isRequired(false) - .create("pre"); - - public final static Option infixEncoding = OptionBuilder - .withDescription("Encode suffix forms in an infix way") - .withLongOpt("infix") - .isRequired(false) - .create("inf"); - - public final static Option noWarnIfTwoFields = OptionBuilder - .withDescription("Suppress warning for lines with only two fields (for stemming dictionaries)") - .withLongOpt("nowarn") - .isRequired(false) - .create("nw"); 
- - public final static Option statistics = OptionBuilder - .withDescription("Print extra statistics.") - .isRequired(false) - .withLongOpt("stats") - .create(); - - /** - * No instances. Use static fields. - */ - private SharedOptions() { - // empty - } -} diff --git a/src/morfologik/tools/Tool.java b/src/morfologik/tools/Tool.java deleted file mode 100644 index ebc1daf..0000000 --- a/src/morfologik/tools/Tool.java +++ /dev/null @@ -1,84 +0,0 @@ -package morfologik.tools; - -import org.apache.commons.cli.*; - -/** - * Base class for command-line applications. - */ -abstract class Tool { - /** Command line options. */ - protected final Options options = new Options(); - - /** - * Initializes application context. - */ - protected final void go(String[] args) { - initializeOptions(options); - - if (args.length == 0) { - printUsage(); - return; - } - - final Parser parser = new GnuParser(); - final CommandLine line; - try { - line = parser.parse(options, args); - try { - go(line); - } catch (Throwable e) { - printError("Unhandled program error occurred.", e); - } - } catch (MissingArgumentException e) { - printError("Provide the required argument for option: " - + e.getMessage()); - } catch (MissingOptionException e) { - printError("Provide the required option: " + e.getMessage()); - } catch (UnrecognizedOptionException e) { - printError(e.getMessage()); - } catch (ParseException e) { - printError("Could not parse command line: " + e.getMessage()); - } - } - - /** - * Print an error and an associated exception. - */ - protected void printError(String msg, Throwable t) { - printError(msg); - t.printStackTrace(System.err); - } - - /** - * Print an error without an exception. - */ - protected void printError(String msg) { - System.err.println(); - System.err.println(msg); - } - - /** - * Prints usage (options). 
- */ - protected void printUsage() { - final HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp(this.getClass().getName(), options, true); - } - - /** - * Override and write your stuff using command line options. - */ - protected abstract void go(CommandLine line) throws Exception; - - /** - * Override and initialize options. - */ - protected abstract void initializeOptions(Options options); - - /** - * Is the tool available? true by default. - */ - protected boolean isAvailable() { - return true; - } -} diff --git a/src/morfologik/tools/WriterMessageLogger.java b/src/morfologik/tools/WriterMessageLogger.java deleted file mode 100644 index fceb698..0000000 --- a/src/morfologik/tools/WriterMessageLogger.java +++ /dev/null @@ -1,123 +0,0 @@ -package morfologik.tools; - -import java.io.PrintWriter; -import java.util.*; - -/** - * A logger dumping info to System.err. - */ -public class WriterMessageLogger implements IMessageLogger { - /** - * Start of the world timestamp. - */ - private final static long world = System.currentTimeMillis(); - - /** - * A single part: name, start timestamp. - */ - private static class Part { - final String name; - final long start; - - Part(String name, long start) { - this.name = name; - this.start = start; - } - } - - /** - * Is the output currently indented? - */ - private boolean indent; - - /** - * Active parts. - */ - private ArrayDeque parts = new ArrayDeque(); - - /** - * Output writer. 
- */ - private final PrintWriter writer; - - /** - * - */ - public WriterMessageLogger(PrintWriter w) { - this.writer = w; - } - - /* - * - */ - @Override - public void log(String msg) { - cancelIndent(); - - writer.println(msg); - writer.flush(); - } - - /* - * - */ - @Override - public void log(String header, Object v) { - cancelIndent(); - - if (v instanceof Integer || v instanceof Long) { - writer.println(String.format(Locale.ENGLISH, "%-30s %,11d", header, v)); - } else { - writer.println(String.format(Locale.ENGLISH, "%-30s %11s", header, v.toString())); - } - writer.flush(); - } - - /* - * - */ - @Override - public void startPart(String header) { - cancelIndent(); - - Part p = new Part(header, System.currentTimeMillis()); - parts.addLast(p); - - writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "...")); - writer.flush(); - - indent = true; - } - - /* - * - */ - @Override - public void endPart() { - long now = System.currentTimeMillis(); - Part p = parts.removeLast(); - - if (!indent) { - writer.print(String.format(Locale.ENGLISH, "%-30s", p.name + "...")); - } - - writer.println( - String.format(Locale.ENGLISH, "%13.2f sec. [%6.2f sec.]", - (now - p.start) / 1000.0, - (now - world) / 1000.0)); - writer.flush(); - - indent = false; - } - - /* - * - */ - private void cancelIndent() { - if (indent) { - System.err.println(); - } - - indent = false; - } -} diff --git a/src/morfologik/util/Arrays.java b/src/morfologik/util/Arrays.java deleted file mode 100644 index 4d1d840..0000000 --- a/src/morfologik/util/Arrays.java +++ /dev/null @@ -1,68 +0,0 @@ -package morfologik.util; - -/** - * Compatibility layer for JVM 1.5. - */ -public final class Arrays { - private Arrays() { - // No instances. - } - - /** - * Compare two lists of objects for reference-equality. 
- */ - public static boolean referenceEquals(Object[] a1, int a1s, Object[] a2, int a2s, int length) { - for (int i = 0; i < length; i++) - if (a1[a1s++] != a2[a2s++]) - return false; - - return true; - } - - /** - * Compare two arrays for equality. - */ - public static boolean equals(byte[] a1, int a1s, byte [] a2, int a2s, int length) { - for (int i = 0; i < length; i++) - if (a1[a1s++] != a2[a2s++]) - return false; - - return true; - } - - /** - * Compare two arrays for equality. - */ - public static boolean equals(boolean[] a1, int a1s, boolean[] a2, int a2s, int length) { - for (int i = 0; i < length; i++) - if (a1[a1s++] != a2[a2s++]) - return false; - - return true; - } - - /** - * Compare two arrays for equality. - */ - public static boolean equals(int[] a1, int a1s, int[] a2, int a2s, int length) { - for (int i = 0; i < length; i++) - if (a1[a1s++] != a2[a2s++]) - return false; - - return true; - } - - /** - * Convert an array of strings to bytes. - */ - public static String toString(byte [] bytes, int start, int length) - { - if (bytes.length != length) - { - final byte [] sub = new byte [length]; - System.arraycopy(bytes, start, sub, 0, length); - bytes = sub; - } - return java.util.Arrays.toString(bytes); - } -} diff --git a/src/morfologik/util/BufferUtils.java b/src/morfologik/util/BufferUtils.java deleted file mode 100644 index 6ccfbc6..0000000 --- a/src/morfologik/util/BufferUtils.java +++ /dev/null @@ -1,54 +0,0 @@ -package morfologik.util; - -import java.nio.ByteBuffer; -import java.nio.CharBuffer; - -/** - * Utility functions for buffers. - */ -public final class BufferUtils { - - /** - * No instances. - */ - private BufferUtils() { - // empty - } - - /** - * Ensure the byte buffer's capacity. If a new buffer is allocated, its - * content is empty (the old buffer's contents is not copied). - * - * @param buffer - * The buffer to check or null if a new buffer - * should be allocated. 
- */ - public static ByteBuffer ensureCapacity(ByteBuffer buffer, int capacity) { - if (buffer == null || buffer.capacity() < capacity) { - buffer = ByteBuffer.allocate(capacity); - } - return buffer; - } - - /** - * Ensure the char buffer's capacity. If a new buffer is allocated, its - * content is empty (the old buffer's contents is not copied). - * - * @param buffer - * The buffer to check or null if a new buffer - * should be allocated. - */ - public static CharBuffer ensureCapacity(CharBuffer buffer, int capacity) { - if (buffer == null || buffer.capacity() < capacity) { - buffer = CharBuffer.allocate(capacity); - } - return buffer; - } - - /** - * Convert a byte buffer to a string in platform default encoding. - */ - public static String toString(ByteBuffer sequence) { - return new String(sequence.array(), sequence.position(), sequence.remaining()); - } -} \ No newline at end of file diff --git a/src/morfologik/util/FileUtils.java b/src/morfologik/util/FileUtils.java deleted file mode 100644 index 5d62212..0000000 --- a/src/morfologik/util/FileUtils.java +++ /dev/null @@ -1,137 +0,0 @@ -package morfologik.util; - -import java.io.*; - -/** - * Utility functions. - */ -public final class FileUtils { - - /** - * No instances. - */ - private FileUtils() { - // empty - } - - /** - * Checks if the given file exists. - */ - public static void assertExists(File fsaFile, boolean requireFile, - boolean requireDirectory) throws IOException { - if (!fsaFile.exists()) { - throw new IOException("File does not exist: " - + fsaFile.getAbsolutePath()); - } - - if (requireFile) { - if (!fsaFile.isFile() || !fsaFile.canRead()) { - throw new IOException("File cannot be read: " - + fsaFile.getAbsolutePath()); - } - } - - if (requireDirectory) { - if (!fsaFile.isDirectory()) { - throw new IOException("Not a directory: " - + fsaFile.getAbsolutePath()); - } - } - } - - /** - * Force any non-null closeables. - */ - public static void close(Closeable... 
closeables) { - for (Closeable c : closeables) { - if (c != null) { - try { - c.close(); - } catch (IOException e) { - // too bad - } - } - } - } - - /** - * Reads all bytes from an input stream (until EOF). - */ - public static byte[] readFully(InputStream stream) throws IOException { - final ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 16); - final byte[] buffer = new byte[1024 * 8]; - int bytesCount; - while ((bytesCount = stream.read(buffer)) > 0) { - baos.write(buffer, 0, bytesCount); - } - return baos.toByteArray(); - } - - /** - * Read enough bytes to fill array If there are not enough - * bytes, throw an exception. - */ - public static void readFully(InputStream in, byte[] array) - throws IOException { - int offset = 0; - int cnt; - while ((cnt = in.read(array, offset, array.length - offset)) > 0) { - offset += cnt; - - if (offset == array.length) - break; - } - - if (cnt < 0) - throw new EOFException(); - } - - /** - * Read exactly 4 bytes from the input stream. - */ - public static int readInt(InputStream in) throws IOException { - int v = 0; - for (int i = 0; i < 4; i++) { - v = (v << 8) | (readByte(in) & 0xff); - } - return v; - } - - /** - * - */ - public static void writeInt(OutputStream os, int v) throws IOException { - os.write( v >>> 24); - os.write((v >>> 16) & 0xff); - os.write((v >>> 8) & 0xff); - os.write( v & 0xff); - } - - /** - * Read exactly 2 bytes from the input stream. - */ - public static short readShort(InputStream in) throws IOException { - return (short) (readByte(in) << 8 | - readByte(in) & 0xff); - } - - /** - * Read exactly one byte from the input stream. - * - * @throws EOFException if EOF is reached. 
- */ - public static byte readByte(InputStream in) throws IOException { - int b = in.read(); - if (b == -1) - throw new EOFException(); - return (byte) b; - } - - /** - * - */ - public static void writeShort(OutputStream os, short v) throws IOException { - os.write((v >>> 8) & 0xff); - os.write( v & 0xff); - } -} \ No newline at end of file diff --git a/src/morfologik/util/ResourceUtils.java b/src/morfologik/util/ResourceUtils.java deleted file mode 100644 index 2c7bd23..0000000 --- a/src/morfologik/util/ResourceUtils.java +++ /dev/null @@ -1,58 +0,0 @@ -package morfologik.util; - -import java.io.*; -import java.net.*; - -/** - * Resource management utilities. - */ -public final class ResourceUtils { - /** - * No instances. - */ - private ResourceUtils() { - } - - /** - * Returns an input stream to the resource. - * - * @param resource - * The path leading to the resource. Can be an URL, a path - * leading to a class resource or a {@link File}. - * - * @return InputStream instance. - * @throws IOException - * If the resource could not be found or opened. - */ - public static InputStream openInputStream(String resource) - throws IOException { - try { - // See if the resource is an URL first. - final URL url = new URL(resource); - // success, load the resource. - return url.openStream(); - } catch (MalformedURLException e) { - // No luck. Fallback to class loader paths. - } - - // Try current thread's class loader first. 
- final ClassLoader ldr = Thread.currentThread().getContextClassLoader(); - - InputStream is; - if (ldr != null && (is = ldr.getResourceAsStream(resource)) != null) { - return is; - } else if ((is = ResourceUtils.class.getResourceAsStream(resource)) != null) { - return is; - } else if ((is = ClassLoader.getSystemResourceAsStream(resource)) != null) { - return is; - } - - // Try file path - final File f = new File(resource); - if (f.exists() && f.isFile() && f.canRead()) { - return new FileInputStream(f); - } - - throw new IOException("Could not locate resource: " + resource); - } -} -- cgit v1.2.3