summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJian Zhen <zhenjl@gmail.com>2015-01-21 01:25:44 -0800
committerJian Zhen <zhenjl@gmail.com>2015-01-21 01:25:44 -0800
commit5544009e5c2ef0506300f908bb65ca10e4409d25 (patch)
tree5bd4cf6a86d4513630ab6fdac1e4c47265f82344
parent219b90c62524c535e006109d2756579726ec1ee2 (diff)
docs
-rw-r--r--README.md52
-rw-r--r--cmd/compare/compare.go54
-rw-r--r--cmd/switchvsmap/switchvsmap.go14
-rw-r--r--porter2.go (renamed from english.go)47
-rw-r--r--porter2_test.go (renamed from english_test.go)49
-rw-r--r--stemmer.go19
6 files changed, 152 insertions, 83 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9cfdb9b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,52 @@
+porter2
+=======
+
+Porter2 implements the [english Porter2 stemmer](http://snowball.tartarus.org/algorithms/english/stemmer.html). It is written completely using finite state machines to do suffix comparison, rather than the string-based or tree-based approaches. As a result, it is 660% faster compare to string comparison-based approach.
+
+```
+import "github.com/surgebase/porter2"
+
+fmt.Println(porter2.Stem("seaweed")) // should get seawe
+```
+
+This implementation has been successfully validated with the dataset from http://snowball.tartarus.org/algorithms/english/
+
+### Performance
+
+This implementation by far has the highest performance of the various Go-based implementations, AFAICT. I tested a few of the implementations and the results are below.
+
+| Implementation | Time | Algorithm |
+|----------------|------|-----------|
+| [surgebase](https://github.com/surgebase/porter2) | 319.009358ms | Porter2 |
+| [dchest](https://github.com/dchest/stemmer) | 2.106912401s | Porter2 |
+| [reiver](https://github.com/reiver/go-porterstemmer) | 469.305709ms | Porter |
+| [kljensen](https://github.com/kljensen/snowball) | 5.725917198s | Porter |
+| [agonopol](https://github.com/agonopol/go-stem) | 3.991158277s | Porter |
+
+To run the test again, you can run cmd/compare/compare.go (`go run compare.go`).
+
+### State Machines
+
+Most of the implementations, like the ones in the table above, rely completely on suffix string comparison. Basically there's a list of suffixes, and the code will loop through the list to see if there's a match. Given most of the time you are looking for the longest match, so you order the list so the longest is the first one. So if you are luckly, the match will be early on the list. But regardless that's a huge performance hit.
+
+This implementation is based completely on finite state machines to perform suffix comparison. You compare each chacter of the string starting at the last character going backwards. The state machines at each step will determine what the longest suffix is. You can think of the state machine as an unrolled tree.
+
+However, writing large state machines can be very error-prone. So I wrote a [quick tool](https://github.com/surgebase/porter2/tree/master/cmd/suffixfsm) to generate most of the state machines. The tool basically takes a file of suffixes, creates a tree, then unrolls the tree by dumping each of the nodes.
+
+You can run the tool by `go run suffixfsm.go <filename>`.
+
+### License
+
+Copyright (c) 2014 Dataence, LLC. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/cmd/compare/compare.go b/cmd/compare/compare.go
index bbfd04e..024fcac 100644
--- a/cmd/compare/compare.go
+++ b/cmd/compare/compare.go
@@ -1,12 +1,28 @@
+// Copyright (c) 2014 Dataence, LLC. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
package main
import (
"fmt"
"time"
- "github.com/dchest/stemmer/porter2"
- "github.com/reiver/go-porterstemmer"
- p2 "github.com/surgebase/porter2"
+ agonopol "github.com/agonopol/go-stem"
+ dchest "github.com/dchest/stemmer/porter2"
+ kljensen "github.com/kljensen/snowball"
+ reiver "github.com/reiver/go-porterstemmer"
+ surgebase "github.com/surgebase/porter2"
)
func main() {
@@ -74,25 +90,23 @@ func main() {
n := 10000
- var es p2.EnglishStemmer
-
now := time.Now()
for i := 0; i < n; i++ {
for _, a := range actions {
- es.Stem(a)
+ surgebase.Stem(a)
}
}
since := time.Since(now)
- fmt.Println("surge:", since)
+ fmt.Println("surgebase:", since)
//eng := porter2.Stemmer
now = time.Now()
for i := 0; i < n; i++ {
for _, a := range actions {
- porter2.EngStem.Stem(a)
+ dchest.Stemmer.Stem(a)
}
}
@@ -103,10 +117,32 @@ func main() {
for i := 0; i < n; i++ {
for _, a := range actions {
- porterstemmer.StemString(a)
+ reiver.StemString(a)
}
}
since = time.Since(now)
fmt.Println("reiver:", since)
+
+ now = time.Now()
+
+ for i := 0; i < n; i++ {
+ for _, a := range actions {
+ kljensen.Stem(a, "english", true)
+ }
+ }
+
+ since = time.Since(now)
+ fmt.Println("kljensen:", since)
+
+ now = time.Now()
+
+ for i := 0; i < n; i++ {
+ for _, a := range actions {
+ agonopol.Stem([]byte(a))
+ }
+ }
+
+ since = time.Since(now)
+ fmt.Println("agonopol:", since)
}
diff --git a/cmd/switchvsmap/switchvsmap.go b/cmd/switchvsmap/switchvsmap.go
index a1d01a0..281fcbd 100644
--- a/cmd/switchvsmap/switchvsmap.go
+++ b/cmd/switchvsmap/switchvsmap.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2014 Dataence, LLC. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
package main
import (
diff --git a/english.go b/porter2.go
index d3d941e..4d1f3b7 100644
--- a/english.go
+++ b/porter2.go
@@ -12,15 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// Porter2 implements the english Porter2 stemmer. It is written completely using
+// finite state machines to do suffix comparison, rather than the string-based
+// or tree-based approaches. As a result, it is 600% faster compare to string-based
+// implementations.
+//
+// http://snowball.tartarus.org/algorithms/english/stemmer.html
package porter2
import "unicode"
-type EnglishStemmer bool
-
-var _ Stemmer = (*EnglishStemmer)(nil)
-
-func (this EnglishStemmer) Stem(s string) string {
+// Stem takes a string and returns the stemmed version based on the Porter2 algorithm.
+func Stem(s string) string {
// If the word has two letters or less, leave it as it is.
if len(s) <= 2 {
return s
@@ -35,25 +38,25 @@ func (this EnglishStemmer) Stem(s string) string {
var ex bool
// exception1 word list
- if rs, ex = this.exception1(rs); ex {
+ if rs, ex = exception1(rs); ex {
return string(rs)
}
- rs = this.preclude(rs)
+ rs = preclude(rs)
r1, r2 := markR1R2(rs)
- rs = this.step1a(this.step0(rs))
+ rs = step1a(step0(rs))
- if this.exception2(rs) {
+ if exception2(rs) {
return string(rs)
}
- return string(this.postlude(this.step5(this.step4(this.step3(this.step2(this.step1c(this.step1b(rs, r1)), r1), r1, r2), r2), r1, r2)))
+ return string(postlude(step5(step4(step3(step2(step1c(step1b(rs, r1)), r1), r1, r2), r2), r1, r2)))
}
// Remove initial ', if present. Then set initial y, or y after a vowel, to Y.
-func (this EnglishStemmer) preclude(rs []rune) []rune {
+func preclude(rs []rune) []rune {
if rs[0] == '\'' {
rs = rs[1:]
}
@@ -124,7 +127,7 @@ func markRegion(rs []rune) int {
// '
// 's
// 's'
-func (this EnglishStemmer) step0(rs []rune) []rune {
+func step0(rs []rune) []rune {
var (
l int = len(rs) // string length
m int // suffix length
@@ -198,7 +201,7 @@ loop:
// s : delete if the preceding word part contains a vowel not immediately before the s (so gas and this retain the s, gaps and kiwis lose it)
// us : do nothing
// ss : do nothing
-func (this EnglishStemmer) step1a(rs []rune) []rune {
+func step1a(rs []rune) []rune {
var (
l int = len(rs) // string length
m int // suffix length
@@ -330,7 +333,7 @@ loop:
// if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
// if the word ends with a double remove the last letter (so hopp -> hop), or
// if the word is short, add e (so hop -> hope)
-func (this EnglishStemmer) step1b(rs []rune, r1 int) []rune {
+func step1b(rs []rune, r1 int) []rune {
var (
l int = len(rs) // string length
m int // suffix length
@@ -508,7 +511,7 @@ switch1b:
// Replace suffix y or Y by i if preceded by a non-vowel which is not the first letter
// of the word (so cry -> cri, by -> by, say -> say)
-func (this EnglishStemmer) step1c(rs []rune) []rune {
+func step1c(rs []rune) []rune {
l := len(rs)
if l > 2 {
@@ -550,7 +553,7 @@ func (this EnglishStemmer) step1c(rs []rune) []rune {
// 22. fulli -> replace by ful
// 23. lessli -> replace by less
// 24. li -> delete if preceded by a valid li-ending
-func (this EnglishStemmer) step2(rs []rune, r1 int) []rune {
+func step2(rs []rune, r1 int) []rune {
var (
l int = len(rs) // string length
m int // suffix length
@@ -1128,7 +1131,7 @@ loop:
// 7. ful -> delete
// 8. ness -> delete
// 9. ative -> delete if in R2
-func (this EnglishStemmer) step3(rs []rune, r1, r2 int) []rune {
+func step3(rs []rune, r1, r2 int) []rune {
var (
l int = len(rs) // string length
m int // suffix length
@@ -1432,7 +1435,7 @@ loop:
// 16. ment -> delete
// 17. ous -> delete
// 18. ion -> delete if preceded by s or t
-func (this EnglishStemmer) step4(rs []rune, r2 int) []rune {
+func step4(rs []rune, r2 int) []rune {
var (
l int = len(rs) // string length
m int // suffix length
@@ -1745,7 +1748,7 @@ loop:
//
// e -> delete if in R2, or in R1 and not preceded by a short syllable
// l -> delete if in R2 and preceded by l
-func (this EnglishStemmer) step5(rs []rune, r1, r2 int) []rune {
+func step5(rs []rune, r1, r2 int) []rune {
l := len(rs)
if l < 1 {
return rs
@@ -1779,7 +1782,7 @@ func (this EnglishStemmer) step5(rs []rune, r1, r2 int) []rune {
}
// Finally, turn any remaining Y letters in the word back into lower case.
-func (this EnglishStemmer) postlude(rs []rune) []rune {
+func postlude(rs []rune) []rune {
for i, r := range rs {
if r == 'Y' {
rs[i] = 'y'
@@ -1813,7 +1816,7 @@ func (this EnglishStemmer) postlude(rs []rune) []rune {
// sky -> sky
// tying -> tie
// ugly -> ugli
-func (this EnglishStemmer) exception1(rs []rune) ([]rune, bool) {
+func exception1(rs []rune) ([]rune, bool) {
l := len(rs)
if l > 6 {
return rs, false
@@ -1952,7 +1955,7 @@ func (this EnglishStemmer) exception1(rs []rune) ([]rune, bool) {
// proceed
// exceed
// succeed
-func (this EnglishStemmer) exception2(rs []rune) bool {
+func exception2(rs []rune) bool {
l := len(rs)
if l != 6 && l != 7 {
return false
diff --git a/english_test.go b/porter2_test.go
index 7904faa..200d04d 100644
--- a/english_test.go
+++ b/porter2_test.go
@@ -316,82 +316,74 @@ var (
)
func TestEnglishStep0(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data0 {
//glog.Debugf("rs=%q, expected=%q", string(rs), string(expect[i]))
- assert.Equal(t, false, es.step0(rs), expect0[i])
+ assert.Equal(t, false, step0(rs), expect0[i])
}
}
func TestEnglishStep1a(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data1a {
- assert.Equal(t, false, es.step1a(rs), expect1a[i])
+ assert.Equal(t, false, step1a(rs), expect1a[i])
//glog.Debugf("rs=%q, expected=%q, got=%q", string(rs), string(expect1a[i]), string(s))
}
}
func TestEnglishStep1b(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data1b {
r1, _ := markR1R2(rs)
//glog.Debugf("rs=%q, expected=%q, r1=%d", string(rs), string(expect1b[i]), r1)
- s := es.step1b(rs, r1)
+ s := step1b(rs, r1)
assert.Equal(t, true, s, expect1b[i])
}
}
func TestEnglishStep1c(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data1c {
- //glog.Debugf("rs=%q, expected=%q, got=%q", string(rs), string(expect1c[i]), string(es.step1c(rs)))
- assert.Equal(t, false, es.step1c(rs), expect1c[i])
+ //glog.Debugf("rs=%q, expected=%q, got=%q", string(rs), string(expect1c[i]), string(step1c(rs)))
+ assert.Equal(t, false, step1c(rs), expect1c[i])
}
}
func TestEnglishStep2(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data2 {
r1, _ := markR1R2(rs)
- s := es.step2(rs, r1)
+ s := step2(rs, r1)
//glog.Debugf("rs=%q, expected=%q, got=%q, r1=%d", string(rs), string(expect2[i]), string(s), r1)
assert.Equal(t, false, s, expect2[i])
}
}
func TestEnglishStep3(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data3 {
r1, r2 := markR1R2(rs)
- s := es.step3(rs, r1, r2)
+ s := step3(rs, r1, r2)
//glog.Debugf("rs=%q, expected=%q, got=%q, r1=%d", string(rs), string(expect3[i]), string(s), r1)
assert.Equal(t, true, s, expect3[i])
}
}
func TestEnglishStep4(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data4 {
_, r2 := markR1R2(rs)
- s := es.step4(rs, r2)
+ s := step4(rs, r2)
//glog.Debugf("rs=%q, expected=%q, got=%q, r1=%d, r2=%d", string(rs), string(expect4[i]), string(s), r1, r2)
assert.Equal(t, true, s, expect4[i])
}
}
func TestEnglishStep5(t *testing.T) {
- var es EnglishStemmer
for i, rs := range data5 {
r1, r2 := markR1R2(rs)
- s := es.step5(rs, r1, r2)
+ s := step5(rs, r1, r2)
//glog.Debugf("rs=%q, expected=%q, got=%q, r1=%d, r2=%d", string(rs), string(expect5[i]), string(s), r1, r2)
assert.Equal(t, true, s, expect5[i])
}
@@ -416,10 +408,9 @@ func TestEnglishIsShortWord(t *testing.T) {
}
func TestEnglishExceptions1(t *testing.T) {
- var es EnglishStemmer
for k, v := range exceptions1 {
- rs, ex := es.exception1([]rune(k))
+ rs, ex := exception1([]rune(k))
//glog.Debugf("rs=%q, expected=%q, got=%q", k, v, string(rs))
assert.True(t, false, ex)
assert.Equal(t, false, []rune(v), rs)
@@ -428,35 +419,31 @@ func TestEnglishExceptions1(t *testing.T) {
}
func TestEnglishExceptions2(t *testing.T) {
- var es EnglishStemmer
for k, v := range exceptions2 {
- assert.Equal(t, true, v, es.exception2([]rune(k)))
+ assert.Equal(t, true, v, exception2([]rune(k)))
}
}
func TestEnglishStem(t *testing.T) {
- var es EnglishStemmer
- glog.Debugf("%s", es.Stem("seaweed"))
+ glog.Debugf("%s", Stem("seaweed"))
}
func BenchmarkEnglishStep0(b *testing.B) {
- var es EnglishStemmer
for i := 0; i < b.N; i++ {
for _, rs := range data0 {
- es.step0(rs)
+ step0(rs)
}
}
}
func BenchmarkEnglishStep1a(b *testing.B) {
- var es EnglishStemmer
for i := 0; i < b.N; i++ {
for _, rs := range data1a {
- es.step0(rs)
+ step0(rs)
}
}
}
@@ -470,11 +457,9 @@ func BenchmarkEnglishException1(b *testing.B) {
b.ResetTimer()
- var es EnglishStemmer
-
for i := 0; i < b.N; i++ {
for _, k := range words {
- es.exception1(k)
+ exception1(k)
}
}
}
@@ -485,8 +470,6 @@ func TestEnglishVocOutput(t *testing.T) {
defer infile.Close()
defer outfile.Close()
- var es EnglishStemmer
-
for inscan.Scan() {
if !outscan.Scan() {
break
@@ -495,7 +478,7 @@ func TestEnglishVocOutput(t *testing.T) {
word := inscan.Text()
expect := outscan.Text()
//glog.Debugf("word=%q, expect=%q", word, expect)
- actual := es.Stem(word)
+ actual := Stem(word)
if actual != expect {
glog.Debugf("word=%q, actual=%q != expect=%q", word, actual, expect)
}
diff --git a/stemmer.go b/stemmer.go
deleted file mode 100644
index 9a92872..0000000
--- a/stemmer.go
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright (c) 2014 Dataence, LLC. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package porter2
-
-type Stemmer interface {
- Stem(s string) string
-}