Skip to content

Commit

Permalink
Reduce allocations (#29)
Browse files Browse the repository at this point in the history
* First pass

Allocs down to 75, ~double throughput?

* One more

* Better benchmark

* Update to later Go version, lose the build cache

* Add go.sum just to make the Actions cache happy

* Rename sLen → suffixLength

* Add name to contributors
  • Loading branch information
clipperhouse authored Aug 13, 2024
1 parent 98e7b6e commit 2569472
Show file tree
Hide file tree
Showing 36 changed files with 29,690 additions and 29,669 deletions.
16 changes: 3 additions & 13 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,15 @@ jobs:
test:
strategy:
matrix:
go-version: [1.14.x, 1.15.x]
go-version: [1.19.x, 1.20.x, 1.21.x, 1.22.x]
os: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Install Go
uses: actions/setup-go@v2
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}
- name: Checkout code
uses: actions/checkout@v2
- uses: actions/cache@v2
with:
path: |
~/go/pkg/mod # Module download cache
~/.cache/go-build # Build cache (Linux)
~/Library/Caches/go-build # Build cache (Mac)
'%LocalAppData%\go-build' # Build cache (Windows)
key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
restore-keys: |
${{ runner.os }}-go-
uses: actions/checkout@v4
- name: Test
run: go test ./...
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ I know of a few other stemmers available in Go:
* [Anton Södergren](https://github.com/AAAton)
* [Eivind Moland](https://github.com/eivindam)
* [Tamás Gulácsi](https://github.com/tgulacsi)
* [@clipperhouse](https://github.com/clipperhouse)
* Your name should be here!


Expand Down
16 changes: 4 additions & 12 deletions english/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (

// Replaces all different kinds of apostrophes with a single
// kind: "'" -- that is, "\x27", or unicode codepoint 39.
//
func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int) {
for i, r := range word.RS {
switch r {
Expand All @@ -27,7 +26,6 @@ func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int

// Trim off leading apostrophes. (Slight variation from
// NLTK implementation here, in which only the first is removed.)
//
func trimLeftApostrophes(word *snowballword.SnowballWord) {
var (
numApostrophes int
Expand All @@ -49,7 +47,6 @@ func trimLeftApostrophes(word *snowballword.SnowballWord) {
}

// Capitalize all 'Y's preceded by vowels or starting a word
//
func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) {
for i, r := range word.RS {

Expand All @@ -64,7 +61,6 @@ func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) {
}

// Uncapitalize all 'Y's
//
func uncapitalizeYs(word *snowballword.SnowballWord) {
for i, r := range word.RS {

Expand All @@ -88,10 +84,9 @@ func uncapitalizeYs(word *snowballword.SnowballWord) {
// is no such non-vowel.
//
// See http://snowball.tartarus.org/texts/r1r2.html
//
func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) {

specialPrefix, _ := word.FirstPrefix("gener", "commun", "arsen")
specialPrefix := word.FirstPrefix("gener", "commun", "arsen")

if specialPrefix != "" {
r1start = len(specialPrefix)
Expand All @@ -103,7 +98,6 @@ func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) {
}

// Checks if a rune is a lowercase English vowel.
//
func isLowerVowel(r rune) bool {
switch r {
case 97, 101, 105, 111, 117, 121:
Expand All @@ -114,7 +108,6 @@ func isLowerVowel(r rune) bool {

// Returns the stemmed version of a word if it is a special
// case, otherwise returns the empty string.
//
func stemSpecialWord(word string) (stemmed string) {
switch word {
case "skis":
Expand Down Expand Up @@ -202,7 +195,6 @@ func stemSpecialWord(word string) (stemmed string) {
}

// Return `true` if the input `word` is an English stop word.
//
func IsStopWord(word string) bool {
switch word {
case "a", "about", "above", "after", "again", "against", "all", "am", "an",
Expand All @@ -226,7 +218,6 @@ func IsStopWord(word string) bool {
}

// A word is called short if it ends in a short syllable, and if R1 is null.
//
func isShortWord(w *snowballword.SnowballWord) (isShort bool) {

// If r1 is not empty, the word is not short
Expand All @@ -241,9 +232,10 @@ func isShortWord(w *snowballword.SnowballWord) (isShort bool) {
// Return true if the indices at `w.RS[:i]` end in a short syllable.
// Define a short syllable in a word as either
// (a) a vowel followed by a non-vowel other than w, x or Y
// and preceded by a non-vowel, or
// (b) a vowel at the beginning of the word followed by a non-vowel.
//
// and preceded by a non-vowel, or
//
// (b) a vowel at the beginning of the word followed by a non-vowel.
func endsShortSyllable(w *snowballword.SnowballWord, i int) bool {

if i == 2 {
Expand Down
8 changes: 3 additions & 5 deletions english/english_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/*
Herein lie all the tests of the Snowball English stemmer.
Herein lie all the tests of the Snowball English stemmer.
Many of the tests are drawn from cases where this implementation
did not match the results of the Python NLTK implementation.
Many of the tests are drawn from cases where this implementation
did not match the results of the Python NLTK implementation.
*/
package english

Expand All @@ -15,7 +15,6 @@ import (

// Test stopWords for things we know should be true
// or false.
//
func Test_stopWords(t *testing.T) {

// Test true
Expand Down Expand Up @@ -47,7 +46,6 @@ func Test_stopWords(t *testing.T) {

// Test specialWords for things we know should be present
// and not present.
//
func Test_specialWords(t *testing.T) {

// Test true
Expand Down
8 changes: 5 additions & 3 deletions english/step0.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 0 is to strip off apostrophes and "s".
//
func step0(w *snowballword.SnowballWord) bool {
suffix, suffixRunes := w.FirstSuffix("'s'", "'s", "'")
suffix := w.FirstSuffix("'s'", "'s", "'")
if suffix == "" {
return false
}
w.RemoveLastNRunes(len(suffixRunes))
suffixLength := utf8.RuneCountInString(suffix)
w.RemoveLastNRunes(suffixLength)
return true
}
13 changes: 7 additions & 6 deletions english/step1a.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 1a is normalization of various special "s"-endings.
//
func step1a(w *snowballword.SnowballWord) bool {

suffix, suffixRunes := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s")
suffix := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s")
switch suffix {

case "sses":

// Replace by ss
w.ReplaceSuffixRunes(suffixRunes, []rune("ss"), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune("ss"), true)
return true

case "ies", "ied":
Expand All @@ -28,7 +29,7 @@ func step1a(w *snowballword.SnowballWord) bool {
} else {
repl = "ie"
}
w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
return true

case "us", "ss":
Expand All @@ -37,14 +38,14 @@ func step1a(w *snowballword.SnowballWord) bool {
return false

case "s":

// Delete if the preceding word part contains a vowel
// not immediately before the s (so gas and this retain
// the s, gaps and kiwis lose it)
//
suffixLength := utf8.RuneCountInString(suffix)
for i := 0; i < len(w.RS)-2; i++ {
if isLowerVowel(w.RS[i]) {
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)
return true
}
}
Expand Down
18 changes: 10 additions & 8 deletions english/step1b.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 1b is the normalization of various "ly" and "ed" suffixes.
//
func step1b(w *snowballword.SnowballWord) bool {

suffix, suffixRunes := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed")
suffix := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed")
suffixLength := utf8.RuneCountInString(suffix)

switch suffix {

Expand All @@ -19,14 +21,14 @@ func step1b(w *snowballword.SnowballWord) bool {
case "eed", "eedly":

// Replace by ee if in R1
if len(suffixRunes) <= len(w.RS)-w.R1start {
w.ReplaceSuffixRunes(suffixRunes, []rune("ee"), true)
if suffixLength <= len(w.RS)-w.R1start {
w.ReplaceSuffixRunes([]rune(suffix), []rune("ee"), true)
}
return true

case "ed", "edly", "ing", "ingly":
hasLowerVowel := false
for i := 0; i < len(w.RS)-len(suffixRunes); i++ {
for i := 0; i < len(w.RS)-suffixLength; i++ {
if isLowerVowel(w.RS[i]) {
hasLowerVowel = true
break
Expand All @@ -45,11 +47,11 @@ func step1b(w *snowballword.SnowballWord) bool {
originalR2start := w.R2start

// Delete if the preceding word part contains a vowel
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)

// ...and after the deletion...

newSuffix, newSuffixRunes := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
newSuffix := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
switch newSuffix {

case "":
Expand All @@ -68,7 +70,7 @@ func step1b(w *snowballword.SnowballWord) bool {
case "at", "bl", "iz":

// If the word ends "at", "bl" or "iz" add "e"
w.ReplaceSuffixRunes(newSuffixRunes, []rune(newSuffix+"e"), true)
w.ReplaceSuffixRunes([]rune(newSuffix), []rune(newSuffix+"e"), true)

case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt":

Expand Down
14 changes: 8 additions & 6 deletions english/step2.go
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 2 is the stemming of various endings found in
// R1 including "al", "ness", and "li".
//
func step2(w *snowballword.SnowballWord) bool {

// Possible suffixes for this step, longest first.
suffix, suffixRunes := w.FirstSuffix(
suffix := w.FirstSuffix(
"ational", "fulness", "iveness", "ization", "ousness",
"biliti", "lessli", "tional", "alism", "aliti", "ation",
"entli", "fulli", "iviti", "ousli", "anci", "abli",
"alli", "ator", "enci", "izer", "bli", "ogi", "li",
)
suffixLength := utf8.RuneCountInString(suffix)

// If it is not in R1, do nothing
if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start {
if suffix == "" || suffixLength > len(w.RS)-w.R1start {
return false
}

Expand All @@ -39,7 +41,7 @@ func step2(w *snowballword.SnowballWord) bool {
if rsLen >= 3 {
switch w.RS[rsLen-3] {
case 99, 100, 101, 103, 104, 107, 109, 110, 114, 116:
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)
return true
}
}
Expand All @@ -52,7 +54,7 @@ func step2(w *snowballword.SnowballWord) bool {
//
rsLen := len(w.RS)
if rsLen >= 4 && w.RS[rsLen-4] == 108 {
w.ReplaceSuffixRunes(suffixRunes, []rune("og"), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune("og"), true)
}
return true
}
Expand Down Expand Up @@ -91,7 +93,7 @@ func step2(w *snowballword.SnowballWord) bool {
case "lessli":
repl = "less"
}
w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
return true

}
13 changes: 8 additions & 5 deletions english/step3.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 3 is the stemming of various longer suffixes
// found in R1.
//
func step3(w *snowballword.SnowballWord) bool {

suffix, suffixRunes := w.FirstSuffix(
suffix := w.FirstSuffix(
"ational", "tional", "alize", "icate", "ative",
"iciti", "ical", "ful", "ness",
)

suffixLength := utf8.RuneCountInString(suffix)

// If it is not in R1, do nothing
if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start {
if suffix == "" || suffixLength > len(w.RS)-w.R1start {
return false
}

Expand All @@ -28,7 +31,7 @@ func step3(w *snowballword.SnowballWord) bool {
// If in R2, delete.
//
if len(w.RS)-w.R2start >= 5 {
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)
return true
}
return false
Expand All @@ -50,7 +53,7 @@ func step3(w *snowballword.SnowballWord) bool {
case "ful", "ness":
repl = ""
}
w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
return true

}
Loading

0 comments on commit 2569472

Please sign in to comment.