Reduce allocations (#29)

* First pass: allocs down to 75, ~double throughput?
* One more
* Better benchmark
* Update to later Go version, lose the build cache
* Add go.sum just to make the Actions cache happy
* Rename sLen → suffixLength
* Add name to contributors
kljensen · Aug 13, 2024 · 2569472 · 2569472
1 parent 98e7b6e
commit 2569472
Show file tree

Hide file tree

Showing 36 changed files with 29,690 additions and 29,669 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -4,25 +4,15 @@ jobs:
   test:
     strategy:
       matrix:
-        go-version: [1.14.x, 1.15.x]
+        go-version: [1.19.x, 1.20.x, 1.21.x, 1.22.x]
         os: [ubuntu-latest, macos-latest, windows-latest]
     runs-on: ${{ matrix.os }}
     steps:
     - name: Install Go
-      uses: actions/setup-go@v2
+      uses: actions/setup-go@v5
       with:
         go-version: ${{ matrix.go-version }}
     - name: Checkout code
-      uses: actions/checkout@v2
-    - uses: actions/cache@v2
-      with:
-        path: |
-          ~/go/pkg/mod              # Module download cache
-          ~/.cache/go-build         # Build cache (Linux)
-          ~/Library/Caches/go-build # Build cache (Mac)
-          '%LocalAppData%\go-build' # Build cache (Windows)
-        key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
-        restore-keys: |
-          ${{ runner.os }}-go-
+      uses: actions/checkout@v4
     - name: Test
       run: go test ./...
diff --git a/README.md b/README.md
@@ -118,6 +118,7 @@ I know of a few other stemmers available in Go:
 * [Anton Södergren](https://github.com/AAAton)
 * [Eivind Moland](https://github.com/eivindam)
 * [ Tamás Gulácsi](https://github.com/tgulacsi)
+* [@clipperhouse](https://github.com/clipperhouse)
 * Your name should be here!
 
 

diff --git a/english/common.go b/english/common.go
@@ -7,7 +7,6 @@ import (
 
 // Replaces all different kinds of apostrophes with a single
 // kind: "'" -- that is, "\x27", or unicode codepoint 39.
-//
 func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int) {
 	for i, r := range word.RS {
 		switch r {
@@ -27,7 +26,6 @@ func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int
 
 // Trim off leading apostrophes.  (Slight variation from
 // NLTK implementation here, in which only the first is removed.)
-//
 func trimLeftApostrophes(word *snowballword.SnowballWord) {
 	var (
 		numApostrophes int
@@ -49,7 +47,6 @@ func trimLeftApostrophes(word *snowballword.SnowballWord) {
 }
 
 // Capitalize all 'Y's preceded by vowels or starting a word
-//
 func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) {
 	for i, r := range word.RS {
 
@@ -64,7 +61,6 @@ func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) {
 }
 
 // Uncapitalize all 'Y's
-//
 func uncapitalizeYs(word *snowballword.SnowballWord) {
 	for i, r := range word.RS {
 
@@ -88,10 +84,9 @@ func uncapitalizeYs(word *snowballword.SnowballWord) {
 // is no such non-vowel.
 //
 // See http://snowball.tartarus.org/texts/r1r2.html
-//
 func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) {
 
-	specialPrefix, _ := word.FirstPrefix("gener", "commun", "arsen")
+	specialPrefix := word.FirstPrefix("gener", "commun", "arsen")
 
 	if specialPrefix != "" {
 		r1start = len(specialPrefix)
@@ -103,7 +98,6 @@ func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) {
 }
 
 // Checks if a rune is a lowercase English vowel.
-//
 func isLowerVowel(r rune) bool {
 	switch r {
 	case 97, 101, 105, 111, 117, 121:
@@ -114,7 +108,6 @@ func isLowerVowel(r rune) bool {
 
 // Returns the stemmed version of a word if it is a special
 // case, otherwise returns the empty string.
-//
 func stemSpecialWord(word string) (stemmed string) {
 	switch word {
 	case "skis":
@@ -202,7 +195,6 @@ func stemSpecialWord(word string) (stemmed string) {
 }
 
 // Return `true` if the input `word` is an English stop word.
-//
 func IsStopWord(word string) bool {
 	switch word {
 	case "a", "about", "above", "after", "again", "against", "all", "am", "an",
@@ -226,7 +218,6 @@ func IsStopWord(word string) bool {
 }
 
 // A word is called short if it ends in a short syllable, and if R1 is null.
-//
 func isShortWord(w *snowballword.SnowballWord) (isShort bool) {
 
 	// If r1 is not empty, the word is not short
@@ -241,9 +232,10 @@ func isShortWord(w *snowballword.SnowballWord) (isShort bool) {
 // Return true if the indices at `w.RS[:i]` end in a short syllable.
 // Define a short syllable in a word as either
 // (a) a vowel followed by a non-vowel other than w, x or Y
-//     and preceded by a non-vowel, or
-// (b) a vowel at the beginning of the word followed by a non-vowel.
 //
+//	and preceded by a non-vowel, or
+//
+// (b) a vowel at the beginning of the word followed by a non-vowel.
 func endsShortSyllable(w *snowballword.SnowballWord, i int) bool {
 
 	if i == 2 {

diff --git a/english/english_test.go b/english/english_test.go
@@ -1,8 +1,8 @@
 /*
-	Herein lie all the tests of the Snowball English stemmer.
+Herein lie all the tests of the Snowball English stemmer.
 
-	Many of the tests are drawn from cases where this implementation
-	did not match the results of the Python NLTK implementation.
+Many of the tests are drawn from cases where this implementation
+did not match the results of the Python NLTK implementation.
 */
 package english
 
@@ -15,7 +15,6 @@ import (
 
 // Test stopWords for things we know should be true
 // or false.
-//
 func Test_stopWords(t *testing.T) {
 
 	// Test true
@@ -47,7 +46,6 @@ func Test_stopWords(t *testing.T) {
 
 // Test specialWords for things we know should be present
 // and not present.
-//
 func Test_specialWords(t *testing.T) {
 
 	// Test true

diff --git a/english/step0.go b/english/step0.go
@@ -1,16 +1,18 @@
 package english
 
 import (
+	"unicode/utf8"
+
 	"github.com/kljensen/snowball/snowballword"
 )
 
 // Step 0 is to strip off apostrophes and "s".
-//
 func step0(w *snowballword.SnowballWord) bool {
-	suffix, suffixRunes := w.FirstSuffix("'s'", "'s", "'")
+	suffix := w.FirstSuffix("'s'", "'s", "'")
 	if suffix == "" {
 		return false
 	}
-	w.RemoveLastNRunes(len(suffixRunes))
+	suffixLength := utf8.RuneCountInString(suffix)
+	w.RemoveLastNRunes(suffixLength)
 	return true
 }
diff --git a/english/step1a.go b/english/step1a.go
@@ -1,20 +1,21 @@
 package english
 
 import (
+	"unicode/utf8"
+
 	"github.com/kljensen/snowball/snowballword"
 )
 
 // Step 1a is normalization of various special "s"-endings.
-//
 func step1a(w *snowballword.SnowballWord) bool {
 
-	suffix, suffixRunes := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s")
+	suffix := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s")
 	switch suffix {
 
 	case "sses":
 
 		// Replace by ss
-		w.ReplaceSuffixRunes(suffixRunes, []rune("ss"), true)
+		w.ReplaceSuffixRunes([]rune(suffix), []rune("ss"), true)
 		return true
 
 	case "ies", "ied":
@@ -28,7 +29,7 @@ func step1a(w *snowballword.SnowballWord) bool {
 		} else {
 			repl = "ie"
 		}
-		w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
+		w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
 		return true
 
 	case "us", "ss":
@@ -37,14 +38,14 @@ func step1a(w *snowballword.SnowballWord) bool {
 		return false
 
 	case "s":
-
 		// Delete if the preceding word part contains a vowel
 		// not immediately before the s (so gas and this retain
 		// the s, gaps and kiwis lose it)
 		//
+		suffixLength := utf8.RuneCountInString(suffix)
 		for i := 0; i < len(w.RS)-2; i++ {
 			if isLowerVowel(w.RS[i]) {
-				w.RemoveLastNRunes(len(suffixRunes))
+				w.RemoveLastNRunes(suffixLength)
 				return true
 			}
 		}

diff --git a/english/step1b.go b/english/step1b.go
@@ -1,14 +1,16 @@
 package english
 
 import (
+	"unicode/utf8"
+
 	"github.com/kljensen/snowball/snowballword"
 )
 
 // Step 1b is the normalization of various "ly" and "ed" suffixes.
-//
 func step1b(w *snowballword.SnowballWord) bool {
 
-	suffix, suffixRunes := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed")
+	suffix := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed")
+	suffixLength := utf8.RuneCountInString(suffix)
 
 	switch suffix {
 
@@ -19,14 +21,14 @@ func step1b(w *snowballword.SnowballWord) bool {
 	case "eed", "eedly":
 
 		// Replace by ee if in R1
-		if len(suffixRunes) <= len(w.RS)-w.R1start {
-			w.ReplaceSuffixRunes(suffixRunes, []rune("ee"), true)
+		if suffixLength <= len(w.RS)-w.R1start {
+			w.ReplaceSuffixRunes([]rune(suffix), []rune("ee"), true)
 		}
 		return true
 
 	case "ed", "edly", "ing", "ingly":
 		hasLowerVowel := false
-		for i := 0; i < len(w.RS)-len(suffixRunes); i++ {
+		for i := 0; i < len(w.RS)-suffixLength; i++ {
 			if isLowerVowel(w.RS[i]) {
 				hasLowerVowel = true
 				break
@@ -45,11 +47,11 @@ func step1b(w *snowballword.SnowballWord) bool {
 			originalR2start := w.R2start
 
 			// Delete if the preceding word part contains a vowel
-			w.RemoveLastNRunes(len(suffixRunes))
+			w.RemoveLastNRunes(suffixLength)
 
 			// ...and after the deletion...
 
-			newSuffix, newSuffixRunes := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
+			newSuffix := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
 			switch newSuffix {
 
 			case "":
@@ -68,7 +70,7 @@ func step1b(w *snowballword.SnowballWord) bool {
 			case "at", "bl", "iz":
 
 				// If the word ends "at", "bl" or "iz" add "e"
-				w.ReplaceSuffixRunes(newSuffixRunes, []rune(newSuffix+"e"), true)
+				w.ReplaceSuffixRunes([]rune(newSuffix), []rune(newSuffix+"e"), true)
 
 			case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt":
 

diff --git a/english/step2.go b/english/step2.go
@@ -1,24 +1,26 @@
 package english
 
 import (
+	"unicode/utf8"
+
 	"github.com/kljensen/snowball/snowballword"
 )
 
 // Step 2 is the stemming of various endings found in
 // R1 including "al", "ness", and "li".
-//
 func step2(w *snowballword.SnowballWord) bool {
 
 	// Possible suffixes for this step, longest first.
-	suffix, suffixRunes := w.FirstSuffix(
+	suffix := w.FirstSuffix(
 		"ational", "fulness", "iveness", "ization", "ousness",
 		"biliti", "lessli", "tional", "alism", "aliti", "ation",
 		"entli", "fulli", "iviti", "ousli", "anci", "abli",
 		"alli", "ator", "enci", "izer", "bli", "ogi", "li",
 	)
+	suffixLength := utf8.RuneCountInString(suffix)
 
 	// If it is not in R1, do nothing
-	if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start {
+	if suffix == "" || suffixLength > len(w.RS)-w.R1start {
 		return false
 	}
 
@@ -39,7 +41,7 @@ func step2(w *snowballword.SnowballWord) bool {
 		if rsLen >= 3 {
 			switch w.RS[rsLen-3] {
 			case 99, 100, 101, 103, 104, 107, 109, 110, 114, 116:
-				w.RemoveLastNRunes(len(suffixRunes))
+				w.RemoveLastNRunes(suffixLength)
 				return true
 			}
 		}
@@ -52,7 +54,7 @@ func step2(w *snowballword.SnowballWord) bool {
 		//
 		rsLen := len(w.RS)
 		if rsLen >= 4 && w.RS[rsLen-4] == 108 {
-			w.ReplaceSuffixRunes(suffixRunes, []rune("og"), true)
+			w.ReplaceSuffixRunes([]rune(suffix), []rune("og"), true)
 		}
 		return true
 	}
@@ -91,7 +93,7 @@ func step2(w *snowballword.SnowballWord) bool {
 	case "lessli":
 		repl = "less"
 	}
-	w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
+	w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
 	return true
 
 }
diff --git a/english/step3.go b/english/step3.go
@@ -1,21 +1,24 @@
 package english
 
 import (
+	"unicode/utf8"
+
 	"github.com/kljensen/snowball/snowballword"
 )
 
 // Step 3 is the stemming of various longer suffixes
 // found in R1.
-//
 func step3(w *snowballword.SnowballWord) bool {
 
-	suffix, suffixRunes := w.FirstSuffix(
+	suffix := w.FirstSuffix(
 		"ational", "tional", "alize", "icate", "ative",
 		"iciti", "ical", "ful", "ness",
 	)
 
+	suffixLength := utf8.RuneCountInString(suffix)
+
 	// If it is not in R1, do nothing
-	if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start {
+	if suffix == "" || suffixLength > len(w.RS)-w.R1start {
 		return false
 	}
 
@@ -28,7 +31,7 @@ func step3(w *snowballword.SnowballWord) bool {
 		// If in R2, delete.
 		//
 		if len(w.RS)-w.R2start >= 5 {
-			w.RemoveLastNRunes(len(suffixRunes))
+			w.RemoveLastNRunes(suffixLength)
 			return true
 		}
 		return false
@@ -50,7 +53,7 @@ func step3(w *snowballword.SnowballWord) bool {
 	case "ful", "ness":
 		repl = ""
 	}
-	w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
+	w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
 	return true
 
 }