Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce allocations #29

Merged
merged 7 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 3 additions & 13 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,15 @@ jobs:
test:
strategy:
matrix:
go-version: [1.14.x, 1.15.x]
go-version: [1.19.x, 1.20.x, 1.21.x, 1.22.x]
os: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Install Go
uses: actions/setup-go@v2
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}
- name: Checkout code
uses: actions/checkout@v2
- uses: actions/cache@v2
with:
path: |
~/go/pkg/mod # Module download cache
~/.cache/go-build # Build cache (Linux)
~/Library/Caches/go-build # Build cache (Mac)
'%LocalAppData%\go-build' # Build cache (Windows)
key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
restore-keys: |
${{ runner.os }}-go-
uses: actions/checkout@v4
- name: Test
run: go test ./...
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ I know of a few other stemmers available in Go:
* [Anton Södergren](https://github.com/AAAton)
* [Eivind Moland](https://github.com/eivindam)
* [ Tamás Gulácsi](https://github.com/tgulacsi)
* [@clipperhouse](https://github.com/clipperhouse)
* Your name should be here!


Expand Down
16 changes: 4 additions & 12 deletions english/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (

// Replaces all different kinds of apostrophes with a single
// kind: "'" -- that is, "\x27", or unicode codepoint 39.
//
func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int) {
for i, r := range word.RS {
switch r {
Expand All @@ -27,7 +26,6 @@ func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int

// Trim off leading apostrophes. (Slight variation from
// NLTK implementation here, in which only the first is removed.)
//
func trimLeftApostrophes(word *snowballword.SnowballWord) {
var (
numApostrophes int
Expand All @@ -49,7 +47,6 @@ func trimLeftApostrophes(word *snowballword.SnowballWord) {
}

// Capitalize all 'Y's preceded by vowels or starting a word
//
func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) {
for i, r := range word.RS {

Expand All @@ -64,7 +61,6 @@ func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) {
}

// Uncapitalize all 'Y's
//
func uncapitalizeYs(word *snowballword.SnowballWord) {
for i, r := range word.RS {

Expand All @@ -88,10 +84,9 @@ func uncapitalizeYs(word *snowballword.SnowballWord) {
// is no such non-vowel.
//
// See http://snowball.tartarus.org/texts/r1r2.html
//
func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) {

specialPrefix, _ := word.FirstPrefix("gener", "commun", "arsen")
specialPrefix := word.FirstPrefix("gener", "commun", "arsen")

if specialPrefix != "" {
r1start = len(specialPrefix)
Expand All @@ -103,7 +98,6 @@ func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) {
}

// Checks if a rune is a lowercase English vowel.
//
func isLowerVowel(r rune) bool {
switch r {
case 97, 101, 105, 111, 117, 121:
Expand All @@ -114,7 +108,6 @@ func isLowerVowel(r rune) bool {

// Returns the stemmed version of a word if it is a special
// case, otherwise returns the empty string.
//
func stemSpecialWord(word string) (stemmed string) {
switch word {
case "skis":
Expand Down Expand Up @@ -202,7 +195,6 @@ func stemSpecialWord(word string) (stemmed string) {
}

// Return `true` if the input `word` is an English stop word.
//
func IsStopWord(word string) bool {
switch word {
case "a", "about", "above", "after", "again", "against", "all", "am", "an",
Expand All @@ -226,7 +218,6 @@ func IsStopWord(word string) bool {
}

// A word is called short if it ends in a short syllable, and if R1 is null.
//
func isShortWord(w *snowballword.SnowballWord) (isShort bool) {

// If r1 is not empty, the word is not short
Expand All @@ -241,9 +232,10 @@ func isShortWord(w *snowballword.SnowballWord) (isShort bool) {
// Return true if the indices at `w.RS[:i]` end in a short syllable.
// Define a short syllable in a word as either
// (a) a vowel followed by a non-vowel other than w, x or Y
// and preceded by a non-vowel, or
// (b) a vowel at the beginning of the word followed by a non-vowel.
//
// and preceded by a non-vowel, or
//
// (b) a vowel at the beginning of the word followed by a non-vowel.
func endsShortSyllable(w *snowballword.SnowballWord, i int) bool {

if i == 2 {
Expand Down
8 changes: 3 additions & 5 deletions english/english_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/*
Herein lie all the tests of the Snowball English stemmer.
Herein lie all the tests of the Snowball English stemmer.
Many of the tests are drawn from cases where this implementation
did not match the results of the Python NLTK implementation.
Many of the tests are drawn from cases where this implementation
did not match the results of the Python NLTK implementation.
*/
package english

Expand All @@ -15,7 +15,6 @@ import (

// Test stopWords for things we know should be true
// or false.
//
func Test_stopWords(t *testing.T) {

// Test true
Expand Down Expand Up @@ -47,7 +46,6 @@ func Test_stopWords(t *testing.T) {

// Test specialWords for things we know should be present
// and not present.
//
func Test_specialWords(t *testing.T) {

// Test true
Expand Down
8 changes: 5 additions & 3 deletions english/step0.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 0 is to strip off apostrophes and "s".
//
func step0(w *snowballword.SnowballWord) bool {
suffix, suffixRunes := w.FirstSuffix("'s'", "'s", "'")
suffix := w.FirstSuffix("'s'", "'s", "'")
if suffix == "" {
return false
}
w.RemoveLastNRunes(len(suffixRunes))
suffixLength := utf8.RuneCountInString(suffix)
w.RemoveLastNRunes(suffixLength)
return true
}
13 changes: 7 additions & 6 deletions english/step1a.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 1a is normalization of various special "s"-endings.
//
func step1a(w *snowballword.SnowballWord) bool {

suffix, suffixRunes := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s")
suffix := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s")
switch suffix {

case "sses":

// Replace by ss
w.ReplaceSuffixRunes(suffixRunes, []rune("ss"), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune("ss"), true)
return true

case "ies", "ied":
Expand All @@ -28,7 +29,7 @@ func step1a(w *snowballword.SnowballWord) bool {
} else {
repl = "ie"
}
w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
return true

case "us", "ss":
Expand All @@ -37,14 +38,14 @@ func step1a(w *snowballword.SnowballWord) bool {
return false

case "s":

// Delete if the preceding word part contains a vowel
// not immediately before the s (so gas and this retain
// the s, gaps and kiwis lose it)
//
suffixLength := utf8.RuneCountInString(suffix)
for i := 0; i < len(w.RS)-2; i++ {
if isLowerVowel(w.RS[i]) {
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)
return true
}
}
Expand Down
18 changes: 10 additions & 8 deletions english/step1b.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 1b is the normalization of various "ly" and "ed" suffixes.
//
func step1b(w *snowballword.SnowballWord) bool {

suffix, suffixRunes := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed")
suffix := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed")
suffixLength := utf8.RuneCountInString(suffix)

switch suffix {

Expand All @@ -19,14 +21,14 @@ func step1b(w *snowballword.SnowballWord) bool {
case "eed", "eedly":

// Replace by ee if in R1
if len(suffixRunes) <= len(w.RS)-w.R1start {
w.ReplaceSuffixRunes(suffixRunes, []rune("ee"), true)
if suffixLength <= len(w.RS)-w.R1start {
w.ReplaceSuffixRunes([]rune(suffix), []rune("ee"), true)
}
return true

case "ed", "edly", "ing", "ingly":
hasLowerVowel := false
for i := 0; i < len(w.RS)-len(suffixRunes); i++ {
for i := 0; i < len(w.RS)-suffixLength; i++ {
if isLowerVowel(w.RS[i]) {
hasLowerVowel = true
break
Expand All @@ -45,11 +47,11 @@ func step1b(w *snowballword.SnowballWord) bool {
originalR2start := w.R2start

// Delete if the preceding word part contains a vowel
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)

// ...and after the deletion...

newSuffix, newSuffixRunes := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
newSuffix := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
switch newSuffix {

case "":
Expand All @@ -68,7 +70,7 @@ func step1b(w *snowballword.SnowballWord) bool {
case "at", "bl", "iz":

// If the word ends "at", "bl" or "iz" add "e"
w.ReplaceSuffixRunes(newSuffixRunes, []rune(newSuffix+"e"), true)
w.ReplaceSuffixRunes([]rune(newSuffix), []rune(newSuffix+"e"), true)

case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt":

Expand Down
14 changes: 8 additions & 6 deletions english/step2.go
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 2 is the stemming of various endings found in
// R1 including "al", "ness", and "li".
//
func step2(w *snowballword.SnowballWord) bool {

// Possible suffixes for this step, longest first.
suffix, suffixRunes := w.FirstSuffix(
suffix := w.FirstSuffix(
"ational", "fulness", "iveness", "ization", "ousness",
"biliti", "lessli", "tional", "alism", "aliti", "ation",
"entli", "fulli", "iviti", "ousli", "anci", "abli",
"alli", "ator", "enci", "izer", "bli", "ogi", "li",
)
suffixLength := utf8.RuneCountInString(suffix)

// If it is not in R1, do nothing
if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start {
if suffix == "" || suffixLength > len(w.RS)-w.R1start {
return false
}

Expand All @@ -39,7 +41,7 @@ func step2(w *snowballword.SnowballWord) bool {
if rsLen >= 3 {
switch w.RS[rsLen-3] {
case 99, 100, 101, 103, 104, 107, 109, 110, 114, 116:
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)
return true
}
}
Expand All @@ -52,7 +54,7 @@ func step2(w *snowballword.SnowballWord) bool {
//
rsLen := len(w.RS)
if rsLen >= 4 && w.RS[rsLen-4] == 108 {
w.ReplaceSuffixRunes(suffixRunes, []rune("og"), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune("og"), true)
}
return true
}
Expand Down Expand Up @@ -91,7 +93,7 @@ func step2(w *snowballword.SnowballWord) bool {
case "lessli":
repl = "less"
}
w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
return true

}
13 changes: 8 additions & 5 deletions english/step3.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
package english

import (
"unicode/utf8"

"github.com/kljensen/snowball/snowballword"
)

// Step 3 is the stemming of various longer suffixes
// found in R1.
//
func step3(w *snowballword.SnowballWord) bool {

suffix, suffixRunes := w.FirstSuffix(
suffix := w.FirstSuffix(
"ational", "tional", "alize", "icate", "ative",
"iciti", "ical", "ful", "ness",
)

suffixLength := utf8.RuneCountInString(suffix)

// If it is not in R1, do nothing
if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start {
if suffix == "" || suffixLength > len(w.RS)-w.R1start {
return false
}

Expand All @@ -28,7 +31,7 @@ func step3(w *snowballword.SnowballWord) bool {
// If in R2, delete.
//
if len(w.RS)-w.R2start >= 5 {
w.RemoveLastNRunes(len(suffixRunes))
w.RemoveLastNRunes(suffixLength)
return true
}
return false
Expand All @@ -50,7 +53,7 @@ func step3(w *snowballword.SnowballWord) bool {
case "ful", "ness":
repl = ""
}
w.ReplaceSuffixRunes(suffixRunes, []rune(repl), true)
w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true)
return true

}
Loading