Skip to content

Commit

Permalink
MB-56946: update analyzer interface and registry
Browse files Browse the repository at this point in the history
+ Update Analyzer interface to
   + handle errors produced during analysis.
   + use "any" as the return type of Analyze (so it can produce values other than a token stream, such as embedding vectors)
+ Update the analyzers registry to allow runtime registration/removal

+ This is to let bleve users register/deregister analyzers at runtime and use them in their index mappings

NOTE:
This will break backward compatibility for bleve customers registering their own analyzer implementations.
  • Loading branch information
moshaad7 committed Sep 9, 2024
1 parent e9c45ff commit c452e11
Show file tree
Hide file tree
Showing 42 changed files with 292 additions and 66 deletions.
5 changes: 4 additions & 1 deletion analysis/benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ func BenchmarkAnalysis(b *testing.B) {
b.Fatal(err)
}

ts := analyzer.Analyze(bleveWikiArticle)
ts, err := analysis.AnalyzeForTokens(analyzer, bleveWikiArticle)
if err != nil {
b.Fatalf("error analyzing text: %v", err)
}
freqs := analysis.TokenFrequency(ts, nil, index.IncludeTermVectors)
if len(freqs) != 511 {
b.Errorf("expected %d freqs, got %d", 511, len(freqs))
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/ar/analyzer_ar_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,10 @@ func TestArabicAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/cjk/analyzer_cjk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,10 @@ func TestCJKAnalyzer(t *testing.T) {
if err != nil {
t.Fatal(err)
}
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/ckb/analyzer_ckb_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,10 @@ func TestSoraniAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
Expand Down
8 changes: 6 additions & 2 deletions analysis/lang/ckb/sorani_stemmer_filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func TestSoraniStemmerFilter(t *testing.T) {

// in order to match the lucene tests
// we will test with an analyzer, not just the stemmer
analyzer := analysis.DefaultAnalyzer{
analyzer := &analysis.DefaultAnalyzer{
Tokenizer: single.NewSingleTokenTokenizer(),
TokenFilters: []analysis.TokenFilter{
NewSoraniNormalizeFilter(),
Expand Down Expand Up @@ -283,7 +283,11 @@ func TestSoraniStemmerFilter(t *testing.T) {
}

for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Errorf("error analyzing input: %v", err)
}

if !reflect.DeepEqual(actual, test.output) {
t.Errorf("for input %s(% x)", test.input, test.input)
t.Errorf("\texpected:")
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/da/analyzer_da_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ func TestDanishAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/de/analyzer_de_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,10 @@ func TestGermanAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/en/analyzer_en_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ func TestEnglishAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/es/analyzer_es_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@ func TestSpanishAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
Expand Down
15 changes: 12 additions & 3 deletions analysis/lang/fa/analyzer_fa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,10 @@ func TestPersianAnalyzerVerbs(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down Expand Up @@ -600,7 +603,10 @@ func TestPersianAnalyzerVerbsDefective(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down Expand Up @@ -671,7 +677,10 @@ func TestPersianAnalyzerOthers(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/fi/analyzer_fi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ func TestFinishAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/fr/analyzer_fr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,10 @@ func TestFrenchAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/hi/analyzer_hi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ func TestHindiAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/hr/analyzer_hr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,10 @@ func TestCroatianAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/hu/analyzer_hu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ func TestHungarianAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/it/analyzer_it_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ func TestItalianAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/nl/analyzer_nl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ func TestDutchAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/no/analyzer_no_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ func TestNorwegianAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/pl/analyzer_pl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,10 @@ func TestPolishAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/pt/analyzer_pt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ func TestPortugueseAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/ro/analyzer_ro_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ func TestRomanianAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/ru/analyzer_ru_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ func TestRussianAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/sv/analyzer_sv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ func TestSwedishAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
5 changes: 4 additions & 1 deletion analysis/lang/tr/analyzer_tr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ func TestTurkishAnalyzer(t *testing.T) {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
actual, err := analysis.AnalyzeForTokens(analyzer, test.input)
if err != nil {
t.Fatalf("error analyzing input: %v", err)
}
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
Expand Down
Loading

0 comments on commit c452e11

Please sign in to comment.