Skip to content

Commit

Permalink
Patch in exact search for meilisearch (#29671)
Browse files Browse the repository at this point in the history
meilisearch does not have an search option to contorl fuzzynes per query
right now:
 - meilisearch/meilisearch#1192
 - https://github.com/orgs/meilisearch/discussions/377
 - meilisearch/meilisearch#1096

so we have to create a workaround by post-filter the search result in
gitea until this is addressed.

For future works I added an option in backend only atm, to enable
fuzzynes for issue indexer too.
And also refactored the code so the fuzzy option is equal in logic to
code indexer


---
*Sponsored by Kithara Software GmbH*
  • Loading branch information
6543 authored Mar 9, 2024
1 parent baeb251 commit 7fdc048
Show file tree
Hide file tree
Showing 14 changed files with 184 additions and 33 deletions.
12 changes: 6 additions & 6 deletions modules/indexer/code/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {

// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var (
indexerQuery query.Query
keywordQuery query.Query
)

if isMatch {
prefixQuery := bleve.NewPrefixQuery(keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
} else {
if isFuzzy {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
} else {
prefixQuery := bleve.NewPrefixQuery(keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
}

if len(repoIDs) > 0 {
Expand Down
8 changes: 4 additions & 4 deletions modules/indexer/code/elasticsearch/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
}

// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchType := esMultiMatchTypeBestFields
if isMatch {
searchType = esMultiMatchTypePhrasePrefix
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchType := esMultiMatchTypePhrasePrefix
if isFuzzy {
searchType = esMultiMatchTypeBestFields
}

kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
Expand Down
2 changes: 1 addition & 1 deletion modules/indexer/code/indexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {

for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) {
total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false)
total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true)
assert.NoError(t, err)
assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs)
Expand Down
4 changes: 2 additions & 2 deletions modules/indexer/code/internal/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ type Indexer interface {
internal.Indexer
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
Delete(ctx context.Context, repoID int64) error
Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
}

// NewDummyIndexer returns a dummy indexer
Expand All @@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
return fmt.Errorf("indexer is not ready")
}

func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
return 0, nil, nil, fmt.Errorf("indexer is not ready")
}
5 changes: 3 additions & 2 deletions modules/indexer/code/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
}

// PerformSearch perform a search on a repository
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
if len(keyword) == 0 {
return 0, nil, nil, nil
}

total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch)
total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy)
if err != nil {
return 0, nil, nil, err
}
Expand Down
7 changes: 7 additions & 0 deletions modules/indexer/internal/bleve/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQue
return q
}

// PrefixQuery generates a match prefix query for the given prefix and field
func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
q := bleve.NewPrefixQuery(matchPrefix)
q.FieldVal = field
return q
}

// BoolFieldQuery generates a bool field query for the given value and field
func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
q := bleve.NewBoolFieldQuery(value)
Expand Down
17 changes: 12 additions & 5 deletions modules/indexer/issues/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
var queries []query.Query

if options.Keyword != "" {
keywordQueries := []query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
if options.IsFuzzyKeyword {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
}...))
} else {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.PrefixQuery(options.Keyword, "title"),
inner_bleve.PrefixQuery(options.Keyword, "content"),
inner_bleve.PrefixQuery(options.Keyword, "comments"),
}...))
}
queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...))
}

if len(options.RepoIDs) > 0 || options.AllPublic {
Expand Down
12 changes: 11 additions & 1 deletion modules/indexer/issues/elasticsearch/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ import (

const (
issueIndexerLatestVersion = 1
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrasePrefix = "phrase_prefix"
)

var _ internal.Indexer = &Indexer{}
Expand Down Expand Up @@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
query := elastic.NewBoolQuery()

if options.Keyword != "" {
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))

searchType := esMultiMatchTypePhrasePrefix
if options.IsFuzzyKeyword {
searchType = esMultiMatchTypeBestFields
}

query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
}

if len(options.RepoIDs) > 0 {
Expand Down
2 changes: 2 additions & 0 deletions modules/indexer/issues/internal/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ type SearchResult struct {
type SearchOptions struct {
Keyword string // keyword to search

IsFuzzyKeyword bool // if false the levenshtein distance is 0

RepoIDs []int64 // repository IDs which the issues belong to
AllPublic bool // if include all public repositories

Expand Down
91 changes: 85 additions & 6 deletions modules/indexer/issues/meilisearch/meilisearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package meilisearch

import (
"context"
"errors"
"strconv"
"strings"

Expand All @@ -16,12 +17,15 @@ import (
)

const (
issueIndexerLatestVersion = 2
issueIndexerLatestVersion = 3

// TODO: make this configurable if necessary
maxTotalHits = 10000
)

// ErrMalformedResponse is never expected as we initialize the indexer ourself and so define the types.
var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")

var _ internal.Indexer = &Indexer{}

// Indexer implements Indexer interface
Expand All @@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer {
},
DisplayedAttributes: []string{
"id",
"title",
"content",
"comments",
},
FilterableAttributes: []string{
"repo_id",
Expand Down Expand Up @@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
return nil, err
}

hits := make([]internal.Match, 0, len(searchRes.Hits))
for _, hit := range searchRes.Hits {
hits = append(hits, internal.Match{
ID: int64(hit.(map[string]any)["id"].(float64)),
})
hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
if err != nil {
return nil, err
}

return &internal.SearchResult{
Expand All @@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string {
}
return field + ":asc"
}

// nonFuzzyWorkaround is needed as meilisearch does not have an exact search
// and you can only change "typo tolerance" per index. So we have to post-filter the results
// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
hits := make([]internal.Match, 0, len(searchRes.Hits))
for _, hit := range searchRes.Hits {
hit, ok := hit.(map[string]any)
if !ok {
return nil, ErrMalformedResponse
}

if !isFuzzy {
keyword = strings.ToLower(keyword)

// declare a anon func to check if the title, content or at least one comment contains the keyword
found, err := func() (bool, error) {
// check if title match first
title, ok := hit["title"].(string)
if !ok {
return false, ErrMalformedResponse
} else if strings.Contains(strings.ToLower(title), keyword) {
return true, nil
}

// check if content has a match
content, ok := hit["content"].(string)
if !ok {
return false, ErrMalformedResponse
} else if strings.Contains(strings.ToLower(content), keyword) {
return true, nil
}

// now check for each comment if one has a match
// so we first try to cast and skip if there are no comments
comments, ok := hit["comments"].([]any)
if !ok {
return false, ErrMalformedResponse
} else if len(comments) == 0 {
return false, nil
}

// now we iterate over all and report as soon as we detect one match
for i := range comments {
comment, ok := comments[i].(string)
if !ok {
return false, ErrMalformedResponse
}
if strings.Contains(strings.ToLower(comment), keyword) {
return true, nil
}
}

// we got no match
return false, nil
}()

if err != nil {
return nil, err
} else if !found {
continue
}
}
issueID, ok := hit["id"].(float64)
if !ok {
return nil, ErrMalformedResponse
}
hits = append(hits, internal.Match{
ID: int64(issueID),
})
}
return hits, nil
}
45 changes: 45 additions & 0 deletions modules/indexer/issues/meilisearch/meilisearch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ import (
"testing"
"time"

"code.gitea.io/gitea/modules/indexer/issues/internal"
"code.gitea.io/gitea/modules/indexer/issues/internal/tests"

"github.com/meilisearch/meilisearch-go"
"github.com/stretchr/testify/assert"
)

func TestMeilisearchIndexer(t *testing.T) {
Expand Down Expand Up @@ -48,3 +52,44 @@ func TestMeilisearchIndexer(t *testing.T) {

tests.TestIndexer(t, indexer)
}

func TestNonFuzzyWorkaround(t *testing.T) {
// get unexpected return
_, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{
Hits: []any{"aa", "bb", "cc", "dd"},
}, "bowling", false)
assert.ErrorIs(t, err, ErrMalformedResponse)

validResponse := &meilisearch.SearchResponse{
Hits: []any{
map[string]any{
"id": float64(11),
"title": "a title",
"content": "issue body with no match",
"comments": []any{"hey whats up?", "I'm currently bowling", "nice"},
},
map[string]any{
"id": float64(22),
"title": "Bowling as title",
"content": "",
"comments": []any{},
},
map[string]any{
"id": float64(33),
"title": "Bowl-ing as fuzzy match",
"content": "",
"comments": []any{},
},
},
}

// nonFuzzy
hits, err := nonFuzzyWorkaround(validResponse, "bowling", false)
assert.NoError(t, err)
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits)

// fuzzy
hits, err = nonFuzzyWorkaround(validResponse, "bowling", true)
assert.NoError(t, err)
assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
}
4 changes: 2 additions & 2 deletions routers/web/explore/code.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func Code(ctx *context.Context) {
keyword := ctx.FormTrim("q")

queryType := ctx.FormTrim("t")
isMatch := queryType == "match"
isFuzzy := queryType != "match"

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
Expand Down Expand Up @@ -77,7 +77,7 @@ func Code(ctx *context.Context) {
)

if (len(repoIDs) > 0) || isAdmin {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil {
if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err)
Expand Down
4 changes: 2 additions & 2 deletions routers/web/repo/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func Search(ctx *context.Context) {
keyword := ctx.FormTrim("q")

queryType := ctx.FormTrim("t")
isMatch := queryType == "match"
isFuzzy := queryType != "match"

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
Expand All @@ -43,7 +43,7 @@ func Search(ctx *context.Context) {
}

total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID},
language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil {
if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err)
Expand Down
4 changes: 2 additions & 2 deletions routers/web/user/code.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func CodeSearch(ctx *context.Context) {
keyword := ctx.FormTrim("q")

queryType := ctx.FormTrim("t")
isMatch := queryType == "match"
isFuzzy := queryType != "match"

ctx.Data["Keyword"] = keyword
ctx.Data["Language"] = language
Expand Down Expand Up @@ -75,7 +75,7 @@ func CodeSearch(ctx *context.Context) {
)

if len(repoIDs) > 0 {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
if err != nil {
if code_indexer.IsAvailable(ctx) {
ctx.ServerError("SearchResults", err)
Expand Down

0 comments on commit 7fdc048

Please sign in to comment.