Commit
Showing 8 changed files with 369 additions and 1 deletion.
@@ -4,6 +4,7 @@
.idea

examples/examples
examples/test.blv

# Binaries for programs and plugins
*.exe
@@ -1 +1,31 @@
// Copyright 2016 Evans. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gsebleve

import (
	"errors"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

// analyzerConstructor builds an analyzer around the tokenizer named in the config.
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizerName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, errors.New("must have tokenizer")
	}

	tokenizer, err := cache.TokenizerNamed(tokenizerName)
	if err != nil {
		return nil, err
	}

	az := &analysis.Analyzer{Tokenizer: tokenizer}
	return az, nil
}

func init() {
	registry.RegisterAnalyzer(TokenName, analyzerConstructor)
}
@@ -1 +1,154 @@
// Copyright 2016 Evans. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gsebleve

import (
	"strings"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/go-ego/gse"
)

const (
	// TokenName is the name under which the tokenizer and analyzer are registered with bleve.
	TokenName = "gse"
)

// GseCut is the gse cut tokenizer structure.
type GseCut struct {
	seg *gse.Segmenter
	// stop string
	opt  string
	trim string
}

// NewGse creates a gse cut tokenizer.
func NewGse(dicts, stop, opt, trim string) (*GseCut, error) {
	var (
		seg gse.Segmenter
		err error
	)

	seg.SkipLog = true
	if dicts == "" {
		dicts = "zh"
	}

	if strings.Contains(dicts, "emend") {
		dicts = strings.Replace(dicts, "emend, ", "", 1)
		err = seg.LoadDictEmbed(dicts)
	} else {
		err = seg.LoadDict(dicts)
	}

	if stop != "" {
		if strings.Contains(stop, "emend") {
			stop = strings.Replace(stop, "emend, ", "", 1)
			seg.LoadStopEmbed(stop)
		} else {
			seg.LoadStop(stop)
		}
	}

	return &GseCut{&seg, opt, trim}, err
}

// Trim trims unused tokens from the cut result.
func (c *GseCut) Trim(s []string) []string {
	if c.trim == "symbol" {
		return c.seg.TrimSymbol(s)
	}

	if c.trim == "punct" {
		return c.seg.TrimPunct(s)
	}

	if c.trim == "trim" {
		return c.seg.Trim(s)
	}

	return s
}

// Cut cuts the text with the chosen gse cut mode.
func (c *GseCut) Cut(text string, opt string) []string {
	if c.trim == "html" {
		return c.seg.CutTrimHtml(text)
	}

	if c.trim == "url" {
		return c.seg.CutUrl(text)
	}

	if opt == "search-hmm" {
		return c.seg.CutSearch(text, true)
	}

	if opt == "search" {
		return c.seg.CutSearch(text)
	}

	if opt == "search-dag" {
		return c.seg.CutSearch(text, false)
	}

	if opt == "all" {
		return c.seg.CutAll(text)
	}

	if opt == "hmm" {
		return c.seg.Cut(text, true)
	}

	if opt == "dag" {
		return c.seg.Cut(text, false)
	}

	return c.seg.Cut(text)
}

// Tokenize cuts the text into a bleve token stream.
func (c *GseCut) Tokenize(text []byte) analysis.TokenStream {
	result := make(analysis.TokenStream, 0)
	cuts := c.Trim(c.Cut(string(text), c.opt))
	// fmt.Println("cuts: ", cuts)
	azs := c.seg.Analyze(cuts)
	for _, az := range azs {
		token := analysis.Token{
			Term:     []byte(az.Text),
			Start:    az.Start,
			End:      az.End,
			Position: az.Position,
			Type:     analysis.Ideographic,
		}
		result = append(result, &token)
	}

	return result
}

// tokenizerConstructor reads the dicts, stop, opt and trim options from the
// config and builds a GseCut tokenizer.
func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	dicts, ok := config["dicts"].(string)
	if !ok {
		dicts = ""
	}

	stop, ok := config["stop"].(string)
	if !ok {
		stop = ""
	}

	opt, ok := config["opt"].(string)
	if !ok {
		opt = ""
	}

	trim, ok := config["trim"].(string)
	if !ok {
		trim = ""
	}

	return NewGse(dicts, stop, opt, trim)
}

func init() {
	registry.RegisterTokenizer(TokenName, tokenizerConstructor)
}
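The tokenizer above can also be exercised on its own, without going through a bleve index. The sketch below is not part of this commit; it only calls NewGse and Tokenize as defined above, borrows the module path and the "emend, zh" / "search-hmm" / "trim" options from the example file shown next, and prints the resulting token stream.

package main

import (
	"fmt"

	gsebleve "github.com/vcaesar/gse-bleve" // module path taken from the example file below
)

func main() {
	// Options mirror the example file: embedded Chinese dict, search mode with HMM, trim filter.
	tok, err := gsebleve.NewGse("emend, zh", "", "search-hmm", "trim")
	if err != nil {
		fmt.Println("load dict error:", err)
		return
	}

	// Tokenize returns a bleve analysis.TokenStream; each token carries the
	// offsets and position produced by gse's Analyze.
	for _, t := range tok.Tokenize([]byte("他在命运的沉浮中随波逐流")) {
		fmt.Printf("%s [%d:%d] pos=%d\n", t.Term, t.Start, t.End, t.Position)
	}
}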
@@ -1 +1,38 @@
package main

import (
	"fmt"
	"os"

	"github.com/blevesearch/bleve/v2"
	gse "github.com/vcaesar/gse-bleve"
)

func main() {
	opt := gse.Option{
		Index: "test.blv",
		Dicts: "emend, zh", Stop: "",
		Opt: "search-hmm", Trim: "trim"}

	index, err := gse.New(opt)
	if err != nil {
		fmt.Println("new mapping error is: ", err)
	}

	text := `他在命运的沉浮中随波逐流, 扮演着受害与加害者的双重角色`
	err = index.Index("1", text)
	index.Index("3", text+"沉浮")
	index.Index("4", `In view, a humble vaudevillian veteran cast vicariously as both victim and villain vicissitudes of fate.`)
	index.Index("2", `It's difficult to understand the sum of a person's life.`)
	if err != nil {
		fmt.Println("index error: ", err)
	}

	query := "命运的沉浮"
	req := bleve.NewSearchRequest(bleve.NewQueryStringQuery(query))
	req.Highlight = bleve.NewHighlight()
	res, err := index.Search(req)
	fmt.Println(res, err)

	os.RemoveAll("test.blv")
}
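The gse.Option value and gse.New helper used above come from files in this commit that are not shown on this page. As a rough illustration of how the registered "gse" type could be wired into a bleve index mapping by hand, the sketch below uses bleve's AddCustomTokenizer and AddCustomAnalyzer with config keys matching what tokenizerConstructor and analyzerConstructor read; the names "gse-zh" and "gse-analyzer" are chosen only for this sketch, and none of this is necessarily what gse.New actually does.

package main

import (
	"fmt"
	"os"

	"github.com/blevesearch/bleve/v2"
	_ "github.com/vcaesar/gse-bleve" // blank import: the init() functions register the "gse" tokenizer and analyzer types
)

func main() {
	m := bleve.NewIndexMapping()

	// Custom tokenizer of type "gse"; the keys mirror what tokenizerConstructor reads.
	if err := m.AddCustomTokenizer("gse-zh", map[string]interface{}{
		"type": "gse", "dicts": "emend, zh", "stop": "",
		"opt": "search-hmm", "trim": "trim",
	}); err != nil {
		fmt.Println("add tokenizer error:", err)
		return
	}

	// Custom analyzer of type "gse" wrapping that tokenizer; analyzerConstructor
	// only requires the "tokenizer" key.
	if err := m.AddCustomAnalyzer("gse-analyzer", map[string]interface{}{
		"type": "gse", "tokenizer": "gse-zh",
	}); err != nil {
		fmt.Println("add analyzer error:", err)
		return
	}
	m.DefaultAnalyzer = "gse-analyzer"

	index, err := bleve.New("example.blv", m)
	if err != nil {
		fmt.Println("new index error:", err)
		return
	}
	defer os.RemoveAll("example.blv")
	defer index.Close()

	_ = index.Index("1", "他在命运的沉浮中随波逐流")
}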