Skip to content

Commit

Permalink
add basic gse bleve function
Browse files Browse the repository at this point in the history
  • Loading branch information
vcaesar committed Oct 3, 2021
1 parent 7b94c49 commit d034aa5
Show file tree
Hide file tree
Showing 8 changed files with 369 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
.idea

examples/examples
examples/test.blv

# Binaries for programs and plugins
*.exe
Expand Down
46 changes: 45 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,48 @@
[![codecov](https://codecov.io/gh/vcaesar/gse-bleve/branch/master/graph/badge.svg)](https://codecov.io/gh/vcaesar/gse-bleve)
[![Go Report Card](https://goreportcard.com/badge/github.com/vcaesar/gse-bleve)](https://goreportcard.com/report/github.com/vcaesar/gse-bleve)
[![GoDoc](https://godoc.org/github.com/vcaesar/gse-bleve?status.svg)](https://godoc.org/github.com/vcaesar/gse-bleve)
[![Release](https://github-release-version.herokuapp.com/github/vcaesar/gse-bleve/release.svg?style=flat)](https://github.com/vcaesar/gse-bleve/releases/latest)
[![Release](https://github-release-version.herokuapp.com/github/vcaesar/gse-bleve/release.svg?style=flat)](https://github.com/vcaesar/gse-bleve/releases/latest)


## Use

```go
package main

import (
"fmt"
"os"

"github.com/blevesearch/bleve/v2"
gse "github.com/vcaesar/gse-bleve"
)

// main builds a gse-backed bleve index at test.blv, indexes a few documents,
// runs a highlighted query-string search and removes the index directory.
func main() {
	opt := gse.Option{
		Index: "test.blv",
		Dicts: "emend, zh", Stop: "",
		Opt: "search-hmm", Trim: "trim"}

	index, err := gse.New(opt)
	if err != nil {
		// The index may not be usable after a failed New, so stop here
		// instead of calling methods on it.
		fmt.Println("new mapping error is: ", err)
		return
	}
	// Remove the on-disk index on every exit path once it has been created.
	defer os.RemoveAll("test.blv")

	text := `他在命运的沉浮中随波逐流, 扮演着受害与加害者的双重角色`
	docs := []struct{ id, body string }{
		{"1", text},
		{"3", text + "沉浮"},
		{"4", `In view, a humble vaudevillian veteran cast vicariously as both victim and villain vicissitudes of fate.`},
		{"2", `It's difficult to understand the sum of a person's life.`},
	}
	// Check every Index call so that no failure goes unreported.
	for _, doc := range docs {
		if err = index.Index(doc.id, doc.body); err != nil {
			fmt.Println("index error: ", err)
		}
	}

	query := "命运的沉浮"
	req := bleve.NewSearchRequest(bleve.NewQueryStringQuery(query))
	req.Highlight = bleve.NewHighlight()
	res, err := index.Search(req)
	fmt.Println(res, err)
}
```
30 changes: 30 additions & 0 deletions analyzer.go
Original file line number Diff line number Diff line change
@@ -1 +1,31 @@
// Copyright 2016 Evans. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gsebleve

import (
"errors"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

// analyzerConstructor builds a bleve analyzer from config. The config must
// hold a string under the "tokenizer" key; that name is resolved through
// cache and the resulting tokenizer is the only component set on the
// returned analyzer. A missing or non-string entry, or a failed lookup,
// yields a nil analyzer and an error.
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	name, isString := config["tokenizer"].(string)
	if !isString {
		return nil, errors.New("must have tokenizer")
	}

	tk, lookupErr := cache.TokenizerNamed(name)
	if lookupErr != nil {
		return nil, lookupErr
	}

	return &analysis.Analyzer{Tokenizer: tk}, nil
}

// init registers analyzerConstructor with the bleve registry under TokenName.
func init() {
registry.RegisterAnalyzer(TokenName, analyzerConstructor)
}
153 changes: 153 additions & 0 deletions bleve.go
Original file line number Diff line number Diff line change
@@ -1 +1,154 @@
// Copyright 2016 Evans. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gsebleve

import (
"strings"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/go-ego/gse"
)

const (
// TokenName is the name passed to the bleve registry when this package
// registers its components.
TokenName = "gse"
)

// GseCut is a tokenizer backed by a gse segmenter; its Tokenize method turns
// text into a bleve token stream.
type GseCut struct {
// seg is the segmenter that performs the cutting, trimming and analysis.
seg *gse.Segmenter
// stop string
// opt is the cut mode that Tokenize passes to Cut.
opt string
// trim selects the filtering done by Trim; the values "html" and "url" are
// handled in Cut instead.
trim string
}

// NewGse creates a gse-backed tokenizer.
//
// dicts names the dictionaries to load and defaults to "zh" when empty. When
// it contains "emend", the first "emend, " marker is stripped and the rest is
// loaded with LoadDictEmbed; otherwise it is loaded with LoadDict.
// stop names the stop-word dictionaries and follows the same "emend"
// convention (LoadStopEmbed / LoadStop); an empty stop loads nothing.
// opt and trim are stored as given and later select the cut mode and the
// trimming applied by Tokenize.
//
// The returned error is the first failure reported while loading the
// dictionaries or the stop words. The tokenizer is returned even when the
// error is non-nil; callers should check the error before using it.
func NewGse(dicts, stop, opt, trim string) (*GseCut, error) {
	var (
		seg gse.Segmenter
		err error
	)

	seg.SkipLog = true
	if dicts == "" {
		dicts = "zh"
	}

	if strings.Contains(dicts, "emend") {
		dicts = strings.Replace(dicts, "emend, ", "", 1)
		err = seg.LoadDictEmbed(dicts)
	} else {
		err = seg.LoadDict(dicts)
	}

	if stop != "" {
		var stopErr error
		if strings.Contains(stop, "emend") {
			stop = strings.Replace(stop, "emend, ", "", 1)
			stopErr = seg.LoadStopEmbed(stop)
		} else {
			stopErr = seg.LoadStop(stop)
		}
		// Report a stop-word failure instead of dropping it, but keep an
		// earlier dictionary error as the primary cause.
		if err == nil {
			err = stopErr
		}
	}
	return &GseCut{seg: &seg, opt: opt, trim: trim}, err
}

// Trim filters the cut result according to the configured trim mode:
// "symbol" delegates to TrimSymbol, "punct" to TrimPunct and "trim" to Trim
// on the segmenter. Any other mode returns s unchanged.
func (c *GseCut) Trim(s []string) []string {
	switch c.trim {
	case "symbol":
		return c.seg.TrimSymbol(s)
	case "punct":
		return c.seg.TrimPunct(s)
	case "trim":
		return c.seg.Trim(s)
	default:
		return s
	}
}

// Cut splits text into segments with the segmenter call selected by the
// tokenizer's trim mode and by opt.
//
// The trim mode wins when it is "html" (CutTrimHtml) or "url" (CutUrl).
// Otherwise opt picks the call: "search-hmm", "search" and "search-dag" use
// CutSearch (with true, no flag, and false respectively), "all" uses CutAll,
// and "hmm" / "dag" use Cut with true / false. Any other opt falls back to
// Cut without a flag.
func (c *GseCut) Cut(text string, opt string) []string {
	// Trim-driven modes take precedence over opt.
	switch c.trim {
	case "html":
		return c.seg.CutTrimHtml(text)
	case "url":
		return c.seg.CutUrl(text)
	}

	switch opt {
	case "search-hmm":
		return c.seg.CutSearch(text, true)
	case "search":
		return c.seg.CutSearch(text)
	case "search-dag":
		return c.seg.CutSearch(text, false)
	case "all":
		return c.seg.CutAll(text)
	case "hmm":
		return c.seg.Cut(text, true)
	case "dag":
		return c.seg.Cut(text, false)
	}

	return c.seg.Cut(text)
}

// Tokenize cuts text with the configured cut mode, trims the segments, and
// converts the segmenter's analysis of them into a bleve token stream. Each
// token carries the analyzed text, start, end and position, and is typed as
// analysis.Ideographic. The returned stream is never nil.
func (c *GseCut) Tokenize(text []byte) analysis.TokenStream {
	cuts := c.Trim(c.Cut(string(text), c.opt))
	azs := c.seg.Analyze(cuts)

	// One token is emitted per analyzed segment, so the final length is known.
	result := make(analysis.TokenStream, 0, len(azs))
	for _, az := range azs {
		result = append(result, &analysis.Token{
			Term:     []byte(az.Text),
			Start:    az.Start,
			End:      az.End,
			Position: az.Position,
			Type:     analysis.Ideographic,
		})
	}
	return result
}

// tokenizerConstructor builds the gse tokenizer from a bleve config map. It
// reads the string entries "dicts", "stop", "opt" and "trim"; an entry that
// is missing or not a string is treated as "". The values are handed to
// NewGse, whose result is returned as is. cache is not used.
func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	// A failed type assertion yields the zero value "", which is the fallback
	// for every option.
	option := func(key string) string {
		value, _ := config[key].(string)
		return value
	}

	return NewGse(option("dicts"), option("stop"), option("opt"), option("trim"))
}

// init registers tokenizerConstructor with the bleve registry under TokenName.
func init() {
registry.RegisterTokenizer(TokenName, tokenizerConstructor)
}
37 changes: 37 additions & 0 deletions examples/main.go
Original file line number Diff line number Diff line change
@@ -1 +1,38 @@
package main

import (
"fmt"
"os"

"github.com/blevesearch/bleve/v2"
gse "github.com/vcaesar/gse-bleve"
)

// main builds a gse-backed bleve index at test.blv, indexes a few documents,
// runs a highlighted query-string search and removes the index directory.
func main() {
	opt := gse.Option{
		Index: "test.blv",
		Dicts: "emend, zh", Stop: "",
		Opt: "search-hmm", Trim: "trim"}

	index, err := gse.New(opt)
	if err != nil {
		// The index may not be usable after a failed New, so stop here
		// instead of calling methods on it.
		fmt.Println("new mapping error is: ", err)
		return
	}
	// Remove the on-disk index on every exit path once it has been created.
	defer os.RemoveAll("test.blv")

	text := `他在命运的沉浮中随波逐流, 扮演着受害与加害者的双重角色`
	docs := []struct{ id, body string }{
		{"1", text},
		{"3", text + "沉浮"},
		{"4", `In view, a humble vaudevillian veteran cast vicariously as both victim and villain vicissitudes of fate.`},
		{"2", `It's difficult to understand the sum of a person's life.`},
	}
	// Check every Index call so that no failure goes unreported.
	for _, doc := range docs {
		if err = index.Index(doc.id, doc.body); err != nil {
			fmt.Println("index error: ", err)
		}
	}

	query := "命运的沉浮"
	req := bleve.NewSearchRequest(bleve.NewQueryStringQuery(query))
	req.Highlight = bleve.NewHighlight()
	res, err := index.Search(req)
	fmt.Println(res, err)
}
19 changes: 19 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,26 @@ require (
)

require (
github.com/RoaringBitmap/roaring v0.9.4 // indirect
github.com/bits-and-blooms/bitset v1.2.0 // indirect
github.com/blevesearch/bleve_index_api v1.0.1 // indirect
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
github.com/blevesearch/mmap-go v1.0.3 // indirect
github.com/blevesearch/scorch_segment_api/v2 v2.1.0 // indirect
github.com/blevesearch/segment v0.9.0 // indirect
github.com/blevesearch/snowballstem v0.9.0 // indirect
github.com/blevesearch/upsidedown_store_api v1.0.1 // indirect
github.com/blevesearch/vellum v1.0.6 // indirect
github.com/blevesearch/zapx/v11 v11.3.0 // indirect
github.com/blevesearch/zapx/v12 v12.3.0 // indirect
github.com/blevesearch/zapx/v13 v13.3.0 // indirect
github.com/blevesearch/zapx/v14 v14.3.0 // indirect
github.com/blevesearch/zapx/v15 v15.3.0 // indirect
github.com/golang/protobuf v1.3.2 // indirect
github.com/golang/snappy v0.0.1 // indirect
github.com/mschoch/smat v0.2.0 // indirect
github.com/steveyen/gtreap v0.1.0 // indirect
github.com/vcaesar/cedar v0.10.1 // indirect
go.etcd.io/bbolt v1.3.5 // indirect
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd // indirect
)
Loading

0 comments on commit d034aa5

Please sign in to comment.