
Commit

add separator tokenizer, field mapping and more functions
vcaesar committed Oct 6, 2021
1 parent 19d6a89 commit 655a68a
Showing 3 changed files with 147 additions and 13 deletions.
1 change: 1 addition & 0 deletions analyzer.go
@@ -28,4 +28,5 @@ func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 
 func init() {
     registry.RegisterAnalyzer(TokenName, analyzerConstructor)
+    registry.RegisterAnalyzer(SeparateName, analyzerConstructor)
 }
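
Both names now resolve to the same analyzer constructor, so a mapping can declare a "sep"-typed analyzer by name. A minimal sketch of the wiring this registration enables (error handling elided; NewMappingSep, added to index.go later in this commit, wraps exactly these two calls):

    m := bleve.NewIndexMapping()
    _ = m.AddCustomTokenizer("sep", map[string]interface{}{
        "type": "sep", "sep": ",", "trim": "punct",
    })
    _ = m.AddCustomAnalyzer("sep", map[string]interface{}{
        "type": "sep", "tokenizer": "sep",
    })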
88 changes: 78 additions & 10 deletions bleve.go
@@ -13,7 +13,8 @@ import (
 )
 
 const (
-    TokenName = "gse"
+    TokenName    = "gse"
+    SeparateName = "sep"
 )
 
 // GseCut gse cut token structure
@@ -24,14 +25,33 @@ type GseCut struct {
     trim string
 }
 
+// Separator type separator tokenizer struct
+type Separator struct {
+    seg  *gse.Segmenter
+    sep  string
+    trim string
+}
+
+// NewSep create a separator tokenizer
+func NewSep(sep, trim string) (*Separator, error) {
+    var seg gse.Segmenter
+    seg.Dict = gse.NewDict()
+    seg.Init()
+    return &Separator{&seg, sep, trim}, nil
+}
+
 // NewGseCut create a gse cut tokenizer
-func NewGse(dicts, stop, opt, trim string) (*GseCut, error) {
+func NewGse(dicts, stop, opt, trim string, alpha bool) (*GseCut, error) {
     var (
         seg gse.Segmenter
         err error
     )
 
+    seg.SkipLog = true
+    if alpha {
+        seg.AlphaNum = true
+    }
+
     if dicts == "" {
         dicts = "zh"
     }
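
The two constructors can also be called directly. A quick sketch (error handling elided; per the code above, an empty dicts argument falls back to the "zh" dictionary, and alpha=true sets seg.AlphaNum):

    sepTok, _ := NewSep(",", "punct")           // split on commas, then drop punctuation tokens
    gseTok, _ := NewGse("zh", "", "", "", true) // gse segmentation with AlphaNum enabled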
@@ -56,16 +76,26 @@ func NewGse(dicts, stop, opt, trim string) (*GseCut, error) {
 
 // Trim trim the unused token string
 func (c *GseCut) Trim(s []string) []string {
-    if c.trim == "symbol" {
-        return c.seg.TrimSymbol(s)
+    return Trim(s, c.trim, c.seg)
 }
 
+// Trim trim the unused token string
+func (c *Separator) Trim(s []string) []string {
+    return Trim(s, c.trim, c.seg)
+}
+
+// Trim trim the unused token string
+func Trim(s []string, trim string, seg *gse.Segmenter) []string {
+    if trim == "symbol" {
+        return seg.TrimSymbol(s)
+    }
 
-    if c.trim == "punct" {
-        return c.seg.TrimPunct(s)
+    if trim == "punct" {
+        return seg.TrimPunct(s)
     }
 
-    if c.trim == "trim" {
-        return c.seg.Trim(s)
+    if trim == "trim" {
+        return seg.Trim(s)
     }
 
     return s
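
All three trim modes now route through the package-level Trim helper. A small sketch of its behavior, mirroring NewSep's segmenter setup; the expected outputs assume gse's TrimPunct drops punctuation tokens and that an unrecognized mode is a no-op (the final return s above):

    var seg gse.Segmenter
    seg.Dict = gse.NewDict()
    seg.Init()
    fmt.Println(Trim([]string{"go", ",", "bleve"}, "punct", &seg)) // expected: [go bleve]
    fmt.Println(Trim([]string{"go", ",", "bleve"}, "", &seg))      // expected: [go , bleve]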
@@ -126,6 +156,24 @@ func (c *GseCut) Tokenize(text []byte) analysis.TokenStream {
     return result
 }
 
+// Tokenize cut the text to bleve token stream
+func (s *Separator) Tokenize(text []byte) analysis.TokenStream {
+    result := make(analysis.TokenStream, 0)
+    cuts := s.Trim(strings.Split(string(text), s.sep))
+    azs := s.seg.Analyze(cuts)
+    for _, az := range azs {
+        token := analysis.Token{
+            Term:     []byte(az.Text),
+            Start:    az.Start,
+            End:      az.End,
+            Position: az.Position,
+            Type:     analysis.Ideographic,
+        }
+        result = append(result, &token)
+    }
+    return result
+}
+
 func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
     dicts, ok := config["dicts"].(string)
     if !ok {
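
Separator.Tokenize splits the input on the configured separator, trims the pieces, and lets gse's Analyze fill in offsets and positions. A minimal usage sketch (error handling elided):

    sep, _ := NewSep(",", "")
    for _, t := range sep.Tokenize([]byte("red,green,blue")) {
        fmt.Printf("%s %d-%d pos=%d\n", t.Term, t.Start, t.End, t.Position)
    }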
@@ -137,7 +185,7 @@ func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache)
     }
 
     opt, ok := config["opt"].(string)
-    if !ok {
+    if !ok || opt == "" {
         opt = ""
     }
 
@@ -146,9 +194,29 @@ func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache)
         trim = ""
     }
 
-    return NewGse(dicts, stop, opt, trim)
+    alpha, ok := config["alpha"].(bool)
+    if !ok {
+        alpha = false
+    }
+
+    return NewGse(dicts, stop, opt, trim, alpha)
 }
 
+func tokenizerConstructor2(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
+    sep, ok := config["sep"].(string)
+    if !ok {
+        sep = " "
+    }
+
+    trim, ok := config["trim"].(string)
+    if !ok {
+        trim = ""
+    }
+
+    return NewSep(sep, trim)
+}
+
 func init() {
     registry.RegisterTokenizer(TokenName, tokenizerConstructor)
+    registry.RegisterTokenizer(SeparateName, tokenizerConstructor2)
 }
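
bleve hands the config map from AddCustomTokenizer to the constructor registered for its "type". Calling the new constructor directly exercises the same path; it ignores its cache argument in this diff, so nil is fine for a quick check:

    tok, err := tokenizerConstructor2(map[string]interface{}{
        "sep":  "|",
        "trim": "trim",
    }, nil)

Omitting "sep" falls back to a single space, per the !ok branch above.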
71 changes: 68 additions & 3 deletions index.go
@@ -13,6 +13,37 @@ import (
 type Option struct {
     Index string
     Dicts, Stop, Opt, Trim string
+    Alpha bool
+    Name, Sep string
 }
 
+// NewMappingSep new separator mapping
+func NewMappingSep(sep string, trim ...string) (*mapping.IndexMappingImpl, error) {
+    mapping := bleve.NewIndexMapping()
+    trimOpt := ""
+    if len(trim) > 0 {
+        trimOpt = trim[0]
+    }
+
+    err := mapping.AddCustomTokenizer(SeparateName, map[string]interface{}{
+        "type": SeparateName,
+        "sep":  sep,
+        "trim": trimOpt,
+    })
+    if err != nil {
+        return nil, err
+    }
+
+    err = mapping.AddCustomAnalyzer(SeparateName, map[string]interface{}{
+        "type":      SeparateName,
+        "tokenizer": SeparateName,
+    })
+    if err != nil {
+        return nil, err
+    }
+
+    mapping.DefaultAnalyzer = SeparateName
+    return mapping, nil
+}
+
 // NewMapping new bleve index mapping
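
NewMappingSep gives a one-call mapping whose default analyzer splits on a literal separator. A short usage sketch (the index path is illustrative; assumes the usual bleve v2 import):

    m, err := NewMappingSep("|", "punct")
    if err != nil {
        // handle the error
    }
    idx, err := bleve.New("example.bleve", m) // on-disk index using the sep analyzer by default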
@@ -25,9 +56,11 @@ func NewMapping(opt Option) (*mapping.IndexMappingImpl, error) {
         "stop":  opt.Stop,
         "opt":   opt.Opt,
         "trim":  opt.Trim,
+        "alpha": opt.Alpha,
     })
 
     if err != nil {
-        return mapping, err
+        return nil, err
     }
 
     err = mapping.AddCustomAnalyzer(TokenName, map[string]interface{}{
@@ -36,7 +69,7 @@ func NewMapping(opt Option) (*mapping.IndexMappingImpl, error) {
     })
 
     if err != nil {
-        return mapping, err
+        return nil, err
     }
 
     mapping.DefaultAnalyzer = TokenName
@@ -45,7 +78,15 @@ func NewMapping(opt Option) (*mapping.IndexMappingImpl, error) {
 
 // New new bleve index
 func New(opt Option) (bleve.Index, error) {
-    mapping, err := NewMapping(opt)
+    var (
+        mapping *mapping.IndexMappingImpl
+        err     error
+    )
+    if opt.Name == "sep" {
+        mapping, err = NewMappingSep(opt.Sep, opt.Trim)
+    } else {
+        mapping, err = NewMapping(opt)
+    }
     if err != nil {
         return nil, err
     }
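
New now dispatches on Option.Name: "sep" selects the separator mapping, anything else the gse mapping. A sketch of each (assuming Option.Index is the on-disk index path, as its name and the NewMem sibling suggest):

    sepIdx, err := New(Option{Index: "tags.bleve", Name: "sep", Sep: ",", Trim: "punct"})
    gseIdx, err := New(Option{Index: "docs.bleve", Dicts: "zh", Alpha: true})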
@@ -62,3 +103,27 @@ func NewMem(opt Option) (bleve.Index, error) {
 
     return bleve.NewMemOnly(mapping)
 }
+
+// NewTextMap new text field mapping with gse
+func NewTextMap() *mapping.FieldMapping {
+    return &mapping.FieldMapping{
+        Type:         "text",
+        Analyzer:     "gse",
+        Store:        true,
+        Index:        true,
+        IncludeInAll: true,
+        DocValues:    true,
+    }
+}
+
+// NewSepMap new text field mapping with sep
+func NewSepMap() *mapping.FieldMapping {
+    return &mapping.FieldMapping{
+        Type:         "text",
+        Analyzer:     "sep",
+        Store:        true,
+        Index:        true,
+        IncludeInAll: true,
+        DocValues:    true,
+    }
+}
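
The two field-mapping helpers pin a field to the "gse" or "sep" analyzer. A sketch of attaching one to a document mapping (the "article"/"content" names are illustrative; the referenced analyzer must be defined on the index mapping, which NewMapping does for "gse" and NewMappingSep does for "sep"):

    im, err := NewMapping(Option{}) // defines the "gse" analyzer NewTextMap refers to
    if err != nil {
        // handle the error
    }
    doc := bleve.NewDocumentMapping()
    doc.AddFieldMappingsAt("content", NewTextMap())
    im.AddDocumentMapping("article", doc)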
