diff --git a/analyzer.go b/analyzer.go
index 6b8e154..c227b89 100644
--- a/analyzer.go
+++ b/analyzer.go
@@ -28,4 +28,5 @@ func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 
 func init() {
 	registry.RegisterAnalyzer(TokenName, analyzerConstructor)
+	registry.RegisterAnalyzer(SeparateName, analyzerConstructor)
 }
diff --git a/bleve.go b/bleve.go
index ac36111..e1fd5c9 100644
--- a/bleve.go
+++ b/bleve.go
@@ -13,7 +13,8 @@ import (
 )
 
 const (
-	TokenName = "gse"
+	TokenName    = "gse"
+	SeparateName = "sep"
 )
 
 // GseCut gse cut token structure
@@ -24,14 +25,33 @@ type GseCut struct {
 	trim string
 }
 
+// Separator type separator tokenizer struct
+type Separator struct {
+	seg  *gse.Segmenter
+	sep  string
+	trim string
+}
+
+// NewSep create a separator tokenizer
+func NewSep(sep, trim string) (*Separator, error) {
+	var seg gse.Segmenter
+	seg.Dict = gse.NewDict()
+	seg.Init()
+	return &Separator{&seg, sep, trim}, nil
+}
+
 // NewGseCut create a gse cut tokenizer
-func NewGse(dicts, stop, opt, trim string) (*GseCut, error) {
+func NewGse(dicts, stop, opt, trim string, alpha bool) (*GseCut, error) {
 	var (
 		seg gse.Segmenter
 		err error
 	)
 	seg.SkipLog = true
+	if alpha {
+		seg.AlphaNum = true
+	}
+
 	if dicts == "" {
 		dicts = "zh"
 	}
 
@@ -56,16 +76,26 @@ func NewGse(dicts, stop, opt, trim string) (*GseCut, error) {
 
 // Trim trim the unused token string
 func (c *GseCut) Trim(s []string) []string {
-	if c.trim == "symbol" {
-		return c.seg.TrimSymbol(s)
+	return Trim(s, c.trim, c.seg)
+}
+
+// Trim trim the unused token string
+func (c *Separator) Trim(s []string) []string {
+	return Trim(s, c.trim, c.seg)
+}
+
+// Trim trim the unused token string
+func Trim(s []string, trim string, seg *gse.Segmenter) []string {
+	if trim == "symbol" {
+		return seg.TrimSymbol(s)
 	}
-	if c.trim == "punct" {
-		return c.seg.TrimPunct(s)
+	if trim == "punct" {
+		return seg.TrimPunct(s)
 	}
-	if c.trim == "trim" {
-		return c.seg.Trim(s)
+	if trim == "trim" {
+		return seg.Trim(s)
 	}
 
 	return s
 }
 
@@ -126,6 +156,24 @@ func (c *GseCut) Tokenize(text []byte) analysis.TokenStream {
 	return result
 }
 
+// Tokenize cut the text to bleve token stream
+func (s *Separator) Tokenize(text []byte) analysis.TokenStream {
+	result := make(analysis.TokenStream, 0)
+	cuts := s.Trim(strings.Split(string(text), s.sep))
+	azs := s.seg.Analyze(cuts)
+	for _, az := range azs {
+		token := analysis.Token{
+			Term:     []byte(az.Text),
+			Start:    az.Start,
+			End:      az.End,
+			Position: az.Position,
+			Type:     analysis.Ideographic,
+		}
+		result = append(result, &token)
+	}
+	return result
+}
+
 func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
 	dicts, ok := config["dicts"].(string)
 	if !ok {
@@ -137,7 +185,7 @@ func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache)
 	}
 
 	opt, ok := config["opt"].(string)
-	if !ok {
+	if !ok || opt == "" {
 		opt = ""
 	}
 
@@ -146,9 +194,29 @@ func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache)
 		trim = ""
 	}
 
-	return NewGse(dicts, stop, opt, trim)
+	alpha, ok := config["alpha"].(bool)
+	if !ok {
+		alpha = false
+	}
+
+	return NewGse(dicts, stop, opt, trim, alpha)
+}
+
+func tokenizerConstructor2(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
+	sep, ok := config["sep"].(string)
+	if !ok {
+		sep = " "
+	}
+
+	trim, ok := config["trim"].(string)
+	if !ok {
+		trim = ""
+	}
+
+	return NewSep(sep, trim)
 }
 
 func init() {
 	registry.RegisterTokenizer(TokenName, tokenizerConstructor)
+	registry.RegisterTokenizer(SeparateName, tokenizerConstructor2)
 }
diff --git a/index.go b/index.go
index 2bddb52..2b3605f 100644
--- a/index.go
+++ b/index.go
@@ -13,6 +13,37 @@ import (
 type Option struct {
 	Index                  string
 	Dicts, Stop, Opt, Trim string
+	Alpha                  bool
+	Name, Sep              string
+}
+
+// NewMappingSep new separator mapping
+func NewMappingSep(sep string, trim ...string) (*mapping.IndexMappingImpl, error) {
+	mapping := bleve.NewIndexMapping()
+	trimOpt := ""
+	if len(trim) > 0 {
+		trimOpt = trim[0]
+	}
+
+	err := mapping.AddCustomTokenizer(SeparateName, map[string]interface{}{
+		"type": SeparateName,
+		"sep":  sep,
+		"trim": trimOpt,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	err = mapping.AddCustomAnalyzer(SeparateName, map[string]interface{}{
+		"type":      SeparateName,
+		"tokenizer": SeparateName,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	mapping.DefaultAnalyzer = SeparateName
+	return mapping, nil
 }
 
 // NewMapping new bleve index mapping
@@ -25,9 +56,11 @@ func NewMapping(opt Option) (*mapping.IndexMappingImpl, error) {
 		"stop":  opt.Stop,
 		"opt":   opt.Opt,
 		"trim":  opt.Trim,
+		"alpha": opt.Alpha,
 	})
+
 	if err != nil {
-		return mapping, err
+		return nil, err
 	}
 
 	err = mapping.AddCustomAnalyzer(TokenName, map[string]interface{}{
@@ -36,7 +69,7 @@ func NewMapping(opt Option) (*mapping.IndexMappingImpl, error) {
 	})
 
 	if err != nil {
-		return mapping, err
+		return nil, err
 	}
 
 	mapping.DefaultAnalyzer = TokenName
@@ -45,7 +78,15 @@
 
 // New new bleve index
 func New(opt Option) (bleve.Index, error) {
-	mapping, err := NewMapping(opt)
+	var (
+		mapping *mapping.IndexMappingImpl
+		err     error
+	)
+	if opt.Name == "sep" {
+		mapping, err = NewMappingSep(opt.Sep, opt.Trim)
+	} else {
+		mapping, err = NewMapping(opt)
+	}
 	if err != nil {
 		return nil, err
 	}
@@ -62,3 +103,27 @@
 
 	return bleve.NewMemOnly(mapping)
 }
+
+// NewTextMap new text field mapping with gse
+func NewTextMap() *mapping.FieldMapping {
+	return &mapping.FieldMapping{
+		Type:         "text",
+		Analyzer:     "gse",
+		Store:        true,
+		Index:        true,
+		IncludeInAll: true,
+		DocValues:    true,
+	}
+}
+
+// NewSepMap new text field mapping with sep
+func NewSepMap() *mapping.FieldMapping {
+	return &mapping.FieldMapping{
+		Type:         "text",
+		Analyzer:     "sep",
+		Store:        true,
+		Index:        true,
+		IncludeInAll: true,
+		DocValues:    true,
+	}
+}
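Usage sketch (editor's note, not part of the patch): a minimal example of how the new "sep" tokenizer might be driven through New by setting Option.Name = "sep", which routes mapping construction to NewMappingSep instead of NewMapping. The module import path and the gsebleve alias are assumptions for illustration; only Option, New, and the "sep" names come from the patch itself.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"       // assumed bleve major version
	gsebleve "github.com/vcaesar/gse-bleve" // assumed module path for this package
)

func main() {
	// Name: "sep" selects the separator mapping; Sep is the split string,
	// and Trim: "trim" routes tokens through the shared Trim helper.
	opt := gsebleve.Option{
		Index: "sep.blv", // index directory created on disk
		Name:  "sep",
		Sep:   ",",
		Trim:  "trim",
	}

	index, err := gsebleve.New(opt)
	if err != nil {
		fmt.Println("new index error:", err)
		return
	}
	defer index.Close()

	// Each comma-separated field becomes one token: "one", "two", "three".
	if err := index.Index("1", "one,two,three"); err != nil {
		fmt.Println("index error:", err)
		return
	}

	req := bleve.NewSearchRequest(bleve.NewQueryStringQuery("two"))
	res, err := index.Search(req)
	fmt.Println(res, err)
}

For per-field control instead of the index-wide DefaultAnalyzer, the new NewTextMap and NewSepMap helpers return field mappings bound to the "gse" and "sep" analyzers respectively.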