Skip to content

Commit

Permalink
refactor: trie
Browse files Browse the repository at this point in the history
  • Loading branch information
nopdan committed Mar 7, 2024
1 parent 5498bec commit cbe22ad
Show file tree
Hide file tree
Showing 11 changed files with 174 additions and 269 deletions.
4 changes: 2 additions & 2 deletions pkg/dict/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ func (d *Dict) init() {
} else {
switch d.algorithm {
case "greedy", "":
d.Matcher = matcher.NewTrie(false, false)
d.Matcher = matcher.NewTrie(false)
case "ordered":
d.Matcher = matcher.NewTrie(true, false)
d.Matcher = matcher.NewTrie(true)
case "dynamic":
// TODO
fallthrough
Expand Down
6 changes: 3 additions & 3 deletions pkg/dict/dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ type Dict struct {
// 起顶码长
push int
// 选重键
selectKeys [][]byte
selectKeys []string
// 是否只用码表里的单字
Single bool
// 匹配算法 greedy|ordered|dynamic
Expand Down Expand Up @@ -54,9 +54,9 @@ func WithPush(push int) DictOption {

func WithSelectKeys(keys string) DictOption {
return func(opt *Dict) {
res := make([][]byte, 0, 10)
res := make([]string, 0, 10)
for i := range len(keys) {
res = append(res, []byte{keys[i]})
res = append(res, string(keys[i]))
}
opt.selectKeys = res
}
Expand Down
7 changes: 1 addition & 6 deletions pkg/dict/duoduo.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@ package dict

import (
"bufio"
"slices"
"strings"

"github.com/nopdan/gosmq/pkg/util"
)

// 加载多多或者冰凌码表
Expand Down Expand Up @@ -46,7 +43,5 @@ func (d *Dict) addSuffix(code string, pos int) string {
return code
}
// 添加自定义选重键
tmp := util.UnsafeToBytes(code)
tmp = slices.Concat(tmp, d.getSelectKey(pos))
return util.UnsafeToString(tmp)
return code + d.getSelectKey(pos)
}
8 changes: 3 additions & 5 deletions pkg/dict/jisu.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,12 @@ func findSuffixInteger(s string) (string, string) {
return s, ""
}

func (d *Dict) getSelectKey(num int) []byte {
func (d *Dict) getSelectKey(num int) string {
if num < 1 {
return []byte{}
return ""
}
for num > len(d.selectKeys)-2 {
d.selectKeys = append(d.selectKeys,
util.UnsafeToBytes(strconv.Itoa(len(d.selectKeys)+1)),
)
d.selectKeys = append(d.selectKeys, strconv.Itoa(len(d.selectKeys)+1))
}
return d.selectKeys[num-1]
}
2 changes: 1 addition & 1 deletion pkg/matcher/matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type Matcher interface {
type Result struct {
// 匹配到(或匹配失败)的单个字符
Char rune
Size int // 字节数 >= 1
Size int // 字节数
Length int // utf-8字符数 >= 1
Pos int // 候选位置
Code string // 编码
Expand Down
12 changes: 6 additions & 6 deletions pkg/matcher/single.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,23 @@ import (
"unicode/utf8"
)

type single struct {
type Single struct {
dict map[rune]*struct {
code string
pos int
}
}

func NewSingle() *single {
s := new(single)
func NewSingle() *Single {
s := new(Single)
s.dict = make(map[rune]*struct {
code string
pos int
}, 1024)
return s
}

func (s *single) Insert(word, code string, pos int) {
func (s *Single) Insert(word, code string, pos int) {
char, _ := utf8.DecodeRuneInString(word)
cp, ok := s.dict[char]
if ok {
Expand All @@ -40,10 +40,10 @@ func (s *single) Insert(word, code string, pos int) {
}
}

func (s *single) Build() {
func (s *Single) Build() {
}

func (s *single) Match(brd *bytes.Reader, res *Result) {
func (s *Single) Match(brd *bytes.Reader, res *Result) {
res.Reset()
ch, size, _ := brd.ReadRune()
res.Char = ch
Expand Down
207 changes: 58 additions & 149 deletions pkg/matcher/trie.go
Original file line number Diff line number Diff line change
@@ -1,190 +1,99 @@
package matcher

import (
"fmt"
"slices"
"time"
"bytes"
"io"
)

type trie struct {
root *trieNode
values []value

tails []tail
useTail bool // 是否压缩 tail

count uint32 // 插入词的数量
ordered bool // 是否按照码表的顺序
}

type trieNode struct {
ch map[rune]*trieNode

valueIdx int32
tailIdx int32
pass uint32 // 经过节点的次数
type Trie struct {
root *trieNode
count int // 插入词的数量
ordered bool // 是否按照码表的顺序
}

type value struct {
code string
pos int
order uint32 // 插入节点的顺序
order int
}

type tail struct {
runes []rune
valueIdx int32
type trieNode struct {
ch map[rune]*trieNode
value *value
}

func NewTrie(ordered bool, useTail bool) *trie {
t := new(trie)
t.root = newTrieNode()
t.values = make([]value, 0, 1e4)
func NewTrie(ordered bool) *Trie {
t := new(Trie)
t.root = new(trieNode)
t.root.ch = make(map[rune]*trieNode, 8192)
t.ordered = ordered
if useTail {
t.useTail = useTail
t.tails = make([]tail, 0, 1000)
}
return t
}

func newTrieNode() *trieNode {
tn := new(trieNode)
tn.valueIdx = -1
tn.tailIdx = -1
return tn
}

func (t *trie) Insert(word, code string, pos int) {
func (t *Trie) Insert(word, code string, pos int) {
node := t.root
for _, v := range word {
if node.ch == nil {
node.ch = make(map[rune]*trieNode)
node.ch[v] = newTrieNode()
} else if node.ch[v] == nil {
node.ch[v] = newTrieNode()
node.ch[v] = &trieNode{}
} else if _, ok := node.ch[v]; !ok {
node.ch[v] = &trieNode{}
}
node.pass++
node = node.ch[v]
}
t.count++
// 新词
if node.valueIdx == -1 {
node.valueIdx = int32(len(t.values))
t.values = append(t.values, value{code, pos, t.count})
return
}
// 已经存在的词
// 取排在前面的
if t.ordered {
return
}
// 取码长较短的
value := &t.values[node.valueIdx]
if len(value.code) > len(code) {
value.code = code
value.pos = pos
value.order = t.count
}
}

func (t *trie) Build() {
if t.useTail {
start := time.Now()
node := t.root
node.build(&t.tails)
fmt.Printf("构建 tail 耗时: %dms\n", time.Since(start).Milliseconds())
}
}

func (node *trieNode) build(tails *[]tail) {
if node.ch == nil {
return
}
if node.pass == 1 {
node.mergeTail(tails)
return
}
for _, ch := range node.ch {
ch.build(tails)
}
}

// 取唯一的孩子节点
func getUniqueNode(node map[rune]*trieNode) (rune, *trieNode) {
if len(node) != 1 {
panic("children node not unique")
}
for rn, ch := range node {
return rn, ch
}
return 0, nil
}

// 合并 tail 节点
func (head *trieNode) mergeTail(tails *[]tail) {
rn, node := getUniqueNode(head.ch)
// 单字 tail
// AB ABC B->C(tail)
if node.ch == nil {
return
}
// 多字 tail
// AB ABCD B->CD(tail)
runes := []rune{rn}
for node.ch != nil {
rn, node = getUniqueNode(node.ch)
runes = append(runes, rn)
if node.value == nil {
node.value = &value{code, pos, t.count}
} else if !t.ordered {
// 贪心,已经存在的词,取码长较短的
if len(node.value.code) > len(code) {
node.value.code = code
node.value.pos = pos
node.value.order = t.count
}
}
head.ch = nil
head.tailIdx = int32(len(*tails))
*tails = append(*tails, tail{runes, node.valueIdx})
}

// 前缀树最长匹配
func (t *trie) Match(text []rune) (int, string, int) {
func (t *Trie) Build() {}

func (t *Trie) Match(brd *bytes.Reader, res *Result) {
res.Reset()
node := t.root
wordLen := 0
res := new(value)

match := func(p int, _tail tail) {
if p+len(_tail.runes) > len(text) {
return
res.Length = 1 // 至少匹配一个字
var Char rune
var CharSize int
var Size, Length int
var order int
for {
char, size, err := brd.ReadRune()
if err != nil {
break
}
if slices.Equal(_tail.runes, text[p:p+len(_tail.runes)]) {
val := &t.values[_tail.valueIdx]
// 跳过码表顺序在后面的词
if t.ordered && res.order != 0 && val.order > res.order {
return
}
wordLen = p + len(_tail.runes)
res = val
if Char == 0 {
Char = char
CharSize = size
}
}
Size += size
Length++

for p := 0; p < len(text); {
node = node.ch[text[p]]
p++
node = node.ch[char]
if node == nil {
break
}
if node.valueIdx != -1 {
val := &t.values[node.valueIdx]
// 跳过码表顺序在后面的词
if t.ordered && res.order != 0 && val.order > res.order {
} else {
wordLen = p
res = val
if node.value != nil {
if !t.ordered || node.value.order > order {
order = node.value.order
res.Size = Size
res.Length = Length
res.Code = node.value.code
res.Pos = node.value.pos
}
}

// 匹配 tail
if t.useTail && node.tailIdx != -1 {
_tail := t.tails[node.tailIdx]
match(p, _tail)
break
}
}
return wordLen, res.code, res.pos
if res.Length == 1 {
res.Char = Char
brd.Seek(int64(CharSize-Size), io.SeekCurrent)
} else {
brd.Seek(int64(res.Size-Size), io.SeekCurrent)
}
}
Loading

0 comments on commit cbe22ad

Please sign in to comment.