Skip to content

Commit

Permalink
refactor: match
Browse files Browse the repository at this point in the history
  • Loading branch information
nopdan committed Mar 6, 2024
1 parent e4f134c commit 5498bec
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 128 deletions.
45 changes: 43 additions & 2 deletions pkg/matcher/matcher.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,51 @@
package matcher

import "bytes"

// Matcher looks up dictionary entries in a stream of text.
type Matcher interface {
	// Insert adds one entry. The arguments are, in order: word, code and
	// pos (the candidate position).
	Insert(string, string, int)
	// Build constructs the matcher.
	Build()
	// Match matches the next word and writes the outcome into the Result.
	// It always advances the Reader. When nothing matches, Pos is 0.
	Match(*bytes.Reader, *Result)
}

// Result carries the outcome of one Matcher.Match call.
type Result struct {
	// Char is the single character that was matched (or that failed to match).
	Char   rune
	Size   int    // number of bytes, >= 1
	Length int    // number of UTF-8 characters, >= 1
	Pos    int    // candidate position
	Code   string // code
}

// Reset puts every field back to its zero value.
func (r *Result) Reset() {
	*r = Result{}
}

// SetChar stores char in the Char field and returns r so calls can be chained.
func (r *Result) SetChar(char rune) *Result {
	r.Char = char
	return r
}

// SetSize stores size in the Size field and returns r so calls can be chained.
func (r *Result) SetSize(size int) *Result {
	r.Size = size
	return r
}

// SetLength stores length in the Length field and returns r so calls can be
// chained.
func (r *Result) SetLength(length int) *Result {
	r.Length = length
	return r
}

// SetPos stores pos in the Pos field and returns r so calls can be chained.
func (r *Result) SetPos(pos int) *Result {
	r.Pos = pos
	return r
}

// SetCode stores code in the Code field and returns r so calls can be chained.
func (r *Result) SetCode(code string) *Result {
	r.Code = code
	return r
}
55 changes: 40 additions & 15 deletions pkg/matcher/single.go
Original file line number Diff line number Diff line change
@@ -1,31 +1,56 @@
package matcher

type codePos struct {
code string
pos int
}
import (
"bytes"
"unicode/utf8"
)

// single keeps, for each character, one code and the candidate position
// recorded with it.
type single struct {
	dict map[rune]*struct {
		code string
		pos  int
	}
}

// NewSingle returns an empty single whose table is pre-sized for 1024 entries.
func NewSingle() *single {
	s := new(single)
	s.dict = make(map[rune]*struct {
		code string
		pos  int
	}, 1024)
	return s
}

// Insert records code and pos for the first character of word.
//
// When the character is already present, the entry is replaced only if the
// new code is strictly shorter, and code and pos are then replaced together
// so they always describe the same entry. An empty word is ignored.
func (s *single) Insert(word, code string, pos int) {
	char, size := utf8.DecodeRuneInString(word)
	if size == 0 {
		// Empty word: there is no character to index. Without this guard
		// DecodeRuneInString's U+FFFD placeholder would be stored as a key.
		return
	}
	if cp, ok := s.dict[char]; ok {
		// For the same character keep the shorter code.
		if len(code) < len(cp.code) {
			cp.code = code
			cp.pos = pos
		}
		return
	}
	s.dict[char] = &struct {
		code string
		pos  int
	}{
		code: code,
		pos:  pos,
	}
}

// Build does nothing: the table is ready for lookups as soon as entries are
// inserted.
func (s *single) Build() {
}

func (s *single) Match(text []rune) (int, string, int) {
if v, ok := (*s)[text[0]]; ok {
return 1, v.code, v.pos
func (s *single) Match(brd *bytes.Reader, res *Result) {
res.Reset()
ch, size, _ := brd.ReadRune()
res.Char = ch
res.Size = size
res.Length = 1
if v, ok := s.dict[ch]; ok {
res.Code = v.code
res.Pos = v.pos
}
return 0, "", 1
}
13 changes: 7 additions & 6 deletions pkg/smq/config.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package smq

import (
"fmt"
"runtime"
"sync"

Expand Down Expand Up @@ -65,26 +66,26 @@ func (c *Config) Race() [][]*result.Result {
var wg sync.WaitGroup
for i, text := range c.textList {
// 分段计算当前文章,pIdx 为每一段的索引
pIdx := 0
pIdx := -1
for {
text, err := text.Iter()
// fmt.Println(util.UnsafeToString(text))
if err != nil {
break
}
pIdx++
for j, dict := range c.dictList {
wg.Add(1)
// go 1.22 修复了 range 循环问题
go func() {
go func(i, j, pIdx int) {
defer wg.Done()
mRes := c.match(text, dict)
mRes.TextIdx = i
mRes.DictIdx = j
mRes.PartIdx = pIdx
fmt.Printf("text idx: %d dict idx: %d part idx: %d\n", i, j, pIdx)
ch <- mRes
}()
}(i, j, pIdx)
}
pIdx++
}
}

Expand Down Expand Up @@ -113,7 +114,7 @@ func (c *Config) Race() [][]*result.Result {
res[i] = make([]*result.Result, dNum)
for j := range dNum {
// TODO
mRes[i][j].Print(false)
// mRes[i][j].Print(false)
res[i][j] = mRes[i][j].ToResult()
}
}
Expand Down
164 changes: 77 additions & 87 deletions pkg/smq/match.go
Original file line number Diff line number Diff line change
@@ -1,138 +1,128 @@
package smq

import (
"bytes"
"io"
"unicode"
"unsafe"

"github.com/nopdan/gosmq/pkg/dict"
"github.com/nopdan/gosmq/pkg/feeling"
"github.com/nopdan/gosmq/pkg/matcher"
"github.com/nopdan/gosmq/pkg/result"
"github.com/nopdan/gosmq/pkg/util"
)

func (c *Config) match(buffer []byte, dict *dict.Dict) *result.MatchRes {

// 初始化
mRes := result.NewMatchRes()
feel := feeling.New(mRes, dict.SpacePref)
brd := bytes.NewReader(buffer)
res := new(matcher.Result)

Handler := func(word, code string, wordLen, pos int) {
util.Increase(&mRes.WordLenDist, wordLen)
util.Increase(&mRes.CollisionDist, pos)
util.Increase(&mRes.CodeLenDist, len(code))

for i := 0; i < len(code); i++ {
feel.Process(code[i])
process := func(res *matcher.Result) {
mRes.Commit.Count++
util.Increase(&mRes.WordLenDist, res.Length)
util.Increase(&mRes.CollisionDist, res.Pos)
util.Increase(&mRes.CodeLenDist, len(res.Code))
for i := range len(res.Code) {
feel.Process(res.Code[i])
}
// 匹配到词组
if res.Length >= 2 {
mRes.Commit.Word++
mRes.Commit.WordChars += res.Length
if res.Pos == 1 {
mRes.Commit.WordFirst++ // 首选词
}
}
if !c.Split && !c.Stat {
return
}

var word string
if res.Char > 32 {
word = string([]rune{res.Char})
} else {
brd.Seek(-1*int64(res.Size), io.SeekCurrent)
data := make([]byte, res.Size)
brd.Read(data)
word = util.UnsafeToString(data)
}
// 启用分词
if c.Split {
mRes.Segment = append(mRes.Segment, result.WordCode{
Word: word,
Code: code,
Code: res.Code,
})
}
// 启用统计
if c.Stat {
if _, ok := mRes.StatData[word]; !ok {
mRes.StatData[word] = &result.CodePosCount{
Code: code,
Pos: pos,
Code: res.Code,
Pos: res.Pos,
Count: 1}
} else {
mRes.StatData[word].Count++
}
}
}
// 是否判断缺字
HanHandler := func(char rune, lack bool) {
isHan := unicode.Is(unicode.Han, char)
// 非汉字
if !isHan {
mRes.NotHanMap[char] = struct{}{}
}
// 缺汉字
if lack && isHan {
mRes.LackMap[char] = struct{}{}
}
}

text := []rune(*(*string)(unsafe.Pointer(&buffer)))
for p := 0; p < len(text); {
for brd.Len() > 0 {
// 跳过空白字符
if text[p] < 33 {
p++
continue
}
switch text[p] {
case 65533, ' ':
p++
ch, _, _ := brd.ReadRune()
if ch < 33 || ch == 65533 || ch == ' ' {
continue
}
mRes.Commit.Count++
_ = brd.UnreadRune()

wordLen, code, pos := dict.Matcher.Match(text[p:])
// 匹配到了
if wordLen != 0 {
sWord := string(text[p : p+wordLen])
// 打词
if wordLen >= 2 {
mRes.Commit.Word++
mRes.Commit.WordChars += wordLen
if pos == 1 {
mRes.Commit.WordFirst++
}
}
// 选重
if pos >= 2 {
mRes.Commit.Collision++
mRes.Commit.CollisionChars += wordLen
}
// 对每个字都进行判断
for i := 0; i < wordLen; i++ {
HanHandler(text[p+i], false)
}
Handler(sWord, code, wordLen, pos)
p += wordLen
// 开始匹配
dict.Matcher.Match(brd, res)

// 匹配成功
if res.Pos > 0 {
process(res)
continue
}

// 匹配不到
// 匹配失败了
if c.Clean {
mRes.Commit.Count--
p++
continue
}
res.Pos = 1

HanHandler(text[p], true)
sWord := string(text[p])
// 是否为符号
code = PunctToCode(text[p])
if code != "" {
Handler(sWord, code, 1, 1)
p++
// 两个字符的符号
if res.Char == '—' || res.Char == '…' {
ch2, _, err := brd.ReadRune()
if err != nil {
if res.Char == '—' && ch2 == '—' {
// 中文破折号 —— 占用 6 字节,不计打词
res.SetChar(0).SetCode("=-").SetSize(6)
process(res)
continue
} else if res.Char == '…' && ch2 == '…' {
// 中文省略号 …… 占用 6 字节,不计打词
res.SetChar(0).SetCode("=6").SetSize(6)
process(res)
continue
}
}
_ = brd.UnreadRune()
}
// 单字符符号
punct := convertPunct(res.Char)
if punct != "" {
res.Code = punct
process(res)
continue
}
// 单独处理这两个符号,不作为打词
if p+1 < len(text) {
flag := false
switch string(text[p : p+2]) {
case "——":
Handler("——", "=-", 1, 1)
flag = true
case "……":
Handler("……", "=6", 1, 1)
flag = true
}
if flag {
HanHandler(text[p+1], false)
p += 2
continue
}
isHan := unicode.Is(unicode.Han, res.Char)
if isHan {
mRes.LackMap[ch] = struct{}{}
} else {
mRes.NotHanMap[ch] = struct{}{}
}
// 找不到的符号,设为 "######"
Handler(sWord, "######", 1, 1)
p++
res.Code = "######"
process(res)
}
return mRes
}
Loading

0 comments on commit 5498bec

Please sign in to comment.