refactor: match

nopdan · Mar 6, 2024 · 5498bec · 5498bec
1 parent e4f134c
commit 5498bec
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 128 deletions.
diff --git a/pkg/matcher/matcher.go b/pkg/matcher/matcher.go
@@ -1,10 +1,51 @@
 package matcher
 
+import "bytes"
+
+// Matcher is the lookup interface of this package: entries are added with
+// Insert, Build is called afterwards, and Match consumes input from a Reader.
 type Matcher interface {
 	// 插入一个词条 word code pos
 	Insert(string, string, int)
 	// 构建
 	Build()
-	// 匹配下一个词，返回匹配到的词长，编码和候选位置
-	Match([]rune) (int, string, int)
+	// Match matches the next word and always advances the Reader.
+	// When the match fails, Pos is 0.
+	Match(*bytes.Reader, *Result)
+}
+
+// Result is the output record filled in by Matcher.Match.
+type Result struct {
+	// The single character that was matched (or that failed to match).
+	Char   rune
+	Size   int    // number of bytes, >= 1
+	Length int    // number of UTF-8 characters, >= 1
+	Pos    int    // candidate position
+	Code   string // code
+}
+
+// Reset puts every field of r back to its zero value so the same Result can
+// be reused for the next Match call.
+func (r *Result) Reset() {
+	r.Char = 0
+	r.Size = 0
+	r.Length = 0
+	r.Pos = 0
+	r.Code = ""
+}
+
+// SetChar stores char in r.Char and returns r so calls can be chained.
+func (r *Result) SetChar(char rune) *Result {
+	r.Char = char
+	return r
+}
+
+// SetSize stores size (a byte count) in r.Size and returns r so calls can be chained.
+func (r *Result) SetSize(size int) *Result {
+	r.Size = size
+	return r
+}
+
+// SetLength stores length (a character count) in r.Length and returns r so calls can be chained.
+func (r *Result) SetLength(length int) *Result {
+	r.Length = length
+	return r
+}
+
+// SetPos stores pos (the candidate position) in r.Pos and returns r so calls can be chained.
+func (r *Result) SetPos(pos int) *Result {
+	r.Pos = pos
+	return r
+}
+
+// SetCode stores code in r.Code and returns r so calls can be chained.
+func (r *Result) SetCode(code string) *Result {
+	r.Code = code
+	return r
 }
diff --git a/pkg/matcher/single.go b/pkg/matcher/single.go
@@ -1,31 +1,56 @@
 package matcher
 
-type codePos struct {
-	code string
-	pos  int
-}
+import (
+	"bytes"
+	"unicode/utf8"
+)
 
-type single map[rune]*codePos
+// single is a matcher backed by a map from one rune to its code and
+// candidate position.
+type single struct {
+	// dict maps a character to the code/pos pair recorded for it by Insert.
+	dict map[rune]*struct {
+		code string
+		pos  int
+	}
+}
 
 func NewSingle() *single {
-	t := make(single, 1024)
-	return &t
+	s := new(single)
+	s.dict = make(map[rune]*struct {
+		code string
+		pos  int
+	}, 1024)
+	return s
 }
 
 func (s *single) Insert(word, code string, pos int) {
-	char := []rune(word)[0]
-	// 同一个字取码长较短的
-	if cp, ok := (*s)[char]; !ok || len(cp.code) > len(code) {
-		(*s)[char] = &codePos{code, pos}
+	char, _ := utf8.DecodeRuneInString(word)
+	cp, ok := s.dict[char]
+	if ok {
+		if len(cp.code) < len(code) {
+			// 同一个字取码长较短的
+			s.dict[char].pos = pos
+		}
+		return
+	}
+	s.dict[char] = &struct {
+		code string
+		pos  int
+	}{
+		code: code,
+		pos:  pos,
 	}
 }
 
+// Build does nothing for single (its body is empty); it exists so that
+// single has the Build method listed in the Matcher interface.
 func (s *single) Build() {
 }
 
-func (s *single) Match(text []rune) (int, string, int) {
-	if v, ok := (*s)[text[0]]; ok {
-		return 1, v.code, v.pos
+// Match reads exactly one rune from brd and reports it in res.
+//
+// res is reset first; Char, Size and Length (always 1) are then filled in.
+// Code and Pos are set only when the rune is present in the dictionary, so
+// an unknown rune leaves Pos at 0.
+func (s *single) Match(brd *bytes.Reader, res *Result) {
+	res.Reset()
+	// The read error is intentionally discarded here.
+	ch, size, _ := brd.ReadRune()
+	res.SetChar(ch).SetSize(size).SetLength(1)
+	entry, found := s.dict[ch]
+	if !found {
+		return
 	}
-	return 0, "", 1
+	res.SetCode(entry.code).SetPos(entry.pos)
 }
diff --git a/pkg/smq/config.go b/pkg/smq/config.go
@@ -1,6 +1,7 @@
 package smq
 
 import (
+	"fmt"
 	"runtime"
 	"sync"
 
@@ -65,26 +66,26 @@ func (c *Config) Race() [][]*result.Result {
 	var wg sync.WaitGroup
 	for i, text := range c.textList {
 		// 分段计算当前文章，pIdx 为每一段的索引
-		pIdx := 0
+		pIdx := -1
 		for {
 			text, err := text.Iter()
 			// fmt.Println(util.UnsafeToString(text))
 			if err != nil {
 				break
 			}
+			pIdx++
 			for j, dict := range c.dictList {
 				wg.Add(1)
-				// go 1.22 修复了 range 循环问题
-				go func() {
+				go func(i, j, pIdx int) {
 					defer wg.Done()
 					mRes := c.match(text, dict)
 					mRes.TextIdx = i
 					mRes.DictIdx = j
 					mRes.PartIdx = pIdx
+					fmt.Printf("text idx: %d dict idx: %d part idx: %d\n", i, j, pIdx)
 					ch <- mRes
-				}()
+				}(i, j, pIdx)
 			}
-			pIdx++
 		}
 	}
 
@@ -113,7 +114,7 @@ func (c *Config) Race() [][]*result.Result {
 		res[i] = make([]*result.Result, dNum)
 		for j := range dNum {
 			// TODO
-			mRes[i][j].Print(false)
+			// mRes[i][j].Print(false)
 			res[i][j] = mRes[i][j].ToResult()
 		}
 	}

diff --git a/pkg/smq/match.go b/pkg/smq/match.go
@@ -1,138 +1,128 @@
 package smq
 
 import (
+	"bytes"
+	"io"
 	"unicode"
-	"unsafe"
 
 	"github.com/nopdan/gosmq/pkg/dict"
 	"github.com/nopdan/gosmq/pkg/feeling"
+	"github.com/nopdan/gosmq/pkg/matcher"
 	"github.com/nopdan/gosmq/pkg/result"
 	"github.com/nopdan/gosmq/pkg/util"
 )
 
 func (c *Config) match(buffer []byte, dict *dict.Dict) *result.MatchRes {
-
-	// 初始化
 	mRes := result.NewMatchRes()
 	feel := feeling.New(mRes, dict.SpacePref)
+	brd := bytes.NewReader(buffer)
+	res := new(matcher.Result)
 
-	Handler := func(word, code string, wordLen, pos int) {
-		util.Increase(&mRes.WordLenDist, wordLen)
-		util.Increase(&mRes.CollisionDist, pos)
-		util.Increase(&mRes.CodeLenDist, len(code))
-
-		for i := 0; i < len(code); i++ {
-			feel.Process(code[i])
+	process := func(res *matcher.Result) {
+		mRes.Commit.Count++
+		util.Increase(&mRes.WordLenDist, res.Length)
+		util.Increase(&mRes.CollisionDist, res.Pos)
+		util.Increase(&mRes.CodeLenDist, len(res.Code))
+		for i := range len(res.Code) {
+			feel.Process(res.Code[i])
+		}
+		// 匹配到词组
+		if res.Length >= 2 {
+			mRes.Commit.Word++
+			mRes.Commit.WordChars += res.Length
+			if res.Pos == 1 {
+				mRes.Commit.WordFirst++ // 首选词
+			}
+		}
+		if !c.Split && !c.Stat {
+			return
 		}
 
+		var word string
+		if res.Char > 32 {
+			word = string([]rune{res.Char})
+		} else {
+			brd.Seek(-1*int64(res.Size), io.SeekCurrent)
+			data := make([]byte, res.Size)
+			brd.Read(data)
+			word = util.UnsafeToString(data)
+		}
 		// 启用分词
 		if c.Split {
 			mRes.Segment = append(mRes.Segment, result.WordCode{
 				Word: word,
-				Code: code,
+				Code: res.Code,
 			})
 		}
 		// 启用统计
 		if c.Stat {
 			if _, ok := mRes.StatData[word]; !ok {
 				mRes.StatData[word] = &result.CodePosCount{
-					Code:  code,
-					Pos:   pos,
+					Code:  res.Code,
+					Pos:   res.Pos,
 					Count: 1}
 			} else {
 				mRes.StatData[word].Count++
 			}
 		}
 	}
-	// 是否判断缺字
-	HanHandler := func(char rune, lack bool) {
-		isHan := unicode.Is(unicode.Han, char)
-		// 非汉字
-		if !isHan {
-			mRes.NotHanMap[char] = struct{}{}
-		}
-		// 缺汉字
-		if lack && isHan {
-			mRes.LackMap[char] = struct{}{}
-		}
-	}
 
-	text := []rune(*(*string)(unsafe.Pointer(&buffer)))
-	for p := 0; p < len(text); {
+	for brd.Len() > 0 {
 		// 跳过空白字符
-		if text[p] < 33 {
-			p++
-			continue
-		}
-		switch text[p] {
-		case 65533, '　':
-			p++
+		ch, _, _ := brd.ReadRune()
+		if ch < 33 || ch == 65533 || ch == '　' {
 			continue
 		}
-		mRes.Commit.Count++
+		_ = brd.UnreadRune()
 
-		wordLen, code, pos := dict.Matcher.Match(text[p:])
-		// 匹配到了
-		if wordLen != 0 {
-			sWord := string(text[p : p+wordLen])
-			// 打词
-			if wordLen >= 2 {
-				mRes.Commit.Word++
-				mRes.Commit.WordChars += wordLen
-				if pos == 1 {
-					mRes.Commit.WordFirst++
-				}
-			}
-			// 选重
-			if pos >= 2 {
-				mRes.Commit.Collision++
-				mRes.Commit.CollisionChars += wordLen
-			}
-			// 对每个字都进行判断
-			for i := 0; i < wordLen; i++ {
-				HanHandler(text[p+i], false)
-			}
-			Handler(sWord, code, wordLen, pos)
-			p += wordLen
+		// 开始匹配
+		dict.Matcher.Match(brd, res)
+
+		// 匹配成功
+		if res.Pos > 0 {
+			process(res)
 			continue
 		}
 
-		// 匹配不到
+		// 匹配失败了
 		if c.Clean {
-			mRes.Commit.Count--
-			p++
 			continue
 		}
+		res.Pos = 1
 
-		HanHandler(text[p], true)
-		sWord := string(text[p])
-		// 是否为符号
-		code = PunctToCode(text[p])
-		if code != "" {
-			Handler(sWord, code, 1, 1)
-			p++
+		// 两个字符的符号
+		if res.Char == '—' || res.Char == '…' {
+			ch2, _, err := brd.ReadRune()
+			if err != nil {
+				if res.Char == '—' && ch2 == '—' {
+					// 中文破折号 —— 占用 6 字节，不计打词
+					res.SetChar(0).SetCode("=-").SetSize(6)
+					process(res)
+					continue
+				} else if res.Char == '…' && ch2 == '…' {
+					// 中文省略号 …… 占用 6 字节，不计打词
+					res.SetChar(0).SetCode("=6").SetSize(6)
+					process(res)
+					continue
+				}
+			}
+			_ = brd.UnreadRune()
+		}
+		// 单字符符号
+		punct := convertPunct(res.Char)
+		if punct != "" {
+			res.Code = punct
+			process(res)
 			continue
 		}
-		// 单独处理这两个符号，不作为打词
-		if p+1 < len(text) {
-			flag := false
-			switch string(text[p : p+2]) {
-			case "——":
-				Handler("——", "=-", 1, 1)
-				flag = true
-			case "……":
-				Handler("……", "=6", 1, 1)
-				flag = true
-			}
-			if flag {
-				HanHandler(text[p+1], false)
-				p += 2
-				continue
-			}
+		isHan := unicode.Is(unicode.Han, res.Char)
+		if isHan {
+			mRes.LackMap[ch] = struct{}{}
+		} else {
+			mRes.NotHanMap[ch] = struct{}{}
 		}
-		// 找不到的符号，设为 "######"
-		Handler(sWord, "######", 1, 1)
-		p++
+		res.Code = "######"
+		process(res)
 	}
 	return mRes
 }