From e4f134c2c354e2c3ea0a648aa58f337d6fde9f08 Mon Sep 17 00:00:00 2001 From: nopdan Date: Wed, 6 Mar 2024 12:48:17 +0800 Subject: [PATCH] refactor(core): almost all --- frontend/components.d.ts | 6 +- frontend/src/components/NewText.vue | 21 +- internal/gen/gen.go | 51 ----- internal/gen/suf_test.go | 16 -- internal/gen/tsv.go | 54 ----- internal/gen/write.go | 30 --- pkg/dict/default.go | 135 ++++++++++++ pkg/dict/dict.go | 81 +++++++ pkg/dict/duoduo.go | 52 +++++ {internal/gen => pkg/dict}/jisu.go | 39 ++-- pkg/dict/test_test.go | 33 +++ pkg/dict/xiaoxiao.go | 30 +++ pkg/feeling/comb.go | 44 ++-- pkg/feeling/feel.go | 126 +++++++++++ pkg/feeling/{test_test.go => feeling_test.go} | 26 ++- pkg/matcher/trie.go | 14 +- pkg/result/match_res.go | 200 ++++++++++++++++++ pkg/{smq => result}/result.go | 10 +- pkg/{smq => result}/stat.go | 26 ++- pkg/smq/config.go | 121 +++++++++++ pkg/smq/dict.go | 87 -------- pkg/smq/feel.go | 103 --------- pkg/smq/match.go | 80 ++++--- pkg/smq/match_res.go | 117 ---------- pkg/smq/merge.go | 65 ------ pkg/smq/smq.go | 160 -------------- pkg/smq/smq_test.go | 60 ++++++ pkg/smq/some_test.go | 13 -- pkg/text/text.go | 12 +- pkg/util/tsv.go | 39 ---- 30 files changed, 999 insertions(+), 852 deletions(-) delete mode 100644 internal/gen/gen.go delete mode 100644 internal/gen/suf_test.go delete mode 100644 internal/gen/tsv.go delete mode 100644 internal/gen/write.go create mode 100644 pkg/dict/default.go create mode 100644 pkg/dict/dict.go create mode 100644 pkg/dict/duoduo.go rename {internal/gen => pkg/dict}/jisu.go (57%) create mode 100644 pkg/dict/test_test.go create mode 100644 pkg/dict/xiaoxiao.go create mode 100644 pkg/feeling/feel.go rename pkg/feeling/{test_test.go => feeling_test.go} (54%) create mode 100644 pkg/result/match_res.go rename pkg/{smq => result}/result.go (96%) rename pkg/{smq => result}/stat.go (88%) create mode 100644 pkg/smq/config.go delete mode 100644 pkg/smq/dict.go delete mode 100644 pkg/smq/feel.go delete mode 100644 pkg/smq/match_res.go delete mode 100644 pkg/smq/merge.go delete mode 100644 pkg/smq/smq.go create mode 100644 pkg/smq/smq_test.go delete mode 100644 pkg/smq/some_test.go delete mode 100644 pkg/util/tsv.go diff --git a/frontend/components.d.ts b/frontend/components.d.ts index f78bc0e..a3f2997 100644 --- a/frontend/components.d.ts +++ b/frontend/components.d.ts @@ -17,12 +17,12 @@ declare module '@vue/runtime-core' { CombsDescription: typeof import('./src/components/Result/CombsDescription.vue')['default'] CombsDistBar: typeof import('./src/components/Result/CombsDistBar.vue')['default'] ComparedBars: typeof import('./src/components/Result/comparedBars.vue')['default'] - Dict: typeof import('./src/components/Home/Dict.vue')['default'] + Dict: typeof import('./src/components/Dict.vue')['default'] FingerPie: typeof import('./src/components/Result/FingerPie.vue')['default'] FingersDescription: typeof import('./src/components/Result/FingersDescription.vue')['default'] HandComp: typeof import('./src/components/Result/HandComp.vue')['default'] HandsDescription: typeof import('./src/components/Result/HandsDescription.vue')['default'] - Input: typeof import('./src/components/Home/Input.vue')['default'] + Input: typeof import('./src/components/Input.vue')['default'] Jisu: typeof import('./src/components/Jisu.vue')['default'] KeyHeatSorted: typeof import('./src/components/Result/KeyHeatSorted.vue')['default'] Main: typeof import('./src/components/Main.vue')['default'] @@ -71,7 +71,7 @@ declare module '@vue/runtime-core' { Result: typeof import('./src/components/Result/Result.vue')['default'] ResultBasic: typeof import('./src/components/Result/ResultBasic.vue')['default'] ResultKeyHeat: typeof import('./src/components/Result/ResultKeyHeat.vue')['default'] - Text: typeof import('./src/components/Home/Text.vue')['default'] + Text: typeof import('./src/components/Text.vue')['default'] WordsDescription: typeof import('./src/components/Result/WordsDescription.vue')['default'] WordsDistBar: typeof import('./src/components/Result/WordsDistBar.vue')['default'] } diff --git a/frontend/src/components/NewText.vue b/frontend/src/components/NewText.vue index 8f788a3..5e9f913 100644 --- a/frontend/src/components/NewText.vue +++ b/frontend/src/components/NewText.vue @@ -156,7 +156,7 @@ 匹配算法 - + {{ algo.label }} @@ -327,10 +327,12 @@ enum DictFormat { Default = "default", /** 极速赛码表 */ Jisu = "jisu", - /** 多多格式 */ + /** 多多 */ Duoduo = "duoduo", - /** 极点格式 */ - Jidian = "jidian", + /** 冰凌 */ + Bingling = "bingling", + /** 小小 */ + Xiaoxiao = "xiaoxiao", } enum Algorithm { @@ -376,12 +378,16 @@ const formatOptions = [ value: DictFormat.Jisu, }, { - label: "多多(Rime)", + label: "多多 | Rime", value: DictFormat.Duoduo, }, { - label: "极点", - value: DictFormat.Jidian, + label: "冰凌", + value: DictFormat.Bingling, + }, + { + label: "小小 | 极点", + value: DictFormat.Xiaoxiao, }, ]; @@ -397,6 +403,7 @@ const algoOptions = [ { label: "最短码长(慢)", value: Algorithm.Dynamic, + disabled: true }, ]; diff --git a/internal/gen/gen.go b/internal/gen/gen.go deleted file mode 100644 index 04746bf..0000000 --- a/internal/gen/gen.go +++ /dev/null @@ -1,51 +0,0 @@ -package gen - -import ( - "sort" -) - -type Config struct { - Path string //待转换的码表路径 - Format string //待转换码表的格式 - - SelectKeys string //自定义选重键 - PushStart int //起顶码长 - SortByWordLen bool // 按照词长重新排序 -} - -type Entry struct { - Word string - Code string - Pos int -} - -func (c *Config) Gen() []*Entry { - var dict []*Entry - - switch c.Format { - case "jisu", "js": - dict = c.LoadJisu() - case "duoduo", "dd": - dict = c.LoadTSV(true) - case "bingling", "bl": - dict = c.LoadTSV(false) - default: - panic("不支持的格式: " + c.Format) - } - - if c.SortByWordLen { - sort.SliceStable(dict, func(i, j int) bool { - return len([]rune(dict[i].Word)) > len([]rune(dict[j].Word)) - }) - } - return dict -} - -// 专用,两位正数 1~99 byte 转 string -// func Itoa(b byte) string { -// if b < 10 { -// return string(b + '0') -// } else { -// return string([]byte{b / 10, b % 10}) -// } -// } diff --git a/internal/gen/suf_test.go b/internal/gen/suf_test.go deleted file mode 100644 index ff268cb..0000000 --- a/internal/gen/suf_test.go +++ /dev/null @@ -1,16 +0,0 @@ -package gen - -import ( - "fmt" - "testing" -) - -func TestFindSuffixInt(t *testing.T) { - fmt.Println(FindSuffixInteger("aaa2")) - fmt.Println(FindSuffixInteger("aaa0")) - fmt.Println(FindSuffixInteger("aaa22")) - fmt.Println(FindSuffixInteger("aaa20")) - fmt.Println(FindSuffixInteger("aaa02")) - fmt.Println(FindSuffixInteger("aaa_")) - fmt.Println(FindSuffixInteger("aaa")) -} diff --git a/internal/gen/tsv.go b/internal/gen/tsv.go deleted file mode 100644 index 1720c25..0000000 --- a/internal/gen/tsv.go +++ /dev/null @@ -1,54 +0,0 @@ -package gen - -import ( - "bufio" - "strconv" - "strings" - - "github.com/nopdan/gosmq/pkg/util" -) - -// 加载多多码表 -func (c *Config) LoadTSV(wordFirst bool) []*Entry { - ret := make([]*Entry, 0, 1e5) - rd, err := util.Read(c.Path) - if err != nil { - panic(err) - } - // 统计编码出现的次数 - stat := make(map[string]int) - scan := bufio.NewScanner(rd) - for scan.Scan() { - wc := strings.Split(scan.Text(), "\t") - if len(wc) < 2 { - continue - } - word, code := wc[0], wc[1] - if !wordFirst { - word, code = code, word - } - stat[code]++ - pos := stat[code] - code = c.addSuffix(code, pos) - ret = append(ret, &Entry{word, code, pos}) - } - return ret -} - -// 加上选重键,pos 是编码出现的次数,最小为 1 -func (c Config) addSuffix(code string, pos int) string { - // 大于等于起顶码长,首选不用添加空格 _ - if len(code) >= c.PushStart { - if pos == 1 { - return code - } - } - - // 添加自定义选重键 - if pos <= len(c.SelectKeys) { - code += string(c.SelectKeys[pos-1]) - } else { - code += strconv.Itoa(pos) - } - return code -} diff --git a/internal/gen/write.go b/internal/gen/write.go deleted file mode 100644 index 38e21a1..0000000 --- a/internal/gen/write.go +++ /dev/null @@ -1,30 +0,0 @@ -package gen - -import ( - "bytes" - "fmt" - "os" - "strconv" -) - -// 输出赛码表 -func Write(dict []*Entry, path string) { - var buf bytes.Buffer - buf.Grow(len(dict)) - for _, entry := range dict { - buf.WriteString(entry.Word) - buf.WriteByte('\t') - buf.WriteString(entry.Code) - if entry.Pos != 1 { - buf.WriteByte('\t') - buf.WriteString(strconv.Itoa(entry.Pos)) - } - buf.WriteByte('\n') - } - err := os.WriteFile(path, buf.Bytes(), 0666) - if err != nil { - fmt.Println("Warning! 输出赛码表失败:", err) - return - } - fmt.Println("输出赛码表成功:", path) -} diff --git a/pkg/dict/default.go b/pkg/dict/default.go new file mode 100644 index 0000000..a6d9138 --- /dev/null +++ b/pkg/dict/default.go @@ -0,0 +1,135 @@ +package dict + +import ( + "bufio" + "bytes" + "cmp" + "fmt" + "os" + "path/filepath" + "slices" + "strconv" + "strings" + "sync" + "unicode/utf8" + + "github.com/nopdan/gosmq/pkg/matcher" +) + +type Entry struct { + Word string + Code string + Pos int +} + +// 初始化 Dict +func (d *Dict) init() { + // 匹配算法 + if d.Single { + d.Matcher = matcher.NewSingle() + } else { + switch d.algorithm { + case "greedy", "": + d.Matcher = matcher.NewTrie(false, false) + case "ordered": + d.Matcher = matcher.NewTrie(true, false) + case "dynamic": + // TODO + fallthrough + default: + panic("不支持的匹配算法: " + d.algorithm) + } + } + + var dict []*Entry + // 读取码表,构建 matcher + switch d.format { + case "default", "": + d.load() + case "jisu", "js": + dict = d.loadJisu() + case "duoduo", "dd", "rime": + dict = d.loadTSV(true) + case "bingling", "bl": + dict = d.loadTSV(false) + case "xiaoxiao", "xx", "jidian", "jd": + dict = d.loadXiao() + default: + panic("不支持的格式: " + d.format) + } + // 输出转换后的赛码表 + var wg sync.WaitGroup + if dict != nil { + wg.Add(1) + go func() { + defer wg.Done() + Output(dict, filepath.Join("dict", + strings.TrimSuffix(d.Name, ".txt")+".txt"), + ) + }() + } + d.Matcher.Build() + if dict != nil { + wg.Wait() + } +} + +// 默认格式 +func (d *Dict) load() { + scan := bufio.NewScanner(d.Reader) + for scan.Scan() { + wc := strings.Split(scan.Text(), "\t") + pos := 1 + if len(wc) >= 3 { + pos, _ = strconv.Atoi(wc[2]) + } else if len(wc) < 2 { + continue + } + d.insert(wc[0], wc[1], pos) + } +} + +// 向 matcher 中添加一个词条 +func (d *Dict) insert(word, code string, pos int) { + if d.Single && utf8.RuneCountInString(word) != 1 { + return + } + d.Matcher.Insert(word, code, pos) + d.Length++ +} + +// 输出赛码表 +func Output(dict []*Entry, path string) { + // 判断文件是否存在,若存在则直接退出 + _, err := os.Stat(path) + if err == nil { + fmt.Printf("赛码表已经存在:%s\n", path) + return + } + // 按照词长排序 + slices.SortStableFunc(dict, func(i, j *Entry) int { + return cmp.Compare( + utf8.RuneCountInString(i.Word), + utf8.RuneCountInString(j.Word), + ) + }) + var buf bytes.Buffer + buf.Grow(len(dict)) + for _, entry := range dict { + buf.WriteString(entry.Word) + buf.WriteByte('\t') + buf.WriteString(entry.Code) + if entry.Pos != 1 { + buf.WriteByte('\t') + buf.WriteString(strconv.Itoa(entry.Pos)) + } + buf.WriteByte('\n') + } + + err = os.WriteFile(path, buf.Bytes(), 0644) + if err != nil { + fmt.Println("输出赛码表失败:", err) + } else { + fmt.Println("输出赛码表成功:", path) + } +} diff --git a/pkg/dict/dict.go b/pkg/dict/dict.go new file mode 100644 index 0000000..117e268 --- /dev/null +++ b/pkg/dict/dict.go @@ -0,0 +1,81 @@ +package dict + +import ( + "github.com/nopdan/gosmq/pkg/matcher" + "github.com/nopdan/gosmq/pkg/text" +) + +type Dict struct { + *text.Text + // default, jisu, duoduo|bingling, jidian + format string + // 起顶码长 + push int + // 选重键 + selectKeys [][]byte + // 是否只用码表里的单字 + Single bool + // 匹配算法 greedy|ordered|dynamic + algorithm string + // 空格按键方式 both|left|right + SpacePref string + + Matcher matcher.Matcher + Length int // 词条数 +} + +type DictOption func(*Dict) + +func New(text *text.Text, opts ...DictOption) *Dict { + dict := &Dict{ + Text: text, + format: "default", + algorithm: "greedy", + SpacePref: "both", + } + for _, opt := range opts { + opt(dict) + } + dict.init() + return dict +} + +func WithFormat(format string) DictOption { + return func(opt *Dict) { + opt.format = format + } +} + +func WithPush(push int) DictOption { + return func(opt *Dict) { + opt.push = push + } +} + +func WithSelectKeys(keys string) DictOption { + return func(opt *Dict) { + res := make([][]byte, 0, 10) + for i := range len(keys) { + res = append(res, []byte{keys[i]}) + } + opt.selectKeys = res + } +} + +func WithSingle() DictOption { + return func(opt *Dict) { + opt.Single = true + } +} + +func WithAlgorithm(algorithm string) DictOption { + return func(opt *Dict) { + opt.algorithm = algorithm + } +} + +func WithSpacePref(spacePref string) DictOption { + return func(opt *Dict) { + opt.SpacePref = spacePref + } +} diff --git a/pkg/dict/duoduo.go b/pkg/dict/duoduo.go new file mode 100644 index 0000000..f3bc93a --- /dev/null +++ b/pkg/dict/duoduo.go @@ -0,0 +1,52 @@ +package dict + +import ( + "bufio" + "slices" + "strings" + + "github.com/nopdan/gosmq/pkg/util" +) + +// 加载多多或者冰凌码表 +func (d *Dict) loadTSV(wordFirst bool) []*Entry { + var cap int = 1e5 + if d.Size > 0 { + cap = d.Size / 32 + } + ret := make([]*Entry, 0, cap) + // 统计编码出现的次数 + stat := make(map[string]int) + var word, code string + var pos int + scan := bufio.NewScanner(d.Reader) + for scan.Scan() { + wc := strings.Split(scan.Text(), "\t") + if len(wc) < 2 { + continue + } + if wordFirst { + word, code = wc[0], wc[1] + } else { + word, code = wc[1], wc[0] + } + stat[code]++ + pos = stat[code] + code = d.addSuffix(code, pos) + ret = append(ret, &Entry{word, code, pos}) + d.insert(word, code, pos) + } + return ret +} + +// 加上选重键,pos 是编码出现的次数,最小为 1 +func (d *Dict) addSuffix(code string, pos int) string { + // 大于等于起顶码长,首选不用添加空格 _ + if pos == 1 && len(code) >= d.push { + return code + } + // 添加自定义选重键 + tmp := util.UnsafeToBytes(code) + tmp = slices.Concat(tmp, d.getSelectKey(pos)) + return util.UnsafeToString(tmp) +} diff --git a/internal/gen/jisu.go b/pkg/dict/jisu.go similarity index 57% rename from internal/gen/jisu.go rename to pkg/dict/jisu.go index d1e19ad..d87bc90 100644 --- a/internal/gen/jisu.go +++ b/pkg/dict/jisu.go @@ -1,4 +1,4 @@ -package gen +package dict import ( "bufio" @@ -8,14 +8,13 @@ import ( "github.com/nopdan/gosmq/pkg/util" ) -func (c *Config) LoadJisu() []*Entry { - ret := make([]*Entry, 0, 1e5) - rd, err := util.Read(c.Path) - if err != nil { - panic(err) +func (d *Dict) loadJisu() []*Entry { + var cap int = 1e5 + if d.Size > 0 { + cap = d.Size / 32 } - - scan := bufio.NewScanner(rd) + ret := make([]*Entry, 0, cap) + scan := bufio.NewScanner(d.Reader) for scan.Scan() { wc := strings.Split(scan.Text(), "\t") if len(wc) != 2 { @@ -29,7 +28,7 @@ func (c *Config) LoadJisu() []*Entry { continue } - pre, suf := FindSuffixInteger(code) + pre, suf := findSuffixInteger(code) // 不带数字 akdb ksdw if suf == "" { ret = append(ret, &Entry{word, code, 1}) @@ -42,17 +41,19 @@ func (c *Config) LoadJisu() []*Entry { pos = 10 } // 添加自定义选重键 - if pos <= len(c.SelectKeys) { - code = pre + string(c.SelectKeys[pos-1]) + if pos <= len(d.selectKeys) { + tmp := util.UnsafeToBytes(pre) + tmp = append(tmp, d.getSelectKey(pos)...) + code = util.UnsafeToString(tmp) } - // fmt.Println(wc[0], code, pos) ret = append(ret, &Entry{wc[0], code, pos}) + d.insert(word, code, pos) } return ret } // 查找末尾数字,返回前缀和后缀 -func FindSuffixInteger(s string) (string, string) { +func findSuffixInteger(s string) (string, string) { var prefix, suffix string for i := len(s) - 1; i >= 0; i-- { if s[i] >= '0' && s[i] <= '9' { @@ -65,3 +66,15 @@ func FindSuffixInteger(s string) (string, string) { // 全是数字 return s, "" } + +func (d *Dict) getSelectKey(num int) []byte { + if num < 1 { + return []byte{} + } + for num > len(d.selectKeys)-2 { + d.selectKeys = append(d.selectKeys, + util.UnsafeToBytes(strconv.Itoa(len(d.selectKeys)+1)), + ) + } + return d.selectKeys[num-1] +} diff --git a/pkg/dict/test_test.go b/pkg/dict/test_test.go new file mode 100644 index 0000000..cc75d40 --- /dev/null +++ b/pkg/dict/test_test.go @@ -0,0 +1,33 @@ +package dict + +import ( + "fmt" + "testing" +) + +func TestFindSuffixInt(t *testing.T) { + fmt.Println(findSuffixInteger("aaa2")) + fmt.Println(findSuffixInteger("aaa0")) + fmt.Println(findSuffixInteger("aaa22")) + fmt.Println(findSuffixInteger("aaa20")) + fmt.Println(findSuffixInteger("aaa02")) + fmt.Println(findSuffixInteger("aaa_")) + fmt.Println(findSuffixInteger("aaa")) +} + +func TestGetSelectKey(t *testing.T) { + d := New(nil, WithSelectKeys("_;'")) + fmt.Println(string(d.getSelectKey(1))) + fmt.Println(string(d.getSelectKey(2))) + fmt.Println(string(d.getSelectKey(3))) + fmt.Println(string(d.getSelectKey(4))) + fmt.Println(string(d.getSelectKey(10))) +} + +func TestSlice(t *testing.T) { + a := make([]byte, 4, 8) + b := a[:3] + b = append(b, 1, 2, 3) + fmt.Println(a) // [0 0 0 1] + fmt.Println(b) // [0 0 0 1 2 3] +} diff --git a/pkg/dict/xiaoxiao.go b/pkg/dict/xiaoxiao.go new file mode 100644 index 0000000..4540553 --- /dev/null +++ b/pkg/dict/xiaoxiao.go @@ -0,0 +1,30 @@ +package dict + +import ( + "bufio" + "strings" +) + +// 小小|极点 +func (d *Dict) loadXiao() []*Entry { + var cap int = 1e5 + if d.Size > 0 { + cap = d.Size / 32 + } + ret := make([]*Entry, 0, cap) + scan := bufio.NewScanner(d.Reader) + for scan.Scan() { + wc := strings.Split(scan.Text(), " ") + if len(wc) < 2 { + continue + } + code := wc[0] + for i := 1; i < len(wc); i++ { + word := wc[i] + code = d.addSuffix(code, i) + ret = append(ret, &Entry{word, code, i}) + d.insert(word, code, i) + } + } + return ret +} diff --git a/pkg/feeling/comb.go b/pkg/feeling/comb.go index af1f640..0da5817 100644 --- a/pkg/feeling/comb.go +++ b/pkg/feeling/comb.go @@ -1,12 +1,12 @@ package feeling import ( + "bufio" "bytes" "strconv" + "strings" _ "embed" - - "github.com/nopdan/gosmq/pkg/util" ) //go:embed assets/equivalent.txt @@ -25,50 +25,50 @@ type distrib struct { } // 1MB -var Comb [128][128]*distrib +var combination [128][128]*distrib func init() { rd := bytes.NewReader(equivalent) - tsv := util.NewTSV(rd) - + scan := bufio.NewScanner(rd) // 当量 - for { - line, err := tsv.Read("\t") - if err != nil { - break - } + for scan.Scan() { + line := strings.Split(scan.Text(), "\t") if len(line) != 2 { continue } code := line[0] dl, _ := strconv.ParseFloat(line[1], 64) - if Comb[code[0]][code[1]] == nil { - Comb[code[0]][code[1]] = new(distrib) + if combination[code[0]][code[1]] == nil { + combination[code[0]][code[1]] = new(distrib) } - Comb[code[0]][code[1]].Equivalent = dl + combination[code[0]][code[1]].Equivalent = dl } rd.Reset(fingering) - tsv = util.NewTSV(rd) + scan = bufio.NewScanner(rd) // 小跨排 - line, _ := tsv.Read(" ") + scan.Scan() + line := strings.Split(scan.Text(), " ") for _, v := range line { - Comb[v[0]][v[1]].SingleSpan = true + combination[v[0]][v[1]].SingleSpan = true } // 大跨排 - line, _ = tsv.Read(" ") + scan.Scan() + line = strings.Split(scan.Text(), " ") for _, v := range line { - Comb[v[0]][v[1]].MultiSpan = true + combination[v[0]][v[1]].MultiSpan = true } // 错手 - line, _ = tsv.Read(" ") + scan.Scan() + line = strings.Split(scan.Text(), " ") for _, v := range line { - Comb[v[0]][v[1]].Staggered = true + combination[v[0]][v[1]].Staggered = true } // 小指干扰 - line, _ = tsv.Read(" ") + scan.Scan() + line = strings.Split(scan.Text(), " ") for _, v := range line { - Comb[v[0]][v[1]].Disturb = true + combination[v[0]][v[1]].Disturb = true } } diff --git a/pkg/feeling/feel.go b/pkg/feeling/feel.go new file mode 100644 index 0000000..9ec3969 --- /dev/null +++ b/pkg/feeling/feel.go @@ -0,0 +1,126 @@ +package feeling + +import ( + "github.com/nopdan/gosmq/pkg/result" +) + +type feeling struct { + mRes *result.MatchRes + spacePref string + + key byte + isLeft bool + finger byte + lastKey byte + lastIsLeft bool + lastFinger byte + last2Key byte +} + +func New(target *result.MatchRes, spacePref string) *feeling { + return &feeling{mRes: target, spacePref: spacePref} +} + +// 处理当前按键,并更新状态。需要在 Process 退出前调用 +func (f *feeling) step() { + currKey := f.key + if currKey == '_' { + switch f.spacePref { + case "right": + currKey = '+' + case "both", "": // "both" + // 如果上一个键是左手 + if f.lastFinger != 0 && f.lastIsLeft { + currKey = '+' + } + } + } + f.mRes.KeysDist[currKey]++ + f.last2Key, f.lastKey = f.lastKey, f.key + f.lastIsLeft, f.lastFinger = f.isLeft, f.finger +} + +// 传入的 key 必须为 a-z0-9,./;'[]-= 中的一个 +// +// 特别的,传入大写字母自动转为小写,传入空格_,处理右手击键为+ +func (f *feeling) Process(key byte) { + mRes := f.mRes + // 跳过 + if key >= 128 { + return + } + // magic: 将大写字母转为小写 + if 'A' <= key && key <= 'Z' { + key |= 32 + } + + f.key = key + f.isLeft, f.finger = KeyPos(f.key) + // 如果当前键或者上一个键不合法(不在46键里) + if f.lastFinger == 0 || f.finger == 0 { + // 当前键不是第一个按键 + if f.lastKey != 0 { + mRes.Equivalent += 2.0 + mRes.Combs.Count++ + } + f.step() + return + } + + comb := combination[f.lastKey][f.key] + // 当量表里找不到 + if comb == nil { + mRes.Equivalent += 2.0 + mRes.Combs.Count++ + f.step() + return + } + + // 左右手分布 + if f.lastIsLeft { + if f.isLeft { + mRes.Hands.LL++ + } else { + mRes.Hands.LR++ + } + } else { + if f.isLeft { + mRes.Hands.RL++ + } else { + mRes.Hands.RR++ + } + } + + // 同指 + if f.finger == f.lastFinger { + mRes.Combs.SameFingers++ + } + // 同键、三连击 + if f.key == f.lastKey { + mRes.Combs.DoubleHit++ + if f.key == f.last2Key { + mRes.Combs.TribleHit++ + } + } + // 小跨排 + if comb.SingleSpan { + mRes.Combs.SingleSpan++ + } + // 大跨排 + if comb.MultiSpan { + mRes.Combs.MultiSpan++ + } + // 错手 + if comb.Staggered { + mRes.Combs.Staggered++ + } + // 小拇指干扰 + if comb.Disturb { + mRes.Combs.Disturb++ + } + + mRes.Equivalent += comb.Equivalent + mRes.Combs.Count++ + f.step() + return +} diff --git a/pkg/feeling/test_test.go b/pkg/feeling/feeling_test.go similarity index 54% rename from pkg/feeling/test_test.go rename to pkg/feeling/feeling_test.go index e68770a..c4ce699 100644 --- a/pkg/feeling/test_test.go +++ b/pkg/feeling/feeling_test.go @@ -6,17 +6,17 @@ import ( ) func TestComb(t *testing.T) { - a := Comb['c']['f'] + a := combination['c']['f'] fmt.Println(a.Equivalent) - a = Comb['f']['r'] + a = combination['f']['r'] fmt.Println(a.SingleSpan) - a = Comb['b']['t'] + a = combination['b']['t'] fmt.Println(a.MultiSpan) - a = Comb['x']['e'] + a = combination['x']['e'] fmt.Println(a.Staggered) - a = Comb['a']['w'] + a = combination['a']['w'] fmt.Println(a.Disturb) - a = Comb['c']['c'] + a = combination['c']['c'] fmt.Println(a.Staggered) } @@ -30,3 +30,17 @@ func TestKeyPos(t *testing.T) { string(keys[i]), isLeft, finger) } } + +type Result struct{} + +func TestRace(t *testing.T) { + res := make([][]*Result, 3) + fmt.Println(res) + for i := range 3 { + fmt.Println(res[i]) + res[i] = append(res[i], &Result{}, &Result{}, &Result{}) + fmt.Println(res[i]) + res[i] = make([]*Result, 4) + fmt.Println(res[i]) + } +} diff --git a/pkg/matcher/trie.go b/pkg/matcher/trie.go index 3d24683..446654b 100644 --- a/pkg/matcher/trie.go +++ b/pkg/matcher/trie.go @@ -13,8 +13,8 @@ type trie struct { tails []tail useTail bool // 是否压缩 tail - count uint32 // 插入词的数量 - stable bool // 是否按照码表的顺序 + count uint32 // 插入词的数量 + ordered bool // 是否按照码表的顺序 } type trieNode struct { @@ -36,11 +36,11 @@ type tail struct { valueIdx int32 } -func NewTrie(stable bool, useTail bool) *trie { +func NewTrie(ordered bool, useTail bool) *trie { t := new(trie) t.root = newTrieNode() t.values = make([]value, 0, 1e4) - t.stable = stable + t.ordered = ordered if useTail { t.useTail = useTail t.tails = make([]tail, 0, 1000) @@ -76,7 +76,7 @@ func (t *trie) Insert(word, code string, pos int) { } // 已经存在的词 // 取排在前面的 - if t.stable { + if t.ordered { return } // 取码长较短的 @@ -155,7 +155,7 @@ func (t *trie) Match(text []rune) (int, string, int) { if slices.Equal(_tail.runes, text[p:p+len(_tail.runes)]) { val := &t.values[_tail.valueIdx] // 跳过码表顺序在后面的词 - if t.stable && res.order != 0 && val.order > res.order { + if t.ordered && res.order != 0 && val.order > res.order { return } wordLen = p + len(_tail.runes) @@ -172,7 +172,7 @@ func (t *trie) Match(text []rune) (int, string, int) { if node.valueIdx != -1 { val := &t.values[node.valueIdx] // 跳过码表顺序在后面的词 - if t.stable && res.order != 0 && val.order > res.order { + if t.ordered && res.order != 0 && val.order > res.order { } else { wordLen = p res = val diff --git a/pkg/result/match_res.go b/pkg/result/match_res.go new file mode 100644 index 0000000..176d98e --- /dev/null +++ b/pkg/result/match_res.go @@ -0,0 +1,200 @@ +package result + +import ( + "fmt" + "sort" + + "github.com/nopdan/gosmq/pkg/util" +) + +type CodePosCount struct { + Code string + Pos int + Count int +} + +type WordCode struct { + Word string + Code string +} + +// 匹配一段文字得到的信息 +type MatchRes struct { + TextIdx int // 文章索引 + DictIdx int // 码表索引 + + PartIdx int // 文章分段索引 + Segment []WordCode + segments []struct { + // 分段索引 + PartIdx int + // 每段的分词结果 + Segment []WordCode + } + + // 每个词条对应的编码,以及出现的次数 + StatData map[string]*CodePosCount + + KeysDist [128]int // 按键分布 + NotHanMap map[rune]struct{} // 非汉字 + LackMap map[rune]struct{} // 缺失的汉字 + + CodeLenDist []int // 码长 + WordLenDist []int // 词长 + CollisionDist []int // 选重 + Equivalent float64 // 总当量 + + Commit struct { + Count int // 上屏数 + Word int // 打词数 + WordChars int // 打词字数 + WordFirst int // 首选词 + + Collision int // 选重 + CollisionChars int // 选重字数 + } + + Combs struct { + Count int // 按键组合数 + SameFingers int // 同指 + DoubleHit int // 同键双击 + TribleHit int // 同键三连击 + SingleSpan int // 小跨排 + MultiSpan int // 大跨排 + Staggered int // 错手 + Disturb int // 小指干扰 + } + + Hands struct { + LL int + LR int + RL int + RR int + } +} + +func NewMatchRes() *MatchRes { + mRes := new(MatchRes) + mRes.Segment = make([]WordCode, 0) + mRes.segments = make([]struct { + PartIdx int + Segment []WordCode + }, 0) + mRes.StatData = make(map[string]*CodePosCount) + + mRes.NotHanMap = make(map[rune]struct{}) + mRes.LackMap = make(map[rune]struct{}) + mRes.CodeLenDist = make([]int, 0, 10) + mRes.WordLenDist = make([]int, 0, 10) + mRes.CollisionDist = make([]int, 0, 10) + return mRes +} + +// 将每次匹配得到的信息追加到总结果 +func (m *MatchRes) Combine(mRes *MatchRes) { + // 第一个 MatchRes 为总结果 + if len(m.segments) == 0 { + m.segments = append(m.segments, struct { + PartIdx int + Segment []WordCode + }{m.PartIdx, m.Segment}) + } + if len(mRes.Segment) != 0 { + m.segments = append(m.segments, struct { + PartIdx int + Segment []WordCode + }{mRes.PartIdx, mRes.Segment}) + } + for k, v := range mRes.StatData { + if _, ok := m.StatData[k]; !ok { + m.StatData[k] = v + } else { + m.StatData[k].Count += v.Count + } + } + for i := 33; i < 128; i++ { + m.KeysDist[i] += mRes.KeysDist[i] + } + for k := range mRes.NotHanMap { + m.NotHanMap[k] = struct{}{} + } + for k := range mRes.LackMap { + m.LackMap[k] = struct{}{} + } + for i := range mRes.CodeLenDist { + util.AddTo(mRes.CodeLenDist[i], &m.CodeLenDist, i) + } + for i := range mRes.WordLenDist { + util.AddTo(mRes.WordLenDist[i], &m.WordLenDist, i) + } + for i := range mRes.CollisionDist { + util.AddTo(mRes.CollisionDist[i], &m.CollisionDist, i) + } + m.Equivalent += mRes.Equivalent + + m.Commit.Count += mRes.Commit.Count + m.Commit.Word += mRes.Commit.Word + m.Commit.WordChars += mRes.Commit.WordChars + m.Commit.WordFirst += mRes.Commit.WordFirst + m.Commit.Collision += mRes.Commit.Collision + m.Commit.CollisionChars += mRes.Commit.CollisionChars + + m.Combs.Count += mRes.Combs.Count + m.Combs.SameFingers += mRes.Combs.SameFingers + m.Combs.DoubleHit += mRes.Combs.DoubleHit + m.Combs.TribleHit += mRes.Combs.TribleHit + m.Combs.SingleSpan += mRes.Combs.SingleSpan + m.Combs.MultiSpan += mRes.Combs.MultiSpan + m.Combs.Staggered += mRes.Combs.Staggered + m.Combs.Disturb += mRes.Combs.Disturb + + m.Hands.LL += mRes.Hands.LL + m.Hands.LR += mRes.Hands.LR + m.Hands.RL += mRes.Hands.RL + m.Hands.RR += mRes.Hands.RR +} + +func (m *MatchRes) Print(detailed bool) { + if m.segments != nil && detailed { + sort.Slice(m.segments, func(i, j int) bool { + return m.segments[i].PartIdx < m.segments[j].PartIdx + }) + for i := range m.segments { + for j := range m.segments[i].Segment { + fmt.Printf("part: %d word: %s code: %s\n", + m.segments[i].PartIdx, m.segments[i].Segment[j].Word, m.segments[i].Segment[j].Code) + } + } + } + + for b, count := range m.KeysDist { + if count != 0 { + fmt.Printf("key: %s count: %d\n", string(rune(b)), count) + } + } + + notHan := "" + for k := range m.NotHanMap { + notHan += string(k) + } + fmt.Printf("not han: %s\n", notHan) + lackHan := "" + for k := range m.LackMap { + lackHan += string(k) + } + fmt.Printf("lack han: %s\n", lackHan) + + fmt.Printf("code len dist: %v\n", m.CodeLenDist) + fmt.Printf("word len dist: %v\n", m.WordLenDist) + fmt.Printf("collision dist: %v\n", m.CollisionDist) + fmt.Printf("equivalent: %f\n", m.Equivalent) + + fmt.Printf("commit: %+v\n", m.Commit) + fmt.Printf("combs: %+v\n", m.Combs) + fmt.Printf("hands: %+v\n", m.Hands) +} + +func (m *MatchRes) ToResult() *Result { + // TODO + return nil +} diff --git a/pkg/smq/result.go b/pkg/result/result.go similarity index 96% rename from pkg/smq/result.go rename to pkg/result/result.go index c4020b2..6ed47cb 100644 --- a/pkg/smq/result.go +++ b/pkg/result/result.go @@ -1,4 +1,4 @@ -package smq +package result // count and rate type CountRate struct { @@ -6,6 +6,12 @@ type CountRate struct { Rate float64 } +type wcIdx struct { + idx int + wordSli []string + codeSli []string +} + type Result struct { TextName string TextLen int // 文本字数 @@ -101,7 +107,7 @@ type hands struct { RR CountRate `json:"RightToRight"` // 右右 } -func newResult() *Result { +func NewResult() *Result { res := new(Result) res.notHanMap = make(map[rune]struct{}, 20) res.lackMap = make(map[rune]struct{}, 20) diff --git a/pkg/smq/stat.go b/pkg/result/stat.go similarity index 88% rename from pkg/smq/stat.go rename to pkg/result/stat.go index 05fe7f3..1a536d0 100644 --- a/pkg/smq/stat.go +++ b/pkg/result/stat.go @@ -1,9 +1,13 @@ -package smq +package result import ( "sort" + + "github.com/nopdan/gosmq/pkg/util" ) +var div = util.Div[int] + func (res *Result) stat() { for i, v := range res.Words.Dist { @@ -76,15 +80,17 @@ func (res *Result) statFeel() { res.Hands.Same.Rate = div(res.Hands.Same.Count, res.Combs.Count) res.Hands.Diff.Rate = div(res.Hands.Diff.Count, res.Combs.Count) // fingers - for i := 33; i < 128; i++ { - if keyPos := KeyPosArr[i]; keyPos.Fin == 0 { - res.Fingers.Dist[10].Count += res.keysDist[i] - } else if keyPos.Fin == 10 { - res.Fingers.Dist[0].Count += res.keysDist[i] - } else { - res.Fingers.Dist[keyPos.Fin].Count += res.keysDist[i] - } - } + // TODO + // for i := byte(33); i < 128; i++ { + // _, finger := feeling.KeyPos(i) + // if finger == 0 { + // res.Fingers.Dist[10].Count += res.keysDist[i] + // } else if finger == 10 { + // res.Fingers.Dist[0].Count += res.keysDist[i] + // } else { + // res.Fingers.Dist[finger].Count += res.keysDist[i] + // } + // } for i := range res.Fingers.Dist { v := &res.Fingers.Dist[i] v.Rate = div(v.Count, res.CodeLen.Total) diff --git a/pkg/smq/config.go b/pkg/smq/config.go new file mode 100644 index 0000000..a3d2c1b --- /dev/null +++ b/pkg/smq/config.go @@ -0,0 +1,121 @@ +package smq + +import ( + "runtime" + "sync" + + "github.com/nopdan/gosmq/pkg/dict" + "github.com/nopdan/gosmq/pkg/result" + "github.com/nopdan/gosmq/pkg/text" +) + +type Config struct { + textList []*text.Text + dictList []*dict.Dict + + Clean bool // 只统计词库中的词条 + Split bool // 统计分词结果 + Stat bool // 统计每个词条出现的次数 +} + +// 逻辑 CPU 数量(线程数) +var NUM_CPU = runtime.NumCPU() + +type SmqOption func(*Config) + +func New(opts ...SmqOption) *Config { + c := new(Config) + c.textList = make([]*text.Text, 0) + c.dictList = make([]*dict.Dict, 0) + for _, opt := range opts { + opt(c) + } + return c +} + +func WithClean() SmqOption { + return func(c *Config) { + c.Clean = true + } +} + +func WithSplit() SmqOption { + return func(c *Config) { + c.Split = true + } +} + +func WithStat() SmqOption { + return func(c *Config) { + c.Stat = true + } +} + +func (c *Config) AddText(textList ...*text.Text) { + c.textList = append(c.textList, textList...) +} + +func (c *Config) AddDict(dictList ...*dict.Dict) { + c.dictList = append(c.dictList, dictList...) +} + +func (c *Config) Race() [][]*result.Result { + // 限制并发数量 + ch := make(chan *result.MatchRes, NUM_CPU) + var wg sync.WaitGroup + for i, text := range c.textList { + // 分段计算当前文章,pIdx 为每一段的索引 + pIdx := 0 + for { + text, err := text.Iter() + // fmt.Println(util.UnsafeToString(text)) + if err != nil { + break + } + for j, dict := range c.dictList { + wg.Add(1) + // go 1.22 修复了 range 循环问题 + go func() { + defer wg.Done() + mRes := c.match(text, dict) + mRes.TextIdx = i + mRes.DictIdx = j + mRes.PartIdx = pIdx + ch <- mRes + }() + } + pIdx++ + } + } + + // 文章数量和码表数量 + var tNum, dNum = len(c.textList), len(c.dictList) + mRes := make([][]*result.MatchRes, tNum) + for i := range tNum { + mRes[i] = make([]*result.MatchRes, dNum) + for j := range dNum { + mRes[i][j] = result.NewMatchRes() + } + } + + go func() { + wg.Wait() + close(ch) + }() + + // 循环从 ch 通道中接受值 + for part := range ch { + mRes[part.TextIdx][part.DictIdx].Combine(part) + } + + res := make([][]*result.Result, tNum) + for i := range tNum { + res[i] = make([]*result.Result, dNum) + for j := range dNum { + // TODO + mRes[i][j].Print(false) + res[i][j] = mRes[i][j].ToResult() + } + } + return res +} diff --git a/pkg/smq/dict.go b/pkg/smq/dict.go deleted file mode 100644 index cffa9fe..0000000 --- a/pkg/smq/dict.go +++ /dev/null @@ -1,87 +0,0 @@ -package smq - -import ( - "bufio" - "fmt" - "io" - "strconv" - "strings" - - "github.com/nopdan/gosmq/pkg/matcher" - "github.com/nopdan/gosmq/pkg/util" -) - -type Dict struct { - Name string // 码表名 - Single bool // 单字模式 - Stable bool // 按照码表顺序 - UseTail bool // 压缩 tail - Clean bool // 只统计词库中的词条 - - // 空格按键方式 left|right|both - PressSpaceBy string - - Split bool // 统计分词数据并输出 - Stat bool // 统计词条数据并输出 - - matcher matcher.Matcher // 初始化 Matcher - reader io.Reader // 赛码表 io 流 - length int // 词条数 -} - -// 从文件加载码表 -func (dict *Dict) Load(path string) { - rd, err := util.Read(path) - if err != nil { - fmt.Println("Warning! 读取文件失败:", err) - return - } - if dict.Name == "" { - dict.Name = util.GetFileName(path) - } - dict.reader = rd - dict.init() -} - -// 从字符串加载码表 -func (dict *Dict) LoadString(text, name string) { - if text == "" { - fmt.Println("Warning! 码表输入为空。") - return - } - dict.Name = name - dict.reader = strings.NewReader(text) - dict.init() -} - -// 初始化 Dict -func (dict *Dict) init() { - // 匹配算法 - if dict.Single { - dict.matcher = matcher.NewSingle() - } - if dict.matcher == nil { - dict.matcher = matcher.NewTrie(dict.Stable, dict.UseTail) - } - m := dict.matcher - - // 读取码表,构建 matcher - scan := bufio.NewScanner(dict.reader) - for scan.Scan() { - wc := strings.Split(scan.Text(), "\t") - pos := 1 - if len(wc) == 3 { - pos, _ = strconv.Atoi(wc[2]) - } else if len(wc) != 2 { - continue - } - if dict.Single { - if len([]rune(wc[0])) != 1 { - continue - } - } - dict.length++ - m.Insert(wc[0], wc[1], pos) - } - m.Build() -} diff --git a/pkg/smq/feel.go b/pkg/smq/feel.go deleted file mode 100644 index b966fc7..0000000 --- a/pkg/smq/feel.go +++ /dev/null @@ -1,103 +0,0 @@ -package smq - -import ( - "github.com/nopdan/gosmq/pkg/feeling" -) - -type KeyPos = feeling.KeyPos - -var KeyPosArr = feeling.KeyPosArr - -// 上 两 键,当前键,前一键的状态 => 当前键,当前键的状态 -func (mRes *matchRes) feel(last2Key, lastKey, currKey byte, last KeyPos, dict *Dict) (byte, KeyPos) { - if currKey >= 128 { - return lastKey, last - } - // for key - // 利用或操作 | 和空格将英文字符转换为小写 - if 'A' <= currKey && currKey <= 'Z' { - currKey |= ' ' - } - // 处理空格 - // if currKey == ' ' { - // currKey = '_' - // } - var origin = currKey - if currKey == '_' { - switch dict.PressSpaceBy { - case "right": - currKey = '+' - case "both", "": // "both" - // 如果上一个键是左手 - if last.Fin != 0 && last.IsLeft { - currKey = '+' - } - } - } - mRes.keysDist[currKey]++ - - // 如果当前键或者上一个键不合法(不在41键里) - // 当量增加1.5,继续下一个循环 - curr := KeyPosArr[currKey] - if last.Fin == 0 || curr.Fin == 0 { - mRes.toTalEq10 += 15 - mRes.CombsCount++ - return origin, curr - } - - // for comb - comb := feeling.Comb[lastKey][origin] - // 当量表里找不到 - if comb == 0 { - mRes.toTalEq10 += 15 - mRes.CombsCount++ - return origin, curr - } - mRes.toTalEq10 += int(comb >> 8) - mRes.CombsCount++ - - // for finger - if curr.Fin == last.Fin { - mRes.SameFingers++ - } - // for hands - if last.IsLeft { - if curr.IsLeft { - mRes.Hands.LL++ - } else { - mRes.Hands.LR++ - } - } else { - if curr.IsLeft { - mRes.Hands.RL++ - } else { - mRes.Hands.RR++ - } - } - - // 同键、三连击 - if currKey == lastKey { - mRes.Combs.DoubleHit++ - if currKey == last2Key { - mRes.Combs.TribleHit++ - } - } - // 小跨排 - if comb&feeling.IsXKP != 0 { - mRes.Combs.SingleSpan++ - } - // 大跨排 - if comb&feeling.IsDKP != 0 { - mRes.Combs.MultiSpan++ - } - // 错手 - if comb&feeling.IsCS != 0 { - mRes.Combs.LongFingersDisturb++ - } - // 小拇指干扰 - if comb&feeling.IsXZGR != 0 { - mRes.Combs.LittleFingersDisturb++ - } - - return origin, curr -} diff --git a/pkg/smq/match.go b/pkg/smq/match.go index 10423e0..f5583ec 100644 --- a/pkg/smq/match.go +++ b/pkg/smq/match.go @@ -3,48 +3,44 @@ package smq import ( "unicode" "unsafe" + + "github.com/nopdan/gosmq/pkg/dict" + "github.com/nopdan/gosmq/pkg/feeling" + "github.com/nopdan/gosmq/pkg/result" + "github.com/nopdan/gosmq/pkg/util" ) -func match(buffer []byte, dict *Dict) *matchRes { +func (c *Config) match(buffer []byte, dict *dict.Dict) *result.MatchRes { // 初始化 - mRes := new(matchRes) - mRes.wordSlice = make([]string, 0, len(buffer)/3) - mRes.codeSlice = make([]string, 0, len(buffer)/3) - mRes.statData = make(map[string]*CodePosCount) - - mRes.notHanMap = make(map[rune]struct{}) - mRes.lackMap = make(map[rune]struct{}) - mRes.CodeLenDist = make([]int, 0) - mRes.WordsDist = make([]int, 0) - mRes.CollisionDist = make([]int, 0) - - // 前面的键 - var last2Key, lastKey byte - var last KeyPos + mRes := result.NewMatchRes() + feel := feeling.New(mRes, dict.SpacePref) Handler := func(word, code string, wordLen, pos int) { - AddTo(&mRes.WordsDist, wordLen) - AddTo(&mRes.CollisionDist, pos) - AddTo(&mRes.CodeLenDist, len(code)) + util.Increase(&mRes.WordLenDist, wordLen) + util.Increase(&mRes.CollisionDist, pos) + util.Increase(&mRes.CodeLenDist, len(code)) for i := 0; i < len(code); i++ { - tmpKey, tmp := mRes.feel(last2Key, lastKey, code[i], last, dict) - last2Key = lastKey - lastKey, last = tmpKey, tmp + feel.Process(code[i]) } // 启用分词 - if dict.Split { - mRes.wordSlice = append(mRes.wordSlice, word) - mRes.codeSlice = append(mRes.codeSlice, code) + if c.Split { + mRes.Segment = append(mRes.Segment, result.WordCode{ + Word: word, + Code: code, + }) } // 启用统计 - if dict.Stat { - if _, ok := mRes.statData[word]; !ok { - mRes.statData[word] = &CodePosCount{code, pos, 1} + if c.Stat { + if _, ok := mRes.StatData[word]; !ok { + mRes.StatData[word] = &result.CodePosCount{ + Code: code, + Pos: pos, + Count: 1} } else { - mRes.statData[word].Count++ + mRes.StatData[word].Count++ } } } @@ -53,13 +49,11 @@ func match(buffer []byte, dict *Dict) *matchRes { isHan := unicode.Is(unicode.Han, char) // 非汉字 if !isHan { - mRes.notHanMap[char] = struct{}{} - mRes.NotHanCount++ + mRes.NotHanMap[char] = struct{}{} } // 缺汉字 if lack && isHan { - mRes.lackMap[char] = struct{}{} - mRes.LackCount++ + mRes.LackMap[char] = struct{}{} } } @@ -75,24 +69,24 @@ func match(buffer []byte, dict *Dict) *matchRes { p++ continue } - mRes.Commits++ + mRes.Commit.Count++ - wordLen, code, pos := dict.matcher.Match(text[p:]) + wordLen, code, pos := dict.Matcher.Match(text[p:]) // 匹配到了 if wordLen != 0 { sWord := string(text[p : p+wordLen]) // 打词 if wordLen >= 2 { - mRes.WordsCommitsCount++ - mRes.WordsCharsCount += wordLen + mRes.Commit.Word++ + mRes.Commit.WordChars += wordLen if pos == 1 { - mRes.WordsFirstCount++ + mRes.Commit.WordFirst++ } } // 选重 if pos >= 2 { - mRes.CollisionCommitsCount++ - mRes.CollisionCharsCount += wordLen + mRes.Commit.Collision++ + mRes.Commit.CollisionChars += wordLen } // 对每个字都进行判断 for i := 0; i < wordLen; i++ { @@ -104,8 +98,8 @@ func match(buffer []byte, dict *Dict) *matchRes { } // 匹配不到 - if dict.Clean { - mRes.Commits-- + if c.Clean { + mRes.Commit.Count-- p++ continue } @@ -136,8 +130,8 @@ func match(buffer []byte, dict *Dict) *matchRes { continue } } - // 找不到的符号,设为 "####" - Handler(sWord, "####", 1, 1) + // 找不到的符号,设为 "######" + Handler(sWord, "######", 1, 1) p++ } return mRes diff --git a/pkg/smq/match_res.go b/pkg/smq/match_res.go deleted file mode 100644 index 601c382..0000000 --- a/pkg/smq/match_res.go +++ /dev/null @@ -1,117 +0,0 @@ -package smq - -type CodePosCount struct { - Code string - Pos int - Count int -} - -// 匹配一段文字得到的信息 -type matchRes struct { - dictIdx int // 码表索引 - - wordSlice []string - codeSlice []string - statData map[string]*CodePosCount - - keysDist [128]int - notHanMap map[rune]struct{} - lackMap map[rune]struct{} - - Commits int - - NotHanCount int // 非汉字计数 - LackCount int - - WordsCommitsCount int - WordsCharsCount int - WordsFirstCount int - - CollisionCommitsCount int - CollisionCharsCount int - - CodeLenDist []int - WordsDist []int - CollisionDist []int - - toTalEq10 int - CombsCount int - - SameFingers int - Hands struct { - LL int - LR int - RL int - RR int - } - Combs struct { - DoubleHit int - TribleHit int - SingleSpan int - MultiSpan int - LongFingersDisturb int - LittleFingersDisturb int - } -} - -// 将每次匹配得到的信息追加到总结果 -func (res *Result) append(mRes *matchRes, dict *Dict, idx int) { - if dict.Stat { - for k, v := range mRes.statData { - if _, ok := res.statData[k]; !ok { - res.statData[k] = v - } else { - res.statData[k].Count += v.Count - } - } - } - if dict.Split { - res.wcIdxs = append(res.wcIdxs, wcIdx{idx, mRes.wordSlice, mRes.codeSlice}) - } - res.Basic.Commits += mRes.Commits - res.Basic.NotHanCount += mRes.NotHanCount - res.Basic.LackCount += mRes.LackCount - - res.Words.Commits.Count += mRes.WordsCommitsCount - res.Words.Chars.Count += mRes.WordsCharsCount - res.Words.FirstCount += mRes.WordsFirstCount - res.Collision.Commits.Count += mRes.CollisionCommitsCount - res.Collision.Chars.Count += mRes.CollisionCharsCount - - res.toTalEq10 += mRes.toTalEq10 - res.Combs.Count += mRes.CombsCount - res.Fingers.Same.Count += mRes.SameFingers - - res.Hands.LL.Count += mRes.Hands.LL - res.Hands.LR.Count += mRes.Hands.LR - res.Hands.RL.Count += mRes.Hands.RL - res.Hands.RR.Count += mRes.Hands.RR - - res.Combs.DoubleHit.Count += mRes.Combs.DoubleHit - res.Combs.TribleHit.Count += mRes.Combs.TribleHit - res.Combs.SingleSpan.Count += mRes.Combs.SingleSpan - res.Combs.MultiSpan.Count += mRes.Combs.MultiSpan - res.Combs.LongFingersDisturb.Count += mRes.Combs.LongFingersDisturb - res.Combs.LittleFingersDisturb.Count += mRes.Combs.LittleFingersDisturb - - for i := 33; i < 128; i++ { - res.keysDist[i] += mRes.keysDist[i] - } - for k := range mRes.notHanMap { - res.notHanMap[k] = struct{}{} - } - for k := range mRes.lackMap { - res.lackMap[k] = struct{}{} - } - - for i, v := range mRes.CodeLenDist { - AddToVal(&res.CodeLen.Dist, i, v) - } - - for i, v := range mRes.WordsDist { - AddToVal(&res.Words.Dist, i, v) - } - for i, v := range mRes.CollisionDist { - AddToVal(&res.Collision.Dist, i, v) - } -} diff --git a/pkg/smq/merge.go b/pkg/smq/merge.go deleted file mode 100644 index 9e54b75..0000000 --- a/pkg/smq/merge.go +++ /dev/null @@ -1,65 +0,0 @@ -package smq - -// 合并一段文本多个码表生成的结果 -func mergeRes(resA, resB []*Result, dicts []*Dict) { - if len(resA) != len(resB) || len(resA) != len(dicts) { - panic("程序运行错误:merge,两个结果长度不相等,请联系作者") - } - for j, res := range resA { - if dicts[j].Stat { - for k, v := range resB[j].statData { - if _, ok := res.statData[k]; !ok { - res.statData[k] = v - } else { - res.statData[k].Count += v.Count - } - } - } - res.Basic.Commits += resB[j].Basic.Commits - res.Basic.NotHanCount += resB[j].Basic.NotHanCount - res.Basic.LackCount += resB[j].Basic.LackCount - - res.Words.Commits.Count += resB[j].Words.Commits.Count - res.Words.Chars.Count += resB[j].Words.Chars.Count - res.Words.FirstCount += resB[j].Words.FirstCount - res.Collision.Commits.Count += resB[j].Collision.Commits.Count - res.Collision.Chars.Count += resB[j].Collision.Chars.Count - - res.toTalEq10 += resB[j].toTalEq10 - res.Combs.Count += resB[j].Combs.Count - res.Fingers.Same.Count += resB[j].Fingers.Same.Count - - res.Hands.LL.Count += resB[j].Hands.LL.Count - res.Hands.LR.Count += resB[j].Hands.LR.Count - res.Hands.RL.Count += resB[j].Hands.RL.Count - res.Hands.RR.Count += resB[j].Hands.RR.Count - - res.Combs.DoubleHit.Count += resB[j].Combs.DoubleHit.Count - res.Combs.TribleHit.Count += resB[j].Combs.TribleHit.Count - res.Combs.SingleSpan.Count += resB[j].Combs.SingleSpan.Count - res.Combs.MultiSpan.Count += resB[j].Combs.MultiSpan.Count - res.Combs.LongFingersDisturb.Count += resB[j].Combs.LongFingersDisturb.Count - res.Combs.LittleFingersDisturb.Count += resB[j].Combs.LittleFingersDisturb.Count - - for i := 33; i < 128; i++ { - res.keysDist[i] += resB[j].keysDist[i] - } - for k := range resB[j].notHanMap { - res.notHanMap[k] = struct{}{} - } - for k := range resB[j].lackMap { - res.lackMap[k] = struct{}{} - } - - for i, v := range resB[j].CodeLen.Dist { - AddToVal(&res.CodeLen.Dist, i, v) - } - - for i, v := range resB[j].Words.Dist { - AddToVal(&res.Words.Dist, i, v) - } - for i, v := range resB[j].Collision.Dist { - AddToVal(&res.Collision.Dist, i, v) - } - } -} diff --git a/pkg/smq/smq.go b/pkg/smq/smq.go deleted file mode 100644 index 751e37a..0000000 --- a/pkg/smq/smq.go +++ /dev/null @@ -1,160 +0,0 @@ -package smq - -import ( - "bufio" - "io" - "os" - "strings" - "sync" - - "github.com/nopdan/gosmq/pkg/util" -) - -type Text struct { - Name string // 文本名 - - reader io.Reader // 文本 - bufLen int -} - -// 从文件添加文本 -func (t *Text) Load(path string) error { - t.Name = util.GetFileName(path) - f, err := os.Open(path) - if err != nil { - return err - } - fi, _ := f.Stat() - if fi.Size() < 4<<20 { - // 4MB 以下 64KB - t.bufLen = 64 << 10 - } else { - // 其他 256KB - t.bufLen = 256 << 10 - } - // fmt.Println("buffer size", s.bufLen) - t.reader = util.NewReader(f) - return nil -} - -func (t *Text) LoadString(name, text string) { - t.Name = name - t.reader = strings.NewReader(text) -} - -// 计算一个码表 -func (t *Text) RaceOne(di *Dict) *Result { - resArr := t.Race([]*Dict{di}, true) - return resArr[0] -} - -type wcIdx struct { - idx int - wordSli []string - codeSli []string -} - -// 一篇文章计算多个码表,是否输出 -func (t *Text) Race(dicts []*Dict, output bool) []*Result { - resArr := make([]*Result, len(dicts)) - for i := range dicts { - resArr[i] = newResult() - resArr[i].TextName = t.Name - resArr[i].DictName = dicts[i].Name - resArr[i].DictLen = dicts[i].length - resArr[i].Single = dicts[i].Single - } - brd := bufio.NewReader(t.reader) - - var wg sync.WaitGroup - var lock sync.Mutex - ch := make(chan struct{}, 16) - for idx := 0; ; idx++ { - text, err := SplitStep(brd, t.bufLen) - for i := range dicts { - if dicts[i].length < 100 { - continue - } - wg.Add(1) - ch <- struct{}{} - go func(text []byte, i, idx int) { - defer wg.Done() - mRes := match(text, dicts[i]) - mRes.dictIdx = i - // 加锁操作 - lock.Lock() - resArr[i].append(mRes, dicts[i], i) - lock.Unlock() - <-ch - }(text, i, idx) - } - if err != nil { - break - } - } - wg.Wait() - for i := range dicts { - resArr[i].stat() - if output { - resArr[i].OutputSplit(dicts[i]) - resArr[i].OutputStat(dicts[i]) - } - } - return resArr -} - -// 多篇文章计算多个码表,回调函数针对每篇文章生成的结果列表 -func Parallel(texts []string, dicts []*Dict, callback func([]*Result)) { - var wg sync.WaitGroup - ch := make(chan struct{}, 16) - for i, text := range texts { - ch <- struct{}{} - wg.Add(1) - go func(text string, i int) { - t := &Text{} - t.Load(text) - res := t.Race(dicts, true) - callback(res) - <-ch - wg.Done() - }(text, i) - } - wg.Wait() -} - -// 多篇文章计算多个码表,合并同一个码表的多个结果 -func ParallelMerge(texts []string, dicts []*Dict) []*Result { - resArr := make([]*Result, len(dicts)) - // 合并结果不会输出分词 - for i, dict := range dicts { - resArr[i] = newResult() - resArr[i].TextName = "总计" - resArr[i].DictName = dict.Name - resArr[i].DictLen = dict.length - resArr[i].Single = dict.Single - dict.Split = false - } - var wg sync.WaitGroup - var lock sync.Mutex - ch := make(chan struct{}, 16) - for i, text := range texts { - ch <- struct{}{} - wg.Add(1) - go func(text string, i int) { - t := &Text{} - t.Load(text) - res := t.Race(dicts, false) - lock.Lock() - mergeRes(resArr, res, dicts) - lock.Unlock() - <-ch - wg.Done() - }(text, i) - } - wg.Wait() - for i, res := range resArr { - res.stat() - res.OutputStat(dicts[i]) - } - return resArr -} diff --git a/pkg/smq/smq_test.go b/pkg/smq/smq_test.go new file mode 100644 index 0000000..95c6b68 --- /dev/null +++ b/pkg/smq/smq_test.go @@ -0,0 +1,60 @@ +package smq + +import ( + "fmt" + "testing" + "time" + + "github.com/nopdan/gosmq/pkg/dict" + "github.com/nopdan/gosmq/pkg/text" +) + +func TestPuncts(t *testing.T) { + for k, v := range zhKeysMap { + fmt.Printf("%s\t%s\n", string(k), v) + } + fmt.Println(enKeysMap) +} + +func TestSmq(t *testing.T) { + now := time.Now() + + // s := New(WithSplit()) + s := New() + + t1 := text.New("testdict", text.WithPath(`D:\Code\go\gosmq\build\dict\091点儿2023春.txt`)) + d := dict.New(t1) + s.AddDict(d) + fmt.Printf("载入码表耗时: %v\n", time.Since(now)) + + // t1 = text.New("test", text.WithPath(`D:\Code\go\gosmq\build\text\心情决定事情.txt`)) + t1 = text.New("test", text.WithPath(`D:\Code\go\gosmq\build\text\《红楼梦》-曹雪芹.txt`)) + s.AddText(t1) + + res := s.Race() + fmt.Println(res) + + fmt.Printf("耗时: %v\n", time.Since(now)) +} + +func BenchmarkSmq(b *testing.B) { + s := New() + + for i := 0; i < b.N; i++ { + t1 := text.New("testdict", text.WithPath(`D:\Code\go\gosmq\build\dict\091点儿2023春.txt`)) + d := dict.New(t1) + s.AddDict(d) + } + + // t1 := text.New("testdict", text.WithPath(`D:\Code\go\gosmq\build\dict\091点儿2023春.txt`)) + // d := dict.New(t1) + // s.AddDict(d) + + t2 := text.New("test", text.WithPath(`D:\Code\go\gosmq\build\text\心情决定事情.txt`)) + s.AddText(t2) + t3 := text.New("test", text.WithPath(`D:\Code\go\gosmq\build\text\《红楼梦》-曹雪芹.txt`)) + s.AddText(t3) + + res := s.Race() + _ = res +} diff --git a/pkg/smq/some_test.go b/pkg/smq/some_test.go deleted file mode 100644 index f975af2..0000000 --- a/pkg/smq/some_test.go +++ /dev/null @@ -1,13 +0,0 @@ -package smq - -import ( - "fmt" - "testing" -) - -func TestPuncts(t *testing.T) { - for k, v := range zhKeysMap { - fmt.Printf("%s\t%s\n", string(k), v) - } - fmt.Println(enKeysMap) -} diff --git a/pkg/text/text.go b/pkg/text/text.go index 610b9dd..688d3dc 100644 --- a/pkg/text/text.go +++ b/pkg/text/text.go @@ -25,6 +25,7 @@ type Text struct { plainText string Reader *bufio.Reader + Size int // 文件大小 } type TextOption func(*Text) @@ -70,8 +71,9 @@ func WithText(text string) TextOption { } func (t *Text) init() error { - foo := func(rd io.Reader) { + foo := func(rd io.Reader, size int) { t.Reader = bufio.NewReaderSize(rd, 32*1024) + t.Size = size } switch t.source { case "local": @@ -80,20 +82,22 @@ func (t *Text) init() error { if err != nil { return fmt.Errorf("text.New(): %w", err) } - foo(f) + fi, _ := f.Stat() + rd := util.ConvertReader(f) + foo(rd, int(fi.Size())) case "upload": if len(t.data) == 0 { return fmt.Errorf("text.New(): data is empty") } brd := bytes.NewReader(t.data) rd := util.ConvertReader(brd) - foo(rd) + foo(rd, len(t.data)) case "clipboard": if len(t.plainText) == 0 { return fmt.Errorf("text.New(): plainText is empty") } rd := strings.NewReader(t.plainText) - foo(rd) + foo(rd, len(t.plainText)) } return nil } diff --git a/pkg/util/tsv.go b/pkg/util/tsv.go deleted file mode 100644 index ac99abd..0000000 --- a/pkg/util/tsv.go +++ /dev/null @@ -1,39 +0,0 @@ -package util - -import ( - "bufio" - "io" - "strings" -) - -type tsv struct { - *bufio.Scanner -} - -func NewTSV(rd io.Reader) *tsv { - return &tsv{ - Scanner: bufio.NewScanner(rd), - } -} - -// 读取一行 -func (t *tsv) ReadLine() (string, error) { - if ok := t.Scan(); !ok { - if t.Err() == nil { - return "", io.EOF - } - return "", t.Err() - } - return t.Text(), nil -} - -// 读取文件的一行,按 sep 分隔返回切片 -func (t *tsv) Read(sep string) ([]string, error) { - if ok := t.Scan(); !ok { - if t.Err() == nil { - return nil, io.EOF - } - return nil, t.Err() - } - return strings.Split(t.Text(), sep), nil -}