Skip to content

Commit

Permalink
perf: 降低内存占用
Browse files Browse the repository at this point in the history
  • Loading branch information
nopdan committed Mar 11, 2024
1 parent 4702e8d commit b209a1f
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 56 deletions.
1 change: 1 addition & 0 deletions cmd/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,5 @@ func (c *Config) convert() {
Overwrite: c.Overwrite,
}
smq.AddDict(dict)
smq.OnBeforeRace()
}
2 changes: 2 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import (
)

func main() {
// defer profile.Start(profile.MemProfile, profile.MemProfileRate(1)).Stop()
// defer profile.Start().Stop()
_ = os.MkdirAll("dict", os.ModePerm)
_ = os.MkdirAll("text", os.ModePerm)
if len(os.Args) < 2 {
Expand Down
5 changes: 4 additions & 1 deletion pkg/data/split.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ func (t *Text) Iter() ([]byte, error) {
if t.reader == nil {
return nil, io.EOF
}
buffer := make([]byte, 32*1024, 36*1024)
if t.size < t.bufSize {
return io.ReadAll(t.reader)
}
buffer := make([]byte, t.bufSize, t.bufSize+4*1024)
n, _ := io.ReadFull(t.reader, buffer)
buffer = buffer[:n]

Expand Down
124 changes: 99 additions & 25 deletions pkg/data/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@ import (
"io"
"os"
"path/filepath"
"runtime"
"strings"

"github.com/nopdan/gosmq/pkg/util"
"github.com/gogs/chardet"
"golang.org/x/net/html/charset"
)

// NUM_CPU is the number of logical CPUs (hardware threads), as reported by
// runtime.NumCPU when the package is initialized.
var NUM_CPU = runtime.NumCPU()

type Text struct {
// 自定义名字
Name string
Expand All @@ -21,40 +26,24 @@ type Text struct {
// 剪切板文本
String string

reader *bufio.Reader
size int // 文件大小
IsInit bool // 是否已经初始化
reader *bufio.Reader
size int // 文件大小
IsInit bool // 是否已经初始化
bufSize int
}

func (t *Text) Init() {
if t.IsInit {
logger.Debug("文本已经初始化过了", "name", t.Name)
return
}
foo := func(rd io.Reader, size int) {
t.reader = bufio.NewReaderSize(rd, 32*1024)
t.size = size
}
if len(t.Path) > 0 {
f, err := os.Open(t.Path)
if err != nil {
logger.Warn("文本初始化失败", "path", t.Path, "error", err)
return
}
fi, _ := f.Stat()
if len(t.Name) == 0 {
t.Name = fi.Name()
t.Name = strings.TrimSuffix(t.Name, filepath.Ext(t.Name))
}
rd := util.ConvertReader(f)
foo(rd, int(fi.Size()))
t.loadFile()
} else if len(t.Bytes) > 0 {
brd := bytes.NewReader(t.Bytes)
rd := util.ConvertReader(brd)
foo(rd, len(t.Bytes))
t.loadBytes()
} else if len(t.String) > 0 {
rd := strings.NewReader(t.String)
foo(rd, len(t.String))
t.determineBufSize(len(t.String))
t.reader = bufio.NewReader(strings.NewReader(t.String))
} else {
logger.Warn("文本初始化失败", "name", t.Name)
return
Expand All @@ -66,6 +55,91 @@ func (t *Text) Init() {
return
}

// determineBufSize stores size in t.size and picks t.bufSize, the chunk size
// used when reading the text, from one of four tiers. The two largest tiers
// are only selected once the input exceeds NUM_CPU chunks of that size.
func (t *Text) determineBufSize(size int) {
	const kib = 1024

	t.size = size
	switch {
	case size > NUM_CPU*256*kib:
		t.bufSize = 256 * kib
	case size > NUM_CPU*64*kib:
		t.bufSize = 64 * kib
	case size > 16*kib:
		t.bufSize = 16 * kib
	default:
		// Same value as bufio's default buffer size.
		t.bufSize = 4 * kib
	}
}

// loadFile opens t.Path and points t.reader at its contents.
//
// It derives t.Name from the file name (extension stripped) when no name was
// given, sizes the read buffer via determineBufSize, samples up to the first
// 1 KiB for charset detection, and wraps the file in a decoder when detect
// reports a non-UTF-8 charset. On any failure a warning is logged and
// t.reader is left untouched.
//
// The file handle is deliberately not closed on success because t.reader
// reads from it lazily.
// NOTE(review): nothing visible here closes it later — confirm ownership.
func (t *Text) loadFile() {
	f, err := os.Open(t.Path)
	if err != nil {
		logger.Warn("文本初始化失败", "path", t.Path, "error", err)
		return
	}
	fi, err := f.Stat()
	if err != nil {
		// Without file info we would dereference a nil FileInfo below.
		logger.Warn("文本初始化失败", "path", t.Path, "error", err)
		_ = f.Close()
		return
	}
	if len(t.Name) == 0 {
		name := fi.Name()
		t.Name = strings.TrimSuffix(name, filepath.Ext(name))
	}
	t.determineBufSize(int(fi.Size()))

	// Sample the head of the file for charset detection. A short read is the
	// normal case for files under 1 KiB, so the error is ignored on purpose;
	// the sample is trimmed to the bytes actually read so the detector never
	// sees NUL padding.
	buf := make([]byte, 1024)
	n, _ := io.ReadFull(f, buf)
	buf = buf[:n]
	if _, err := f.Seek(0, io.SeekStart); err != nil {
		// If we cannot rewind, the sampled bytes would be silently lost.
		logger.Warn("文本初始化失败", "path", t.Path, "error", err)
		_ = f.Close()
		return
	}

	// detect sets t.reader itself and returns nil when no transcoding is
	// needed.
	cs := t.detect(buf, f)
	if cs == nil {
		return
	}

	// Transcode to UTF-8.
	brd := bufio.NewReaderSize(f, t.bufSize)
	rd, err := charset.NewReaderLabel(cs.Charset, brd)
	if err != nil {
		// Unknown charset label: read the raw bytes instead of wrapping a nil
		// reader, which would panic on the first read.
		logger.Warn("不支持的编码格式,按原始字节读取", "path", t.Path, "charset", cs.Charset, "error", err)
		t.reader = brd
		return
	}
	t.reader = bufio.NewReader(rd)
}

// loadBytes points t.reader at the in-memory contents of t.Bytes.
//
// It sizes the read buffer via determineBufSize, samples up to the first
// 1 KiB for charset detection, and wraps the data in a decoder when detect
// reports a non-UTF-8 charset.
func (t *Text) loadBytes() {
	t.determineBufSize(len(t.Bytes))

	brd := bytes.NewReader(t.Bytes)

	// Copy the head of the data for charset detection and trim the sample to
	// the bytes actually read, so inputs shorter than 1 KiB are not padded
	// with NULs. An error here only means the input is empty.
	buf := make([]byte, 1024)
	n, _ := brd.Read(buf)
	buf = buf[:n]
	// Rewinding a bytes.Reader to offset 0 cannot fail.
	_, _ = brd.Seek(0, io.SeekStart)

	// detect sets t.reader itself and returns nil when no transcoding is
	// needed.
	cs := t.detect(buf, brd)
	if cs == nil {
		return
	}

	// Transcode to UTF-8.
	rd, err := charset.NewReaderLabel(cs.Charset, brd)
	if err != nil {
		// Unknown charset label: fall back to the raw bytes instead of
		// wrapping a nil reader, which would panic on the first read.
		t.reader = bufio.NewReaderSize(brd, t.bufSize)
		return
	}
	t.reader = bufio.NewReader(rd)
}

// detect guesses the charset of input from buf, a sample of its leading
// bytes, and skips a leading byte-order mark on input when buf starts with
// one.
//
// It returns nil when no further processing is needed: either detection
// failed or the text is UTF-8. In both cases t.reader has already been set to
// a buffered reader over input. Otherwise it returns the detection result and
// the caller must wrap input in a decoder for cs.Charset.
//
// input must be positioned at the start of the data that buf was sampled
// from.
func (t *Text) detect(buf []byte, input io.Reader) *chardet.Result {
	cs, err := chardet.NewTextDetector().DetectBest(buf)
	if err != nil {
		t.reader = bufio.NewReaderSize(input, t.bufSize)
		return nil
	}
	// Any non-UTF-8 guess below full confidence is treated as GB18030.
	if cs.Confidence != 100 && cs.Charset != "UTF-8" {
		cs.Charset = "GB18030"
	}

	// Strip the BOM for the charsets that define one.
	var bom []byte
	switch cs.Charset {
	case "UTF-16BE":
		bom = []byte{0xfe, 0xff}
	case "UTF-16LE":
		bom = []byte{0xff, 0xfe}
	case "UTF-8":
		bom = []byte{0xef, 0xbb, 0xbf}
	}
	if len(bom) > 0 && bytes.HasPrefix(buf, bom) {
		// A single Read may return fewer bytes than requested; ReadFull keeps
		// reading until the whole BOM is consumed. buf already proved these
		// bytes exist, so an error here can only be an I/O failure, which
		// later reads will report.
		_, _ = io.ReadFull(input, bom)
	}

	if cs.Charset == "UTF-8" {
		t.reader = bufio.NewReaderSize(input, t.bufSize)
		return nil
	}
	return cs
}

// 重新初始化文本
func (t *Text) ReInit() {
t.reader = nil
Expand Down
10 changes: 7 additions & 3 deletions pkg/result/match_res.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package result

import (
"sync"

"github.com/nopdan/gosmq/pkg/util"
)

Expand All @@ -24,9 +26,6 @@ type segment struct {

// 匹配一段文字得到的信息
type MatchRes struct {
TextIdx int // 文章索引
DictIdx int // 码表索引

PartIdx int // 分段索引
Segment []WordCode
segments []segment
Expand All @@ -39,6 +38,8 @@ type MatchRes struct {
Commit commit
Char char
Pair pair

lock sync.Mutex
}

type dist struct {
Expand Down Expand Up @@ -103,6 +104,9 @@ func NewMatchRes() *MatchRes {

// 将每次匹配得到的信息追加到总结果
func (m *MatchRes) Combine(mRes *MatchRes) {
m.lock.Lock()
defer m.lock.Unlock()

// 第一个 MatchRes 为总结果
if len(m.segments) == 0 {
m.segments = append(m.segments, segment{m.PartIdx, m.Segment})
Expand Down
54 changes: 27 additions & 27 deletions pkg/smq/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,59 +68,59 @@ func (c *Config) AddDict(dictList ...*data.Dict) {
}
}

func (c *Config) Race() [][]*result.Result {
// OnBeforeRace blocks until c.wg is done.
//
// Call it directly when only converting dictionaries; a normal race does not
// need to, because Race invokes it as its first step.
// NOTE(review): c.wg is presumably a sync.WaitGroup tracking work started by
// AddDict — confirm against the Config definition.
func (c *Config) OnBeforeRace() {
c.wg.Wait()
}

func (c *Config) Race() [][]*result.Result {
c.OnBeforeRace()
if len(c.textList) == 0 || len(c.dictList) == 0 {
logger.Warn("文本或码表为空", "text", len(c.textList), "dict", len(c.dictList))
return nil
}
logger.Info("开始赛码...", "文本", len(c.textList), "码表", len(c.dictList))
now := time.Now()
// 限制并发数量
ch := make(chan *result.MatchRes, NUM_CPU)
ch := make(chan struct{}, NUM_CPU)
var wg sync.WaitGroup

// 文章数量和码表数量
var tNum, dNum = len(c.textList), len(c.dictList)
mRes := make([][]*result.MatchRes, tNum)
for i := range tNum {
mRes[i] = make([]*result.MatchRes, dNum)
for j := range dNum {
mRes[i][j] = result.NewMatchRes()
}
}
for i, text := range c.textList {
// 分段计算当前文章,pIdx 为每一段的索引
pIdx := -1
for {
text, err := text.Iter()
if len(text) == 0 {
break
}
pIdx++
for j, dict := range c.dictList {
wg.Add(1)
ch <- struct{}{}
go func(i, j, pIdx int) {
defer wg.Done()
mRes := c.match(text, dict)
mRes.TextIdx = i
mRes.DictIdx = j
mRes.PartIdx = pIdx
ch <- mRes
m := c.match(text, dict)
m.PartIdx = pIdx
mRes[i][j].Combine(m)
<-ch
}(i, j, pIdx)
}
if err != nil {
break
}
}
}

// 文章数量和码表数量
var tNum, dNum = len(c.textList), len(c.dictList)
mRes := make([][]*result.MatchRes, tNum)
for i := range tNum {
mRes[i] = make([]*result.MatchRes, dNum)
for j := range dNum {
mRes[i][j] = result.NewMatchRes()
}
}

go func() {
wg.Wait()
close(ch)
}()

// 循环从 ch 通道中接受值
for part := range ch {
mRes[part.TextIdx][part.DictIdx].Combine(part)
}
wg.Wait()
close(ch)

if c.Merge {
for j := range dNum {
Expand Down

0 comments on commit b209a1f

Please sign in to comment.