Skip to content

Commit

Permalink
improve performance for text detection (#532)
Browse files Browse the repository at this point in the history
This commit improves memory allocations inside nd-json by using a
scanLines function that works with already allocated buffer. Previous
version of code was using bufio.Scanner which allocates his own copy of
the buffer.

before:
BenchmarkText/application/x-ndjson-8           	  663314	      2027 ns/op	    4306 B/op	       6 allocs/op
after:
BenchmarkText/application/x-ndjson-8           	 1930292	       678.6 ns/op	     160 B/op	       4 allocs/op
  • Loading branch information
gabriel-vasile authored May 23, 2024
1 parent bc511b8 commit ff4d3d0
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 45 deletions.
88 changes: 85 additions & 3 deletions internal/magic/magic_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package magic

import (
"io"
"bufio"
"strings"
"testing"
)

Expand Down Expand Up @@ -86,6 +87,88 @@ func TestMagic(t *testing.T) {
}
}

func TestScanLine(t *testing.T) {
tcases := []struct {
name string
input string
expected []string
}{{
name: "empty input",
input: "",
expected: nil,
}, {
name: "single line, no terminal nl",
input: "1",
expected: []string{"1"},
}, {
name: "single line, terminal nl",
input: "1\n",
expected: []string{"1"},
}, {
name: "two lines, no terminal nl",
input: "1\n2",
expected: []string{"1", "2"},
}, {
name: "two lines, with terminal nl",
input: "1\n2\n",
expected: []string{"1", "2"},
}, {
name: "drops final cr",
input: "1\n2\r",
expected: []string{"1", "2"},
}, {
name: "final empty line",
input: "1\n2\n\n",
expected: []string{"1", "2", ""},
}, {
name: "empty line with cr",
input: "1\n2\n\r",
expected: []string{"1", "2", ""},
}, {
name: "nd-json numbers and object",
input: "1\n2\n3\n{}",
expected: []string{"1", "2", "3", "{}"},
},
}

for _, tt := range tcases {
t.Run(tt.name, func(t *testing.T) {
testScanLine(t, tt.input, tt.expected)
testScanLineLikeBufioScanner(t, tt.input)
})
}
}

func testScanLine(t *testing.T, text string, expectedLines []string) {
var l []byte
i, raw := 0, []byte(text)
for i = 0; len(raw) != 0; i++ {
l, raw = scanLine(raw)
if string(l) != expectedLines[i] {
t.Errorf("expected %q, got %q", expectedLines[i], l)
}
}
if i != len(expectedLines) {
t.Errorf("expected %d lines, got %d lines", len(expectedLines), i)
}
}

// Test that scanLine behaves exactly like bufio.Scanner.
func testScanLineLikeBufioScanner(t *testing.T, text string) {
var l []byte
raw := []byte(text)
s := bufio.NewScanner(strings.NewReader(text))
for lineNum := 0; s.Scan(); lineNum++ {
l, raw = scanLine(raw)
if string(l) != s.Text() {
t.Errorf("expected: %q, got: %q", s.Text(), string(l))
}
}
if err := s.Err(); err != nil {
t.Error(err)
}
}

func TestDropLastLine(t *testing.T) {
dropTests := []struct {
raw string
Expand All @@ -105,8 +188,7 @@ func TestDropLastLine(t *testing.T) {
{"\nå\n", 5, "\nå\n"},
}
for i, tt := range dropTests {
gotR := dropLastLine([]byte(tt.raw), tt.cutAt)
got, _ := io.ReadAll(gotR)
got := dropLastLine([]byte(tt.raw), tt.cutAt)
if got := string(got); got != tt.res {
t.Errorf("dropLastLine %d error: expected %q; got %q", i, tt.res, got)
}
Expand Down
44 changes: 18 additions & 26 deletions internal/magic/text.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package magic

import (
"bufio"
"bytes"
"strings"
"time"
Expand Down Expand Up @@ -234,9 +233,10 @@ func GeoJSON(raw []byte, limit uint32) bool {
// types.
func NdJSON(raw []byte, limit uint32) bool {
lCount, hasObjOrArr := 0, false
sc := bufio.NewScanner(dropLastLine(raw, limit))
for sc.Scan() {
l := sc.Bytes()
raw = dropLastLine(raw, limit)
var l []byte
for len(raw) != 0 {
l, raw = scanLine(raw)
// Empty lines are allowed in NDJSON.
if l = trimRWS(trimLWS(l)); len(l) == 0 {
continue
Expand Down Expand Up @@ -301,21 +301,14 @@ func Svg(raw []byte, limit uint32) bool {
}

// Srt matches a SubRip file.
func Srt(in []byte, _ uint32) bool {
line, in, found := scanLine(in)
if !found {
return false
}
func Srt(raw []byte, _ uint32) bool {
line, raw := scanLine(raw)

// First line must be 1.
if string(line) != "1" {
return false
}
line, in, found = scanLine(in)
if !found {
return false
}

line, raw = scanLine(raw)
secondLine := string(line)
// Timestamp format (e.g: 00:02:16,612 --> 00:02:19,376) limits secondLine
// length to exactly 29 characters.
Expand Down Expand Up @@ -345,9 +338,9 @@ func Srt(in []byte, _ uint32) bool {
return false
}

line, _, found = scanLine(in)
line, _ = scanLine(raw)
// A third line must exist and not be empty. This is the actual subtitle text.
return found && len(line) != 0
return len(line) != 0
}

// Vtt matches a Web Video Text Tracks (WebVTT) file. See
Expand Down Expand Up @@ -375,15 +368,14 @@ func Vtt(raw []byte, limit uint32) bool {
bytes.Equal(raw, []byte{0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) // "WEBVTT"
}

func scanLine(in []byte) (line, remainder []byte, found bool) {
line, remainder, found = bytes.Cut(in, []byte("\n"))
if !found {
return
}

// Drop off any \r before \n.
if lenLine := len(line); lenLine > 0 && line[lenLine-1] == '\r' {
line = line[:lenLine-1]
// dropCR drops a terminal \r from the data.
func dropCR(data []byte) []byte {
if len(data) > 0 && data[len(data)-1] == '\r' {
return data[0 : len(data)-1]
}
return
return data
}
func scanLine(b []byte) (line, remainder []byte) {
line, remainder, _ = bytes.Cut(b, []byte("\n"))
return dropCR(line), remainder
}
22 changes: 8 additions & 14 deletions internal/magic/text_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func Tsv(raw []byte, limit uint32) bool {
}

func sv(in []byte, comma rune, limit uint32) bool {
r := csv.NewReader(dropLastLine(in, limit))
r := csv.NewReader(bytes.NewReader(dropLastLine(in, limit)))
r.Comma = comma
r.ReuseRecord = true
r.LazyQuotes = true
Expand All @@ -44,20 +44,14 @@ func sv(in []byte, comma rune, limit uint32) bool {
// mimetype limits itself to ReadLimit bytes when performing a detection.
// This means, for file formats like CSV for NDJSON, the last line of the input
// can be an incomplete line.
func dropLastLine(b []byte, cutAt uint32) io.Reader {
if cutAt == 0 {
return bytes.NewReader(b)
func dropLastLine(b []byte, readLimit uint32) []byte {
if readLimit == 0 || uint32(len(b)) < readLimit {
return b
}
if uint32(len(b)) >= cutAt {
for i := cutAt - 1; i > 0; i-- {
if b[i] == '\n' {
return bytes.NewReader(b[:i])
}
for i := len(b) - 1; i > 0; i-- {
if b[i] == '\n' {
return b[:i]
}

// No newline was found between the 0 index and cutAt.
return bytes.NewReader(b[:cutAt])
}

return bytes.NewReader(b)
return b
}
22 changes: 20 additions & 2 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ func TestConcurrent(t *testing.T) {
wg := sync.WaitGroup{}
wg.Add(4)

Extend(func([]byte, uint32) bool { return false }, "e", ".e")
go func() {
for i := 0; i < 1000; i++ {
Detect([]byte("text content"))
Expand All @@ -436,8 +437,7 @@ func TestConcurrent(t *testing.T) {
}()
go func() {
for i := 0; i < 1000; i++ {
Extend(func([]byte, uint32) bool { return false }, "e", ".e")
Lookup("text/plain").Extend(func([]byte, uint32) bool { return false }, "e", ".e")
Lookup("e").Extend(func([]byte, uint32) bool { return false }, "e", ".e")
}
wg.Done()
}()
Expand Down Expand Up @@ -495,6 +495,24 @@ func BenchmarkSliceRand(b *testing.B) {
})
}

func BenchmarkText(b *testing.B) {
r := rand.New(rand.NewSource(0))
data := make([]byte, defaultLimit)
if _, err := io.ReadFull(r, data); err != io.ErrUnexpectedEOF && err != nil {
b.Fatal(err)
}

for _, m := range text.children {
b.Run(m.String(), func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for n := 0; n < b.N; n++ {
m.detector(data, uint32(len(data)))
}
})
}
}

func BenchmarkAll(b *testing.B) {
r := rand.New(rand.NewSource(0))
data := make([]byte, defaultLimit)
Expand Down

0 comments on commit ff4d3d0

Please sign in to comment.