Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace stdlib CSV reader with simpler detector #553

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 96 additions & 17 deletions internal/magic/text_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@ package magic

import (
"bytes"
"encoding/csv"
"errors"
"io"
)

const (
// svLineLimit caps how many line breaks sv counts before it stops scanning.
svLineLimit = 10
// quote is the byte sv treats as opening and closing a quoted section.
quote = '"'
// comment is the byte that, outside quotes, makes sv ignore the rest of the line.
comment = '#'
)

// Csv matches a comma-separated values file.
func Csv(raw []byte, limit uint32) bool {
return sv(raw, ',', limit)
Expand All @@ -17,26 +21,101 @@ func Tsv(raw []byte, limit uint32) bool {
return sv(raw, '\t', limit)
}

// NOTE(review): this span is a rendered diff, so two versions of sv are
// interleaved. The lines that use csv.NewReader (this header down to the
// errors.Is check, plus the later `if err != nil { return false }`, `lines++`
// and `return r.FieldsPerRecord ...` lines) look like the removed
// implementation — confirm against the raw patch before compiling.
func sv(in []byte, comma rune, limit uint32) bool {
r := csv.NewReader(bytes.NewReader(dropLastLine(in, limit)))
r.Comma = comma
r.ReuseRecord = true
r.LazyQuotes = true
r.Comment = '#'

lines := 0
for {
_, err := r.Read()
if errors.Is(err, io.EOF) {
// sv reports whether raw looks like a delimiter-separated values file.
//
// It reads the bytes supplied by prepSvReader in 1024-byte chunks and counts
// delimiters per line, ignoring anything inside quotes or after a comment
// byte. It returns true only when every line that recorded a delimiter has the
// same field count, that count is greater than one, and at least one line
// break was counted outside quotes.
func sv(raw []byte, delimiter byte, limit uint32) bool {
reader := prepSvReader(raw, limit)

isWithinQuote := false
isWithinComment := false
lineIdx := 0
// recordFields maps a line index to its field count; only lines that contain
// at least one delimiter outside quotes/comments get an entry.
recordFields := make(map[int]int)

buf := make([]byte, 1024)
n, err := reader.Read(buf)

var prev, cur, next byte
loop:
for err == nil {
for i := 0; i < n; i++ {
cur = buf[i]

// prev and next are taken from the current chunk only; at either edge of the
// chunk they are zero.
// NOTE(review): because of this, a "\r\n" pair or a doubled quote split
// across two Read calls is evaluated without its neighbour — confirm this is
// acceptable for inputs longer than one chunk.
if i > 0 {
prev = buf[i-1]
} else {
prev = byte(0)
}

if i < n-1 {
next = buf[i+1]
} else {
next = byte(0)
}

// && binds tighter than ||: '\r' always counts as a line break, while '\n'
// counts only when it is not preceded by '\r', is not the last byte of the
// chunk, and is not followed by another '\n' (a run of '\n' yields one break).
// NOTE(review): a '\n' that happens to be the last byte of a chunk therefore
// is not counted as a line break — verify for inputs spanning several chunks.
isNewline := cur == '\n' && prev != '\r' && next != byte(0) && next != '\n' || cur == '\r'

switch {
case cur == quote:
// Outside a comment, a quote toggles the quoted state unless we are already
// inside quotes and the next byte is also a quote; in that case the pair is
// skipped by advancing i past the second quote.
// NOTE(review): the else branch also runs for every quote seen inside a
// comment, so the byte after it is skipped even if it is a line break —
// confirm this is intended.
if (!isWithinQuote || next != quote) && !isWithinComment {
isWithinQuote = !isWithinQuote
} else {
i++
}

case isNewline && !isWithinQuote:
// Stop scanning once svLineLimit line breaks have been counted.
if lineIdx >= svLineLimit {
break loop
}
_, ok := recordFields[lineIdx]
if !isWithinComment && !ok {
// this should have been a csv line, but we saw content without a delimiter that was not in a comment
return false
}
lineIdx++
// A comment lasts only until the end of its line.
isWithinComment = false

case !isWithinQuote && !isWithinComment:
switch cur {
case comment:
isWithinComment = true

case delimiter:
// The first delimiter on a line separates two fields, so seed the count with
// 1 before incrementing.
if recordFields[lineIdx] == 0 {
recordFields[lineIdx] = 1
}
recordFields[lineIdx]++
}
}

}

n, err = reader.Read(buf)
}

// Use the field count of any recorded line as the reference value; the loop
// further down requires every recorded line to match it.
var fieldCount int
for _, fields := range recordFields {
if fields > 0 {
fieldCount = fields
break
}
if err != nil {
return false
}

var badFieldCount bool
for _, fields := range recordFields {
if fields != fieldCount {
badFieldCount = true
break
}
lines++
}

return r.FieldsPerRecord > 1 && lines > 1
return !badFieldCount && fieldCount > 1 && lineIdx > 0
}

// prepSvReader wraps in for scanning by sv. The input is first passed through
// dropLastLine with the same limit; when limit is non-zero the resulting
// reader is additionally capped at limit bytes, otherwise it is returned
// uncapped.
func prepSvReader(in []byte, limit uint32) io.Reader {
	trimmed := bytes.NewReader(dropLastLine(in, limit))
	if limit == 0 {
		// A zero limit means "no cap": hand back the plain bytes reader.
		return trimmed
	}
	return io.LimitReader(trimmed, int64(limit))
}

// dropLastLine drops the last incomplete line from b.
Expand Down
217 changes: 217 additions & 0 deletions internal/magic/text_csv_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package magic

import (
"io"
"reflect"
"testing"
)

// TestCsv checks that Csv accepts a minimal two-line comma-separated input.
func TestCsv(t *testing.T) {
	type testCase struct {
		name  string
		input string
		limit uint32
		want  bool
	}

	cases := []testCase{
		{name: "csv multiple lines", input: "a,b,c\n1,2,3", want: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Csv([]byte(tc.input), tc.limit)
			if got == tc.want {
				return
			}
			t.Errorf("Csv() = %v, want %v", got, tc.want)
		})
	}
}

// TestTsv checks that Tsv accepts a minimal two-line tab-separated input.
func TestTsv(t *testing.T) {
	tests := []struct {
		name  string
		input string
		limit uint32
		want  bool
	}{
		{
			name:  "tsv multiple lines",
			input: "a\tb\tc\n1\t2\t3",
			want:  true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := Tsv([]byte(tt.input), tt.limit); got != tt.want {
				// Name the function actually under test so a failure is not
				// mistaken for a Csv regression.
				t.Errorf("Tsv() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestSv exercises sv directly with a table of inputs covering empty input,
// single and multiple lines, read limits, incomplete last lines, quoting,
// comments, and empty lines.
func TestSv(t *testing.T) {
	tests := []struct {
		name      string
		delimiter byte
		input     string
		limit     uint32
		want      bool
	}{
		{
			name:      "empty",
			delimiter: ',',
			input:     "",
			want:      false,
		},
		{
			name:      "csv single line",
			delimiter: ',',
			input:     "a,b,c",
			want:      false,
		},
		{
			name:      "csv multiple lines",
			delimiter: ',',
			input:     "a,b,c\n1,2,3",
			want:      true,
		},
		{
			name:      "csv with spaces",
			delimiter: ',',
			input:     " a ,\t\tb, c\n1, 2 , 3 ",
			want:      true,
		},
		{
			name:      "csv multiple lines under limit",
			delimiter: ',',
			input:     "a,b,c\n1,2,3\n4,5,6",
			limit:     10,
			want:      true,
		},
		{
			name:      "csv multiple lines over limit",
			delimiter: ',',
			input:     "a,b,c\n1,2,3\n4,5,6",
			limit:     1,
			want:      false,
		},
		{
			name:      "csv 2 line with incomplete last line",
			delimiter: ',',
			input:     "a,b,c\n1,2",
			want:      false,
		},
		{
			name:      "csv 3 line with incomplete last line",
			delimiter: ',',
			input:     "a,b,c\na,b,c\n1,2",
			limit:     10,
			want:      true,
		},
		{
			name:      "within quotes",
			delimiter: ',',
			input:     "\"a,b,c\n1,2,3\n4,5,6\"",
			want:      false,
		},
		{
			name:      "partial quotes",
			delimiter: ',',
			input:     "\"a,b,c\n1,2,3\n4,5,6",
			want:      false,
		},
		{
			name:      "has quotes",
			delimiter: ',',
			input:     "\"a\",\"b\",\"c\"\n1,\",\"2,3\n\"4\",5,6",
			want:      true,
		},
		{
			name:      "comma within quotes",
			delimiter: ',',
			input:     "\"a,b\",\"c\"\n1,2,3\n\"4\",5,6",
			want:      false,
		},
		{
			name:      "ignore comments",
			delimiter: ',',
			input:     "#a,b,c\n#1,2,3",
			want:      false,
		},
		{
			name:      "multiple comments at the end of line",
			delimiter: ',',
			input:     "a,b#,c\n1,2#,3",
			want:      true,
		},
		{
			name:      "a non csv line within a csv file",
			delimiter: ',',
			input:     "#comment\nsomething else\na,b,c\n1,2,3",
			want:      false,
		},
		{
			name:      "mixing comments and csv lines",
			delimiter: ',',
			input:     "#comment\na,b,c\n#something else\n1,2,3",
			want:      true,
		},
		{
			name:      "ignore empty lines",
			delimiter: ',',
			input:     "#comment\na,b,c\n\n\n#something else\n1,2,3",
			want:      true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := sv([]byte(tt.input), tt.delimiter, tt.limit); got != tt.want {
				// Report the function actually called; these cases go through sv,
				// not the Csv wrapper.
				t.Errorf("sv() = %v, want %v", got, tt.want)
			}
		})
	}
}

// Test_prepSvReader reads everything from the reader returned by prepSvReader
// and compares it with the expected text for three cases: no limit, a limit
// shorter than the input, and an input one byte longer than the limit whose
// trailing partial line is expected to be dropped.
func Test_prepSvReader(t *testing.T) {
	type testCase struct {
		name  string
		input string
		limit uint32
		want  string
	}

	cases := []testCase{
		{name: "multiple lines", input: "a,b,c\n1,2,3", limit: 0, want: "a,b,c\n1,2,3"},
		{name: "limit", input: "a,b,c\n1,2,3", limit: 5, want: "a,b,c"},
		{name: "drop last line", input: "a,b,c\na,b,c\na,b,c\n1,2", limit: 20, want: "a,b,c\na,b,c\na,b,c"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			data, err := io.ReadAll(prepSvReader([]byte(tc.input), tc.limit))
			if err != nil {
				t.Fatalf("prepSvReader() error = %v", err)
			}

			got := string(data)
			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("prepSvReader() = '%v', want '%v'", got, tc.want)
		})
	}
}