diff --git a/Makefile b/Makefile index b41f08b86..00d1d8192 100644 --- a/Makefile +++ b/Makefile @@ -60,7 +60,7 @@ checksuccess: $(VFSGENDEV_BIN): cd tools && $(GOBUILD) -o ../$(VFSGENDEV_BIN) github.com/shurcooL/vfsgen/cmd/vfsgendev -data_parsers: $(VFSGENDEV_BIN) lightning/mydump/parser_generated.go lightning/mydump/csv_parser_generated.go +data_parsers: $(VFSGENDEV_BIN) lightning/mydump/parser_generated.go PATH="$(GOPATH)/bin":"$(PATH)" protoc -I. -I"$(GOPATH)/src" lightning/checkpoints/file_checkpoints.proto --gogofaster_out=. $(VFSGENDEV_BIN) -source='"github.com/pingcap/tidb-lightning/lightning/web".Res' && mv res_vfsdata.go lightning/web/ diff --git a/lightning/mydump/csv_parser.go b/lightning/mydump/csv_parser.go index e74f9a478..2f81b4a3b 100644 --- a/lightning/mydump/csv_parser.go +++ b/lightning/mydump/csv_parser.go @@ -1,6 +1,20 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + package mydump import ( + "bytes" "io" "strings" @@ -10,21 +24,52 @@ import ( "github.com/pingcap/tidb/types" ) +var ( + errUnterminatedQuotedField = errors.NewNoStackError("syntax error: unterminated quoted field") + errDanglingBackslash = errors.NewNoStackError("syntax error: no character after backslash") + errUnexpectedQuoteField = errors.NewNoStackError("syntax error: cannot have consecutive fields without separator") +) + +// CSVParser is basically a copy of encoding/csv, but special-cased for MySQL-like input. type CSVParser struct { blockParser cfg *config.CSVConfig escFlavor backslashEscapeFlavor + + comma byte + quote byte + quoteStopSet string + unquoteStopSet string + + // recordBuffer holds the unescaped fields, one after another. + // The fields can be accessed by using the indexes in fieldIndexes. + // E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de` + // and fieldIndexes will contain the indexes [1, 2, 5, 6]. + recordBuffer []byte + + // fieldIndexes is an index of fields inside recordBuffer. + // The i'th field ends at offset fieldIndexes[i] in recordBuffer. + fieldIndexes []int } func NewCSVParser( cfg *config.CSVConfig, - reader io.Reader, + reader ReadSeekCloser, blockBufSize int64, ioWorkers *worker.Pool, ) *CSVParser { + quote := byte(0) + if len(cfg.Delimiter) > 0 { + quote = cfg.Delimiter[0] + } + escFlavor := backslashEscapeFlavorNone + quoteStopSet := cfg.Delimiter + unquoteStopSet := "\r\n" + cfg.Separator + cfg.Delimiter if cfg.BackslashEscape { escFlavor = backslashEscapeFlavorMySQL + quoteStopSet += `\` + unquoteStopSet += `\` // we need special treatment of the NULL value \N, used by MySQL. 
if !cfg.NotNull && cfg.Null == `\N` { escFlavor = backslashEscapeFlavorMySQLWithNull @@ -32,143 +77,287 @@ func NewCSVParser( } return &CSVParser{ - blockParser: makeBlockParser(reader, blockBufSize, ioWorkers), - cfg: cfg, - escFlavor: escFlavor, + blockParser: makeBlockParser(reader, blockBufSize, ioWorkers), + cfg: cfg, + comma: cfg.Separator[0], + quote: quote, + escFlavor: escFlavor, + quoteStopSet: quoteStopSet, + unquoteStopSet: unquoteStopSet, } } -type csvToken byte - -const ( - csvTokNil csvToken = iota - csvTokSep - csvTokNewLine - csvTokField -) - -func (parser *CSVParser) appendEmptyValues(sepCount int) { - var datum types.Datum - if !parser.cfg.NotNull && parser.cfg.Null == "" { - datum.SetNull() - } else { - datum.SetString("") - } - for i := 0; i < sepCount; i++ { - parser.lastRow.Row = append(parser.lastRow.Row, datum) +func (parser *CSVParser) unescapeString(input string) (unescaped string, isNull bool) { + if parser.escFlavor == backslashEscapeFlavorMySQLWithNull && input == `\N` { + return input, true } + unescaped = unescape(input, "", parser.escFlavor) + isNull = parser.escFlavor != backslashEscapeFlavorMySQLWithNull && + !parser.cfg.NotNull && + unescaped == parser.cfg.Null + return } -func (parser *CSVParser) appendField(content string) { - input, isNull := parser.unescapeString(content) - - if parser.escFlavor != backslashEscapeFlavorMySQLWithNull { - isNull = !parser.cfg.NotNull && parser.cfg.Null == input +func (parser *CSVParser) readByte() (byte, error) { + if len(parser.buf) == 0 { + if err := parser.readBlock(); err != nil { + return 0, err + } } - - var datum types.Datum - if isNull { - datum.SetNull() - } else { - datum.SetString(input) + if len(parser.buf) == 0 { + return 0, io.EOF } - parser.lastRow.Row = append(parser.lastRow.Row, datum) + b := parser.buf[0] + parser.buf = parser.buf[1:] + parser.pos++ + return b, nil } -func (parser *CSVParser) unescapeString(input string) (unescaped string, isNull bool) { - delim := parser.cfg.Delimiter - if len(delim) > 0 && len(input) >= 2 && input[0] == delim[0] { - input = input[1 : len(input)-1] - } else { - delim = "" +func (parser *CSVParser) peekByte() (byte, error) { + if len(parser.buf) == 0 { + if err := parser.readBlock(); err != nil { + return 0, err + } } - if parser.escFlavor == backslashEscapeFlavorMySQLWithNull && input == `\N` { - return input, true + if len(parser.buf) == 0 { + return 0, io.EOF } - return unescape(input, delim, parser.escFlavor), false + return parser.buf[0], nil } -// ReadRow reads a row from the datafile. -func (parser *CSVParser) ReadRow() error { - emptySepCount := 1 - hasField := false +func (parser *CSVParser) skipByte() { + parser.buf = parser.buf[1:] + parser.pos++ +} - row := &parser.lastRow - row.RowID++ - row.Row = make([]types.Datum, 0, len(row.Row)) +// readUntil reads the buffer until any character from the `chars` set is found. +// that character is excluded from the final buffer. 
+func (parser *CSVParser) readUntil(chars string) ([]byte, byte, error) { + index := bytes.IndexAny(parser.buf, chars) + if index >= 0 { + ret := parser.buf[:index] + parser.buf = parser.buf[index:] + parser.pos += int64(index) + return ret, parser.buf[0], nil + } - // skip the header first - if parser.pos == 0 && parser.cfg.Header { - parser.columns = make([]string, 0, len(row.Row)) - outside: - for { - tok, content, err := parser.lex() - if err != nil { - return errors.Trace(err) - } - switch tok { - case csvTokSep: - case csvTokField: - colName, _ := parser.unescapeString(string(content)) - parser.columns = append(parser.columns, strings.ToLower(colName)) - case csvTokNewLine: - break outside + // not found in parser.buf, need allocate and loop. + var buf []byte + for { + buf = append(buf, parser.buf...) + parser.buf = nil + if err := parser.readBlock(); err != nil || len(parser.buf) == 0 { + if err == nil { + err = io.EOF } + parser.pos += int64(len(buf)) + return buf, 0, errors.Trace(err) + } + index := bytes.IndexAny(parser.buf, chars) + if index >= 0 { + buf = append(buf, parser.buf[:index]...) + parser.buf = parser.buf[index:] + parser.pos += int64(len(buf)) + return buf, parser.buf[0], nil } } +} + +func (parser *CSVParser) readRecord() ([]string, error) { + parser.recordBuffer = parser.recordBuffer[:0] + parser.fieldIndexes = parser.fieldIndexes[:0] + isEmptyLine := true +outside: for { - tok, content, err := parser.lex() - switch errors.Cause(err) { - case nil: - case io.EOF: - if hasField { - tok = csvTokNewLine - break + firstByte, err := parser.readByte() + if err != nil { + if isEmptyLine || errors.Cause(err) != io.EOF { + return nil, err } - fallthrough + // treat EOF as the same as trailing \n. + firstByte = '\n' + } + + switch firstByte { + case parser.comma: + parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer)) + + case parser.quote: + if err := parser.readQuotedField(); err != nil { + return nil, err + } + + case '\r', '\n': + // new line = end of record (ignore empty lines) + if isEmptyLine { + continue + } + parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer)) + break outside + default: - return errors.Trace(err) + if firstByte == '\\' && parser.escFlavor != backslashEscapeFlavorNone { + if err := parser.readByteForBackslashEscape(); err != nil { + return nil, err + } + } else { + parser.recordBuffer = append(parser.recordBuffer, firstByte) + } + if err := parser.readUnquoteField(); err != nil { + return nil, err + } } + isEmptyLine = false + } - hasField = true + // Create a single string and create slices out of it. + // This pins the memory of the fields together, but allocates once. + str := string(parser.recordBuffer) // Convert to string once to batch allocations + dst := make([]string, len(parser.fieldIndexes)) + var preIdx int + for i, idx := range parser.fieldIndexes { + dst[i] = str[preIdx:idx] + preIdx = idx + } - switch tok { - case csvTokSep: - emptySepCount++ + // Check or update the expected fields per record. 
+ return dst, nil +} - case csvTokField: - parser.appendEmptyValues(emptySepCount - 1) - emptySepCount = 0 - parser.appendField(string(content)) +func (parser *CSVParser) readByteForBackslashEscape() error { + b, err := parser.readByte() + err = parser.replaceEOF(err, errDanglingBackslash) + if err != nil { + return err + } + parser.recordBuffer = append(parser.recordBuffer, '\\', b) + return nil +} - case csvTokNewLine: - if !parser.cfg.TrimLastSep { - parser.appendEmptyValues(emptySepCount) +func (parser *CSVParser) readQuotedField() error { + for { + content, terminator, err := parser.readUntil(parser.quoteStopSet) + err = parser.replaceEOF(err, errUnterminatedQuotedField) + if err != nil { + return err + } + parser.recordBuffer = append(parser.recordBuffer, content...) + parser.skipByte() + switch terminator { + case parser.quote: + // encountered '"' -> continue if we're seeing '""'. + b, err := parser.peekByte() + err = parser.replaceEOF(err, nil) + if err != nil { + return err + } + switch b { + case parser.quote: + // consume the double quotation mark and continue + parser.skipByte() + parser.recordBuffer = append(parser.recordBuffer, '"') + case '\r', '\n', parser.comma, 0: + // end the field if the next is a separator + return nil + default: + // in all other cases, we've got a syntax error. + parser.logSyntaxError() + return errors.AddStack(errUnexpectedQuoteField) + } + case '\\': + if err := parser.readByteForBackslashEscape(); err != nil { + return err } - return nil } } } -func (parser *CSVParser) ReadUntilTokNewLine() (pos int64, err error) { - hasField := false +func (parser *CSVParser) readUnquoteField() error { for { - tok, _, err := parser.lex() - switch errors.Cause(err) { - case nil: - case io.EOF: - if hasField { - tok = csvTokNewLine - break + content, terminator, err := parser.readUntil(parser.unquoteStopSet) + parser.recordBuffer = append(parser.recordBuffer, content...) + err = parser.replaceEOF(err, nil) + if err != nil { + return err + } + + switch terminator { + case '\r', '\n', parser.comma, 0: + return nil + case parser.quote: + parser.logSyntaxError() + return errors.AddStack(errUnexpectedQuoteField) + case '\\': + parser.skipByte() + if err := parser.readByteForBackslashEscape(); err != nil { + return err } - fallthrough - default: - return parser.pos, errors.Trace(err) } - hasField = true - if tok == csvTokNewLine { - return parser.pos, nil + } +} + +func (parser *CSVParser) replaceEOF(err error, replaced error) error { + if err == nil || errors.Cause(err) != io.EOF { + return err + } + if replaced != nil { + parser.logSyntaxError() + replaced = errors.AddStack(replaced) + } + return replaced +} + +// ReadRow reads a row from the datafile. 
+func (parser *CSVParser) ReadRow() error { + row := &parser.lastRow + row.RowID++ + + // skip the header first + if parser.pos == 0 && parser.cfg.Header { + columns, err := parser.readRecord() + if err != nil { + return errors.Trace(err) + } + parser.columns = make([]string, 0, len(columns)) + for _, colName := range columns { + colName, _ = parser.unescapeString(colName) + parser.columns = append(parser.columns, strings.ToLower(colName)) } } + + records, err := parser.readRecord() + if err != nil { + return errors.Trace(err) + } + // remove trailing empty values + if parser.cfg.TrimLastSep { + var i int + for i = len(records); i > 0 && len(records[i-1]) == 0; i-- { + } + records = records[:i] + } + + row.Row = parser.acquireDatumSlice() + for _, record := range records { + var datum types.Datum + unescaped, isNull := parser.unescapeString(record) + if isNull { + datum.SetNull() + } else { + datum.SetString(unescaped) + } + row.Row = append(row.Row, datum) + } + + return nil +} + +func (parser *CSVParser) ReadUntilTokNewLine() (int64, error) { + _, _, err := parser.readUntil("\r\n") + if err != nil { + return 0, err + } + parser.skipByte() + return parser.pos, nil } diff --git a/lightning/mydump/csv_parser.rl b/lightning/mydump/csv_parser.rl deleted file mode 100644 index 059b7a074..000000000 --- a/lightning/mydump/csv_parser.rl +++ /dev/null @@ -1,107 +0,0 @@ -// Please edit `csv_parser.rl` if you want to modify this file. To generate -// `csv_parser_generated.go`, please execute -// -// ```sh -// make data_parsers -// ``` - -package mydump - -import ( - "io" - - "github.com/pingcap/errors" -) - -%%{ -#` - -# This is a ragel parser to quickly scan through a CSV data source file. -# You may find detailed syntax explanation on its website -# . - -machine csv_parser; - -# We are not going to use Go's `encoding/csv` package since we have some special cases to deal with. -# -# MySQL supports backslash escaping, so the following has 2 fields, but `encoding/csv` will report -# a syntax error. 
-# -# "5\"6",7 -# - -q = ^[\r\n] when { fc == delim }; -bs = '\\' when { parser.escFlavor != backslashEscapeFlavorNone }; -sep = ^[\r\n] when { fc == sep }; - -c = (^[\r\n] - q - bs - sep) | bs any; - -main := |* - sep => { - consumedToken = csvTokSep - fbreak; - }; - - q (c | [\r\n] | sep | q q)* q | c+ => { - consumedToken = csvTokField - fbreak; - }; - - [\r\n]+ => { - consumedToken = csvTokNewLine - fbreak; - }; -*|; - -#` -}%% - -%% write data; - -func (parser *CSVParser) lex() (csvToken, []byte, error) { - var delim byte - if len(parser.cfg.Delimiter) > 0 { - delim = parser.cfg.Delimiter[0] - } - sep := parser.cfg.Separator[0] - - var cs, ts, te, act, p int - %% write init; - - for { - data := parser.buf - consumedToken := csvTokNil - pe := len(data) - eof := -1 - if parser.isLastChunk { - eof = pe - } - - %% write exec; - - if cs == %%{ write error; }%% { - parser.logSyntaxError() - return csvTokNil, nil, errors.New("syntax error") - } - - if consumedToken != csvTokNil { - result := data[ts:te] - parser.buf = data[te:] - parser.pos += int64(te) - return consumedToken, result, nil - } - - if parser.isLastChunk { - return csvTokNil, nil, io.EOF - } - - parser.buf = parser.buf[ts:] - parser.pos += int64(ts) - p -= ts - te -= ts - ts = 0 - if err := parser.readBlock(); err != nil { - return csvTokNil, nil, errors.Trace(err) - } - } -} diff --git a/lightning/mydump/csv_parser_generated.go b/lightning/mydump/csv_parser_generated.go deleted file mode 100644 index 9c348ca1f..000000000 --- a/lightning/mydump/csv_parser_generated.go +++ /dev/null @@ -1,3048 +0,0 @@ -// Code generated by ragel DO NOT EDIT. - -//.... lightning/mydump/csv_parser.rl:1 -// Please edit `csv_parser.rl` if you want to modify this file. To generate -// `csv_parser_generated.go`, please execute -// -// ```sh -// make data_parsers -// ``` - -package mydump - -import ( - "io" - - "github.com/pingcap/errors" -) - -//.... lightning/mydump/csv_parser.rl:57 - -//.... tmp_parser.go:24 -const csv_parser_start int = 8 -const csv_parser_first_final int = 8 -const csv_parser_error int = 0 - -const csv_parser_en_main int = 8 - -//.... lightning/mydump/csv_parser.rl:60 - -func (parser *CSVParser) lex() (csvToken, []byte, error) { - var delim byte - if len(parser.cfg.Delimiter) > 0 { - delim = parser.cfg.Delimiter[0] - } - sep := parser.cfg.Separator[0] - - var cs, ts, te, act, p int - - //.... tmp_parser.go:43 - { - cs = csv_parser_start - ts = 0 - te = 0 - act = 0 - } - - //.... lightning/mydump/csv_parser.rl:70 - - for { - data := parser.buf - consumedToken := csvTokNil - pe := len(data) - eof := -1 - if parser.isLastChunk { - eof = pe - } - - //.... tmp_parser.go:63 - { - var _widec int16 - if p == pe { - goto _test_eof - } - switch cs { - case 8: - goto st_case_8 - case 0: - goto st_case_0 - case 9: - goto st_case_9 - case 10: - goto st_case_10 - case 1: - goto st_case_1 - case 2: - goto st_case_2 - case 11: - goto st_case_11 - case 12: - goto st_case_12 - case 3: - goto st_case_3 - case 13: - goto st_case_13 - case 4: - goto st_case_4 - case 14: - goto st_case_14 - case 15: - goto st_case_15 - case 5: - goto st_case_5 - case 16: - goto st_case_16 - case 6: - goto st_case_6 - case 17: - goto st_case_17 - case 7: - goto st_case_7 - case 18: - goto st_case_18 - case 19: - goto st_case_19 - case 20: - goto st_case_20 - case 21: - goto st_case_21 - case 22: - goto st_case_22 - case 23: - goto st_case_23 - case 24: - goto st_case_24 - } - goto st_out - tr0: - //.... 
NONE:1 - switch act { - case 0: - { - { - goto st0 - } - } - case 1: - { - p = (te) - 1 - - consumedToken = csvTokSep - { - p++ - cs = 8 - goto _out - } - } - case 2: - { - p = (te) - 1 - - consumedToken = csvTokField - { - p++ - cs = 8 - goto _out - } - } - } - - goto st8 - tr14: - //.... lightning/mydump/csv_parser.rl:45 - p = (te) - 1 - { - consumedToken = csvTokField - { - p++ - cs = 8 - goto _out - } - } - goto st8 - tr17: - //.... lightning/mydump/csv_parser.rl:40 - te = p + 1 - { - consumedToken = csvTokSep - { - p++ - cs = 8 - goto _out - } - } - goto st8 - tr23: - //.... lightning/mydump/csv_parser.rl:50 - te = p - p-- - { - consumedToken = csvTokNewLine - { - p++ - cs = 8 - goto _out - } - } - goto st8 - tr24: - //.... lightning/mydump/csv_parser.rl:45 - te = p - p-- - { - consumedToken = csvTokField - { - p++ - cs = 8 - goto _out - } - } - goto st8 - tr25: - //.... lightning/mydump/csv_parser.rl:40 - te = p - p-- - { - consumedToken = csvTokSep - { - p++ - cs = 8 - goto _out - } - } - goto st8 - st8: - //.... NONE:1 - ts = 0 - - //.... NONE:1 - act = 0 - - if p++; p == pe { - goto _test_eof8 - } - st_case_8: - //.... NONE:1 - ts = p - - //.... tmp_parser.go:199 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st9 - case 13: - goto st9 - case 3932: - goto tr1 - case 4188: - goto st2 - case 4444: - goto st1 - case 4700: - goto st5 - case 4956: - goto tr17 - case 5212: - goto tr18 - case 5468: - goto st23 - case 5724: - goto st24 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr1 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr1 - } - default: - goto tr1 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto st2 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto st2 - } - default: - goto st2 - } - default: - goto tr1 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto tr17 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto tr17 - } - default: - goto tr17 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr18 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr18 - } - case _widec >= 3598: - goto tr18 - } - default: - goto tr18 - } - default: - goto tr17 - } - default: - goto st2 - } - goto st0 - st_case_0: - st0: - cs = 0 - goto _out 
- st9: - if p++; p == pe { - goto _test_eof9 - } - st_case_9: - switch data[p] { - case 10: - goto st9 - case 13: - goto st9 - } - goto tr23 - tr1: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st10 - st10: - if p++; p == pe { - goto _test_eof10 - } - st_case_10: - //.... tmp_parser.go:378 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 3932: - goto tr1 - case 4444: - goto st1 - case 4700: - goto st1 - case 5468: - goto st1 - case 5724: - goto st1 - } - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr1 - } - case _widec > 2828: - switch { - case _widec > 2907: - if 2909 <= _widec && _widec <= 3071 { - goto tr1 - } - case _widec >= 2830: - goto tr1 - } - default: - goto tr1 - } - goto tr24 - st1: - if p++; p == pe { - goto _test_eof1 - } - st_case_1: - goto tr1 - st2: - if p++; p == pe { - goto _test_eof2 - } - st_case_2: - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto st2 - case 4188: - goto tr3 - case 4444: - goto st3 - case 4700: - goto tr6 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st4 - case 5724: - goto tr8 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto st2 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto st2 - } - default: - goto st2 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr3 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr3 - } - default: - goto tr3 - } - default: 
- goto st2 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case _widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr3 - } - goto tr0 - tr3: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st11 - st11: - if p++; p == pe { - goto _test_eof11 - } - st_case_11: - //.... tmp_parser.go:638 - _widec = int16(data[p]) - switch { - case data[p] < 11: - if data[p] <= 9 { - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - case data[p] > 12: - if 14 <= data[p] { - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - default: - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - switch { - case _widec < 1035: - if 1024 <= _widec && _widec <= 1033 { - goto st2 - } - case _widec > 1036: - if 1038 <= _widec && _widec <= 1279 { - goto st2 - } - default: - goto st2 - } - goto tr24 - tr4: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st12 - st12: - if p++; p == pe { - goto _test_eof12 - } - st_case_12: - //.... tmp_parser.go:686 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto st2 - case 4188: - goto tr4 - case 4444: - goto st3 - case 4700: - goto tr8 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st4 - case 5724: - goto tr8 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto st2 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto st2 - } - default: - goto st2 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr4 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } 
- case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case _widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr4 - } - goto tr24 - st3: - if p++; p == pe { - goto _test_eof3 - } - st_case_3: - goto st2 - tr8: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st13 - st13: - if p++; p == pe { - goto _test_eof13 - } - st_case_13: - //.... tmp_parser.go:855 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto st2 - case 4188: - goto tr4 - case 4444: - goto st4 - case 4700: - goto tr8 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st4 - case 5724: - goto tr8 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto st2 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto st2 - } - default: - goto st2 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr4 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case _widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr4 - } - goto tr24 - st4: - if p++; p == pe { - goto _test_eof4 - } - st_case_4: - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if 
data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto st2 - case 4188: - goto tr4 - case 4444: - goto st4 - case 4700: - goto tr8 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st4 - case 5724: - goto tr8 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto st2 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto st2 - } - default: - goto st2 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr4 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case _widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr4 - } - goto tr0 - tr6: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st14 - st14: - if p++; p == pe { - goto _test_eof14 - } - st_case_14: - //.... tmp_parser.go:1173 - _widec = int16(data[p]) - switch { - case data[p] < 11: - if data[p] <= 9 { - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - case data[p] > 12: - if 14 <= data[p] { - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - default: - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - } - switch { - case _widec < 782: - switch { - case _widec > 777: - if 779 <= _widec && _widec <= 780 { - goto st2 - } - case _widec >= 768: - goto st2 - } - case _widec > 1033: - switch { - case _widec > 1036: - if 1038 <= _widec && _widec <= 1279 { - goto st2 - } - case _widec >= 1035: - goto st2 - } - default: - goto st2 - } - goto tr24 - tr18: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:40 - act = 1 - goto st15 - st15: - if p++; p == pe { - goto _test_eof15 - } - st_case_15: - //.... 
tmp_parser.go:1237 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto st2 - case 4188: - goto tr3 - case 4444: - goto st3 - case 4700: - goto tr6 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st4 - case 5724: - goto tr8 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto st2 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto st2 - } - default: - goto st2 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr3 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr3 - } - default: - goto tr3 - } - default: - goto st2 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case _widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr3 - } - goto tr25 - st5: - if p++; p == pe { - goto _test_eof5 - } - st_case_5: - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto tr9 - case 13: - goto tr9 - case 3932: - goto tr9 - case 4188: - goto tr10 - case 4444: - goto tr12 - case 4700: - goto tr12 - case 4956: - goto tr9 - case 5212: 
- goto tr11 - case 5468: - goto tr13 - case 5724: - goto tr13 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr9 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr10 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr10 - } - default: - goto tr10 - } - default: - goto tr9 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto tr9 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr11 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr11 - } - case _widec >= 3598: - goto tr11 - } - default: - goto tr11 - } - default: - goto tr9 - } - default: - goto tr10 - } - goto tr0 - tr9: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st16 - st16: - if p++; p == pe { - goto _test_eof16 - } - st_case_16: - //.... tmp_parser.go:1555 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto tr9 - case 4188: - goto tr3 - case 4444: - goto st6 - case 4700: - goto st17 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st7 - case 5724: - goto st19 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr9 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr3 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr3 - } - default: - goto tr3 - } - default: - goto tr9 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case 
_widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr3 - } - goto tr24 - st6: - if p++; p == pe { - goto _test_eof6 - } - st_case_6: - goto tr9 - st17: - if p++; p == pe { - goto _test_eof17 - } - st_case_17: - _widec = int16(data[p]) - switch { - case data[p] < 11: - if data[p] <= 9 { - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - case data[p] > 12: - if 14 <= data[p] { - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - default: - _widec = 768 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - } - switch _widec { - case 10: - goto tr9 - case 13: - goto tr9 - } - switch { - case _widec < 782: - switch { - case _widec > 777: - if 779 <= _widec && _widec <= 780 { - goto tr9 - } - case _widec >= 768: - goto tr9 - } - case _widec > 1033: - switch { - case _widec > 1036: - if 1038 <= _widec && _widec <= 1279 { - goto tr9 - } - case _widec >= 1035: - goto tr9 - } - default: - goto tr9 - } - goto tr24 - st7: - if p++; p == pe { - goto _test_eof7 - } - st_case_7: - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto tr9 - case 13: - goto tr9 - case 3932: - goto tr9 - case 4188: - goto tr11 - case 4444: - goto tr13 - case 4700: - goto tr13 - case 4956: - goto tr9 - case 5212: - goto tr11 - case 5468: - goto tr13 - case 5724: - goto tr13 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr9 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr11 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr11 - } - default: - goto tr11 - } - default: - goto tr9 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto tr9 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr11 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr11 - } - case _widec >= 3598: - goto tr11 - } - default: - goto tr11 - } - default: - goto tr9 - } - default: - goto tr11 - } - goto tr14 - tr11: - //.... NONE:1 - te = p + 1 - - //.... 
lightning/mydump/csv_parser.rl:45 - act = 2 - goto st18 - st18: - if p++; p == pe { - goto _test_eof18 - } - st_case_18: - //.... tmp_parser.go:1935 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto tr9 - case 4188: - goto tr4 - case 4444: - goto st6 - case 4700: - goto st19 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st7 - case 5724: - goto st19 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr9 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr4 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr4 - } - default: - goto tr4 - } - default: - goto tr9 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case _widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr4 - } - goto tr24 - st19: - if p++; p == pe { - goto _test_eof19 - } - st_case_19: - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto tr9 - case 13: - goto tr9 
- case 3932: - goto tr9 - case 4188: - goto tr11 - case 4444: - goto tr13 - case 4700: - goto tr13 - case 4956: - goto tr9 - case 5212: - goto tr11 - case 5468: - goto tr13 - case 5724: - goto tr13 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr9 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr11 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr11 - } - default: - goto tr11 - } - default: - goto tr9 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto tr9 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr11 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr11 - } - case _widec >= 3598: - goto tr11 - } - default: - goto tr11 - } - default: - goto tr9 - } - default: - goto tr11 - } - goto tr24 - tr13: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st20 - st20: - if p++; p == pe { - goto _test_eof20 - } - st_case_20: - //.... tmp_parser.go:2253 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto tr9 - case 4188: - goto tr4 - case 4444: - goto st7 - case 4700: - goto st19 - case 4956: - goto st2 - case 5212: - goto tr4 - case 5468: - goto st7 - case 5724: - goto st19 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr9 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr4 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr4 - } - default: - goto tr4 - } - default: - goto tr9 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 
3593 { - goto tr4 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr4 - } - case _widec >= 3598: - goto tr4 - } - default: - goto tr4 - } - default: - goto st2 - } - default: - goto tr4 - } - goto tr24 - tr10: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st21 - st21: - if p++; p == pe { - goto _test_eof21 - } - st_case_21: - //.... tmp_parser.go:2416 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 3932: - goto tr1 - case 4188: - goto st2 - case 4444: - goto st1 - case 4700: - goto st5 - case 5212: - goto st2 - case 5468: - goto st1 - case 5724: - goto st5 - } - switch { - case _widec < 3083: - switch { - case _widec < 2830: - switch { - case _widec > 2825: - if 2827 <= _widec && _widec <= 2828 { - goto tr1 - } - case _widec >= 2816: - goto tr1 - } - case _widec > 2907: - switch { - case _widec > 3071: - if 3072 <= _widec && _widec <= 3081 { - goto st2 - } - case _widec >= 2909: - goto tr1 - } - default: - goto tr1 - } - case _widec > 3084: - switch { - case _widec < 3584: - switch { - case _widec > 3163: - if 3165 <= _widec && _widec <= 3327 { - goto st2 - } - case _widec >= 3086: - goto st2 - } - case _widec > 3593: - switch { - case _widec < 3598: - if 3595 <= _widec && _widec <= 3596 { - goto st2 - } - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto st2 - } - default: - goto st2 - } - default: - goto st2 - } - default: - goto st2 - } - goto tr24 - tr12: - //.... NONE:1 - te = p + 1 - - //.... lightning/mydump/csv_parser.rl:45 - act = 2 - goto st22 - st22: - if p++; p == pe { - goto _test_eof22 - } - st_case_22: - //.... 
tmp_parser.go:2556 - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto st2 - case 13: - goto st2 - case 3932: - goto tr9 - case 4188: - goto st2 - case 4444: - goto st5 - case 4700: - goto st5 - case 4956: - goto st2 - case 5212: - goto st2 - case 5468: - goto st5 - case 5724: - goto st5 - } - switch { - case _widec < 3086: - switch { - case _widec < 2830: - switch { - case _widec > 2825: - if 2827 <= _widec && _widec <= 2828 { - goto tr9 - } - case _widec >= 2816: - goto tr9 - } - case _widec > 2907: - switch { - case _widec < 3072: - if 2909 <= _widec && _widec <= 3071 { - goto tr9 - } - case _widec > 3081: - if 3083 <= _widec && _widec <= 3084 { - goto st2 - } - default: - goto st2 - } - default: - goto tr9 - } - case _widec > 3163: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3165 <= _widec && _widec <= 3337 { - goto st2 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto st2 - } - default: - goto st2 - } - case _widec > 3593: - switch { - case _widec < 3598: - if 3595 <= _widec && _widec <= 3596 { - goto st2 - } - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto st2 - } - default: - goto st2 - } - default: - goto st2 - } - default: - goto st2 - } - goto tr24 - st23: - if p++; p == pe { - goto _test_eof23 - } - st_case_23: - goto tr1 - st24: - if p++; p == pe { - goto _test_eof24 - } - st_case_24: - _widec = int16(data[p]) - switch { - case data[p] < 14: - switch { - case data[p] > 9: - if 11 <= data[p] && data[p] <= 12 { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] > 91: - switch { - case data[p] > 92: - if 93 <= data[p] { - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - case data[p] >= 92: - _widec = 3840 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if parser.escFlavor != backslashEscapeFlavorNone { - _widec += 512 - } - if data[p] == sep { - _widec += 1024 - } - } - default: - _widec = 2816 + (int16(data[p]) - 0) - if data[p] == delim { - _widec += 256 - } - if data[p] == sep { - _widec += 512 - } - } - switch _widec { - case 10: - goto tr9 - case 13: - goto tr9 - case 3932: - goto tr9 - case 4188: - goto tr10 - case 4444: - goto tr12 - case 4700: - goto tr12 - case 4956: - goto tr9 - case 5212: - goto tr11 - case 5468: - goto tr13 - case 5724: - goto 
tr13 - } - switch { - case _widec < 3165: - switch { - case _widec < 2909: - switch { - case _widec < 2827: - if 2816 <= _widec && _widec <= 2825 { - goto tr9 - } - case _widec > 2828: - if 2830 <= _widec && _widec <= 2907 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3071: - switch { - case _widec < 3083: - if 3072 <= _widec && _widec <= 3081 { - goto tr10 - } - case _widec > 3084: - if 3086 <= _widec && _widec <= 3163 { - goto tr10 - } - default: - goto tr10 - } - default: - goto tr9 - } - case _widec > 3327: - switch { - case _widec < 3421: - switch { - case _widec < 3339: - if 3328 <= _widec && _widec <= 3337 { - goto tr9 - } - case _widec > 3340: - if 3342 <= _widec && _widec <= 3419 { - goto tr9 - } - default: - goto tr9 - } - case _widec > 3583: - switch { - case _widec < 3595: - if 3584 <= _widec && _widec <= 3593 { - goto tr11 - } - case _widec > 3596: - switch { - case _widec > 3675: - if 3677 <= _widec && _widec <= 3839 { - goto tr11 - } - case _widec >= 3598: - goto tr11 - } - default: - goto tr11 - } - default: - goto tr9 - } - default: - goto tr10 - } - goto tr25 - st_out: - _test_eof8: - cs = 8 - goto _test_eof - _test_eof9: - cs = 9 - goto _test_eof - _test_eof10: - cs = 10 - goto _test_eof - _test_eof1: - cs = 1 - goto _test_eof - _test_eof2: - cs = 2 - goto _test_eof - _test_eof11: - cs = 11 - goto _test_eof - _test_eof12: - cs = 12 - goto _test_eof - _test_eof3: - cs = 3 - goto _test_eof - _test_eof13: - cs = 13 - goto _test_eof - _test_eof4: - cs = 4 - goto _test_eof - _test_eof14: - cs = 14 - goto _test_eof - _test_eof15: - cs = 15 - goto _test_eof - _test_eof5: - cs = 5 - goto _test_eof - _test_eof16: - cs = 16 - goto _test_eof - _test_eof6: - cs = 6 - goto _test_eof - _test_eof17: - cs = 17 - goto _test_eof - _test_eof7: - cs = 7 - goto _test_eof - _test_eof18: - cs = 18 - goto _test_eof - _test_eof19: - cs = 19 - goto _test_eof - _test_eof20: - cs = 20 - goto _test_eof - _test_eof21: - cs = 21 - goto _test_eof - _test_eof22: - cs = 22 - goto _test_eof - _test_eof23: - cs = 23 - goto _test_eof - _test_eof24: - cs = 24 - goto _test_eof - - _test_eof: - { - } - if p == eof { - switch cs { - case 9: - goto tr23 - case 10: - goto tr24 - case 1: - goto tr0 - case 2: - goto tr0 - case 11: - goto tr24 - case 12: - goto tr24 - case 3: - goto tr0 - case 13: - goto tr24 - case 4: - goto tr0 - case 14: - goto tr24 - case 15: - goto tr25 - case 5: - goto tr0 - case 16: - goto tr24 - case 6: - goto tr14 - case 17: - goto tr24 - case 7: - goto tr14 - case 18: - goto tr24 - case 19: - goto tr24 - case 20: - goto tr24 - case 21: - goto tr24 - case 22: - goto tr24 - case 23: - goto tr25 - case 24: - goto tr25 - } - } - - _out: - { - } - } - - //.... 
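A note for readers skimming the deleted generated scanner above: the `_widec` arithmetic folds run-time configuration into a widened input symbol, so that one precomputed state machine can react to whichever bytes are configured as the separator and delimiter, and to whether backslash escaping is enabled. The following standalone Go helper is illustrative only (it is not part of this patch, and the name `widen` is invented here); it is a hand-written rendering of just that classification step:

// widen mirrors the _widec computation in the generated scanner above.
// Ordinary bytes are shifted by 2816, a backslash is shifted by 3840, and
// CR / LF are matched literally and never widened. Extra offsets record
// whether the byte equals the configured delimiter or separator and whether
// backslash escaping is active, so those run-time checks become part of the
// widened symbol instead of branches in the state machine.
func widen(b, sep, delim byte, backslashEscape bool) int16 {
	switch {
	case b == '\n' || b == '\r':
		return int16(b)
	case b == '\\':
		w := 3840 + int16(b)
		if b == delim {
			w += 256
		}
		if backslashEscape {
			w += 512
		}
		if b == sep {
			w += 1024
		}
		return w
	default:
		w := 2816 + int16(b)
		if b == delim {
			w += 256
		}
		if b == sep {
			w += 512
		}
		return w
	}
}

Each widened value then selects a transition in the goto tables above (for example, 3840 + 92 + 512 = 4444 for a backslash while escaping is enabled), which is why the generated scanner runs to hundreds of lines and is hard to review.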
lightning/mydump/csv_parser.rl:81 - - if cs == 0 { - parser.logSyntaxError() - return csvTokNil, nil, errors.New("syntax error") - } - - if consumedToken != csvTokNil { - result := data[ts:te] - parser.buf = data[te:] - parser.pos += int64(te) - return consumedToken, result, nil - } - - if parser.isLastChunk { - return csvTokNil, nil, io.EOF - } - - parser.buf = parser.buf[ts:] - parser.pos += int64(ts) - p -= ts - te -= ts - ts = 0 - if err := parser.readBlock(); err != nil { - return csvTokNil, nil, errors.Trace(err) - } - } -} diff --git a/lightning/mydump/csv_parser_test.go b/lightning/mydump/csv_parser_test.go index 9e509b7b0..13ec9cf98 100644 --- a/lightning/mydump/csv_parser_test.go +++ b/lightning/mydump/csv_parser_test.go @@ -1,19 +1,22 @@ package mydump_test import ( - // "fmt" "context" + "encoding/csv" "io" + "os" + "path/filepath" "strings" . "github.com/pingcap/check" "github.com/pingcap/errors" + "github.com/pingcap/tidb/types" + "go.uber.org/zap" + "github.com/pingcap/tidb-lightning/lightning/config" "github.com/pingcap/tidb-lightning/lightning/log" "github.com/pingcap/tidb-lightning/lightning/mydump" "github.com/pingcap/tidb-lightning/lightning/worker" - "github.com/pingcap/tidb/types" - "go.uber.org/zap/zaptest" ) var _ = Suite(&testMydumpCSVParserSuite{}) @@ -52,10 +55,11 @@ type testCase struct { func (s *testMydumpCSVParserSuite) runTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []testCase) { for _, tc := range cases { - parser := mydump.NewCSVParser(cfg, strings.NewReader(tc.input), blockBufSize, s.ioWorkers) + parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc.input), blockBufSize, s.ioWorkers) for i, row := range tc.expected { comment := Commentf("input = %q, row = %d", tc.input, i+1) - c.Assert(parser.ReadRow(), IsNil, comment) + e := parser.ReadRow() + c.Assert(e, IsNil, Commentf("input = %q, row = %d, error = %s", tc.input, i+1, errors.ErrorStack(e))) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{RowID: int64(i) + 1, Row: row}, comment) } c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF, Commentf("input = %q", tc.input)) @@ -64,13 +68,14 @@ func (s *testMydumpCSVParserSuite) runTestCases(c *C, cfg *config.CSVConfig, blo func (s *testMydumpCSVParserSuite) runFailingTestCases(c *C, cfg *config.CSVConfig, blockBufSize int64, cases []string) { for _, tc := range cases { - parser := mydump.NewCSVParser(cfg, strings.NewReader(tc), blockBufSize, s.ioWorkers) - c.Assert(parser.ReadRow(), ErrorMatches, "syntax error", Commentf("input = %q", tc)) + parser := mydump.NewCSVParser(cfg, mydump.NewStringReader(tc), blockBufSize, s.ioWorkers) + e := parser.ReadRow() + c.Assert(e, ErrorMatches, "syntax error.*", Commentf("input = %q / %s", tc, errors.ErrorStack(e))) } } -func (s *testMydumpCSVParserSuite) TestTCPH(c *C) { - reader := strings.NewReader( +func (s *testMydumpCSVParserSuite) TestTPCH(c *C) { + reader := mydump.NewStringReader( `1|goldenrod lavender spring chocolate lace|Manufacturer#1|Brand#13|PROMO BURNISHED COPPER|7|JUMBO PKG|901.00|ly. 
slyly ironi| 2|blush thistle blue yellow saddle|Manufacturer#1|Brand#13|LARGE BRUSHED BRASS|1|LG CASE|902.00|lar accounts amo| 3|spring green yellow purple cornsilk|Manufacturer#4|Brand#42|STANDARD POLISHED BRASS|21|WRAP CASE|903.00|egular deposits hag| @@ -146,7 +151,7 @@ func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) { // example 1, trailing new lines - parser := mydump.NewCSVParser(&cfg, strings.NewReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), config.ReadBlockSize, s.ioWorkers) + parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx\n"), config.ReadBlockSize, s.ioWorkers) c.Assert(parser.ReadRow(), IsNil) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ @@ -174,7 +179,7 @@ func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) { // example 2, no trailing new lines - parser = mydump.NewCSVParser(&cfg, strings.NewReader("aaa,bbb,ccc\nzzz,yyy,xxx"), config.ReadBlockSize, s.ioWorkers) + parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("aaa,bbb,ccc\nzzz,yyy,xxx"), config.ReadBlockSize, s.ioWorkers) c.Assert(parser.ReadRow(), IsNil) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ @@ -202,7 +207,7 @@ func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) { // example 5, quoted fields - parser = mydump.NewCSVParser(&cfg, strings.NewReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), config.ReadBlockSize, s.ioWorkers) + parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","bbb","ccc"`+"\nzzz,yyy,xxx"), config.ReadBlockSize, s.ioWorkers) c.Assert(parser.ReadRow(), IsNil) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ @@ -230,7 +235,7 @@ func (s *testMydumpCSVParserSuite) TestRFC4180(c *C) { // example 6, line breaks within fields - parser = mydump.NewCSVParser(&cfg, strings.NewReader(`"aaa","b + parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b bb","ccc" zzz,yyy,xxx`), config.ReadBlockSize, s.ioWorkers) @@ -260,7 +265,7 @@ zzz,yyy,xxx`), config.ReadBlockSize, s.ioWorkers) // example 7, quote escaping - parser = mydump.NewCSVParser(&cfg, strings.NewReader(`"aaa","b""bb","ccc"`), config.ReadBlockSize, s.ioWorkers) + parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"aaa","b""bb","ccc"`), config.ReadBlockSize, s.ioWorkers) c.Assert(parser.ReadRow(), IsNil) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ @@ -285,7 +290,7 @@ func (s *testMydumpCSVParserSuite) TestMySQL(c *C) { Null: `\N`, } - parser := mydump.NewCSVParser(&cfg, strings.NewReader(`"\"","\\","\?" + parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`"\"","\\","\?" "\ ",\N,\\N`), config.ReadBlockSize, s.ioWorkers) @@ -349,7 +354,7 @@ func (s *testMydumpCSVParserSuite) TestTSV(c *C) { Header: true, } - parser := mydump.NewCSVParser(&cfg, strings.NewReader(`a b c d e f + parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`a b c d e f 0 foo 0000-00-00 0 foo 0000-00-00 0 abc def ghi bar 1999-12-31`), config.ReadBlockSize, s.ioWorkers) @@ -406,17 +411,17 @@ func (s *testMydumpCSVParserSuite) TestEmpty(c *C) { Delimiter: `"`, } - parser := mydump.NewCSVParser(&cfg, strings.NewReader(""), config.ReadBlockSize, s.ioWorkers) + parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), config.ReadBlockSize, s.ioWorkers) c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF) // Try again with headers. 
cfg.Header = true - parser = mydump.NewCSVParser(&cfg, strings.NewReader(""), config.ReadBlockSize, s.ioWorkers) + parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader(""), config.ReadBlockSize, s.ioWorkers) c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF) - parser = mydump.NewCSVParser(&cfg, strings.NewReader("h\n"), config.ReadBlockSize, s.ioWorkers) + parser = mydump.NewCSVParser(&cfg, mydump.NewStringReader("h\n"), config.ReadBlockSize, s.ioWorkers) c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF) } @@ -425,7 +430,7 @@ func (s *testMydumpCSVParserSuite) TestCRLF(c *C) { Separator: ",", Delimiter: `"`, } - parser := mydump.NewCSVParser(&cfg, strings.NewReader("a\rb\r\nc\n\n\n\nd"), config.ReadBlockSize, s.ioWorkers) + parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader("a\rb\r\nc\n\n\n\nd"), config.ReadBlockSize, s.ioWorkers) c.Assert(parser.ReadRow(), IsNil) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ @@ -460,7 +465,7 @@ func (s *testMydumpCSVParserSuite) TestQuotedSeparator(c *C) { Delimiter: `"`, } - parser := mydump.NewCSVParser(&cfg, strings.NewReader(`",",','`), config.ReadBlockSize, s.ioWorkers) + parser := mydump.NewCSVParser(&cfg, mydump.NewStringReader(`",",','`), config.ReadBlockSize, s.ioWorkers) c.Assert(parser.ReadRow(), IsNil) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{ RowID: 1, @@ -481,29 +486,19 @@ func (s *testMydumpCSVParserSuite) TestConsecutiveFields(c *C) { // Go's CSV package returns a parse error. // NPM's CSV package returns a parse error. // MySQL's LOAD DATA statement returns `"xxx"yyy` as-is. - // For simplicity we treat this as two separate fields. cfg := config.CSVConfig{ Separator: ",", Delimiter: `"`, } - testCases := []testCase{ - { - input: `"x"?`, - expected: [][]types.Datum{{types.NewStringDatum("x"), types.NewStringDatum("?")}}, - }, - { - input: "\"\"\x01", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("\x01")}}, - }, - { - input: "\"\"\v", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("\v")}}, - }, + testCases := []string{ + `"x"?`, + "\"\"\x01", + "\"\"\v", } - s.runTestCases(c, &cfg, config.ReadBlockSize, testCases) + s.runFailingTestCases(c, &cfg, config.ReadBlockSize, testCases) } func (s *testMydumpCSVParserSuite) TestSpecialChars(c *C) { @@ -531,7 +526,11 @@ func (s *testMydumpCSVParserSuite) TestSpecialChars(c *C) { }, { input: "\n\r", - expected: [][]types.Datum{{nullDatum}}, + expected: [][]types.Datum{}, + }, + { + input: `"""",0`, + expected: [][]types.Datum{{types.NewStringDatum(`"`), types.NewStringDatum(`0`)}}, }, } @@ -562,116 +561,13 @@ func (s *testMydumpCSVParserSuite) TestContinuation(c *C) { }, }, }, - } - - s.runTestCases(c, &cfg, 1, testCases) -} - -func (s *testMydumpCSVParserSuite) TestOverlappingSepDelim(c *C) { - // If the same character is simultaneously a separator and a delimiter, - // we treat paired characters as a delimiter and an orphan character as a - // separator, due to behavior of picking longest match in Ragel's tokenizer. 
- cfg := config.CSVConfig{ - Separator: ",", - Delimiter: ",", - } - - testCases := []testCase{ - { - input: `,`, - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: "0000,0", - expected: [][]types.Datum{{types.NewStringDatum("0000"), types.NewStringDatum("0")}}, - }, { - input: ",0", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("0")}}, - }, - { - input: ",\r", - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: ",\n", - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: ",\r\n", - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: ",,", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: ",c", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("c")}}, - }, - { - input: ",\x04", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("\x04")}}, - }, - { - input: ",\f", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("\f")}}, - }, - { - input: ",0,", - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: `,\`, - expected: [][]types.Datum{{nullDatum, types.NewStringDatum(`\`)}}, - }, - { - input: "0,00,0", - expected: [][]types.Datum{{types.NewStringDatum("0"), types.NewStringDatum("00"), types.NewStringDatum("0")}}, - }, - { - input: ",,0", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("0")}}, - }, - { - input: ",,\f", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("\f")}}, - }, - { - input: ",,\x8f", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("\x8f")}}, - }, - { - input: ",,,", - expected: [][]types.Datum{{types.NewStringDatum(",")}}, + input: `"VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca"`, + expected: [][]types.Datum{{types.NewStringDatum("VzMXdTXsLbiIqTYQlwPSudocNPKVsAqXgnuvupXEzlxkaFpBtHNDyoVEydoEgdnhsygaNHLpMTdEkpkrkNdzVjCbSoXvUqwoVaca")}}, }, } s.runTestCases(c, &cfg, 1, testCases) - - cfg.BackslashEscape = true - testCases = []testCase{ - { - input: ",,\x02", - expected: [][]types.Datum{{nullDatum, types.NewStringDatum("\x02")}}, - }, - { - input: ",,\n", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: ",,\r", - expected: [][]types.Datum{{nullDatum}}, - }, - } - - s.runTestCases(c, &cfg, 1, testCases) - - failingInputs := []string{ - `,\`, - `,,\`, - } - s.runFailingTestCases(c, &cfg, 1, failingInputs) } func (s *testMydumpCSVParserSuite) TestBackslashAsSep(c *C) { @@ -697,58 +593,9 @@ func (s *testMydumpCSVParserSuite) TestBackslashAsSep(c *C) { `"\`, } s.runFailingTestCases(c, &cfg, 1, failingInputs) - - cfg.BackslashEscape = true - - testCases = []testCase{ - { - input: `0\`, - expected: [][]types.Datum{{types.NewStringDatum("0"), nullDatum}}, - }, - { - input: `\`, - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: `""""\0`, - expected: [][]types.Datum{{types.NewStringDatum(`"`), types.NewStringDatum("\x00")}}, - }, - { - input: `\0`, - expected: [][]types.Datum{{types.NewStringDatum("\x00")}}, - }, - { - input: `"\"`, - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: `"\"\`, - expected: [][]types.Datum{{types.NewStringDatum(`\`), nullDatum}}, - }, - } - - s.runTestCases(c, &cfg, 1, testCases) - - failingInputs = []string{ - `"\`, - "\"\\\xef", - `"000\0`, - `"\0`, - `"\\`, - "\"\\\v", - "\"\\\n", - "\"\\\x00", - "\"\\\r", - } - s.runFailingTestCases(c, &cfg, 1, failingInputs) } func (s *testMydumpCSVParserSuite) TestBackslashAsDelim(c *C) { - // Most of these are just 
documenting the current behavior for coverage, - // there's no sane way to describe the desired behavior. The expected - // results of these tests may change according to the parser's internals. - // - // We'll deny these cases when checking the config. cfg := config.CSVConfig{ Separator: ",", Delimiter: `\`, @@ -766,358 +613,6 @@ func (s *testMydumpCSVParserSuite) TestBackslashAsDelim(c *C) { `"\`, } s.runFailingTestCases(c, &cfg, 1, failingInputs) - - cfg.BackslashEscape = true - - testCases = []testCase{ - { - input: `\0`, - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\x00", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\\`, - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\r", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\n", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "000\r\\0", - expected: [][]types.Datum{{types.NewStringDatum("000")}, {nullDatum}}, - }, - { - input: "\\\xe3", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\v", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\0\xbf", - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: `\0\`, - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: "\\0\n", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\0\r", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\0\v", - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: "00\n\\00", - expected: [][]types.Datum{{types.NewStringDatum("00")}, {types.NewStringDatum("0")}}, - }, - { - input: `\\0`, - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: "00,\\00", - expected: [][]types.Datum{{types.NewStringDatum("00"), types.NewStringDatum("0")}}, - }, - { - input: "\\\\\x00", - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: `\01`, - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: "\\0\x00", - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: `\,`, - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\\\r", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\0\\`, - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: `\0,`, - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: `\\\\\\\\\\0`, - expected: [][]types.Datum{{types.NewStringDatum(`\\\`)}}, - }, - { - input: `\\,`, - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: "\\0\\\r", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\\\n", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\r\\", - expected: [][]types.Datum{{types.NewStringDatum("0\r")}}, - }, - { - input: "\\\\\n", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\0\0`, - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "0\n\\0\\0", - expected: [][]types.Datum{{types.NewStringDatum(`0`)}, {types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\n\\\v", - expected: [][]types.Datum{{types.NewStringDatum("0\n"), types.NewStringDatum("\v")}}, - }, - { - input: "\\0\\\v", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\n\\0", - expected: [][]types.Datum{{types.NewStringDatum("0\n"), types.NewStringDatum("0")}}, - }, - { - input: "\\0\\\x00", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\n\\\n", - expected: 
[][]types.Datum{{types.NewStringDatum("0\n")}}, - }, - { - input: "\\0\r\\\r", - expected: [][]types.Datum{{types.NewStringDatum("0\r")}}, - }, - { - input: "\n\\0\n\\0", - expected: [][]types.Datum{{nullDatum}, {types.NewStringDatum("0\n"), types.NewStringDatum("0")}}, - }, - { - input: "\\0\n\\\x01", - expected: [][]types.Datum{{types.NewStringDatum("0\n"), types.NewStringDatum("\x01")}}, - }, - } - s.runTestCases(c, &cfg, 1, testCases) - - failingInputs = []string{ - `0\`, - `\`, - `\\\`, - `\0,\\`, - } - s.runFailingTestCases(c, &cfg, 1, failingInputs) -} - -func (s *testMydumpCSVParserSuite) TestBackslashAsSepAndDelim(c *C) { - // Most of these are just documenting the current behavior for coverage, - // there's no sane way to describe the desired behavior. The expected - // results of these tests may change according to the parser's internals. - // - // We'll deny these cases when checking the config. - cfg := config.CSVConfig{ - Separator: `\`, - Delimiter: `\`, - } - - testCases := []testCase{ - { - input: `\`, - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: `\0\`, - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: `\\`, - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\\\`, - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - } - s.runTestCases(c, &cfg, 1, testCases) - - cfg.BackslashEscape = true - - testCases = []testCase{ - { - input: `0\`, - expected: [][]types.Datum{{types.NewStringDatum("0"), nullDatum}}, - }, - { - input: `\`, - expected: [][]types.Datum{{nullDatum, nullDatum}}, - }, - { - input: "\\\xe7", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\0`, - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\x00", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\\`, - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\r", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\n", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "000\r\\0", - expected: [][]types.Datum{{types.NewStringDatum("000")}, {nullDatum}}, - }, - { - input: "\\\v", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\0\`, - expected: [][]types.Datum{{types.NewStringDatum("0")}}, - }, - { - input: "00\r\\\\0", - expected: [][]types.Datum{{types.NewStringDatum("00")}, {types.NewStringDatum(`\`)}}, - }, - { - input: "\\0\n\\", - expected: [][]types.Datum{{types.NewStringDatum("0\n")}}, - }, - { - input: "\\\\r", - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: "\\\\\r", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: "\\\\0", - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: "\\\\\v", - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: "\\\\\x00", - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: "\\\\\n", - expected: [][]types.Datum{{nullDatum}}, - }, - { - input: `\\\`, - expected: [][]types.Datum{{types.NewStringDatum(`\`)}}, - }, - { - input: "\\0\\\v", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\n\\\\", - expected: [][]types.Datum{{types.NewStringDatum("0\n\\")}}, - }, - { - input: "\\0\n\\0", - expected: [][]types.Datum{{types.NewStringDatum("0\n"), types.NewStringDatum("0")}}, - }, - { - input: `\0\\`, - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\\\x00", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\n\\\n", - expected: 
[][]types.Datum{{types.NewStringDatum("0\n")}}, - }, - { - input: "\\0\\t", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\n\\\x04", - expected: [][]types.Datum{{types.NewStringDatum("0\n"), types.NewStringDatum("\x04")}}, - }, - { - input: "\\0\\\r", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "\\0\r\\\r", - expected: [][]types.Datum{{types.NewStringDatum("0\r")}}, - }, - { - input: "\\0\n\\\xdf", - expected: [][]types.Datum{{types.NewStringDatum("0\n"), types.NewStringDatum("\xdf")}}, - }, - { - input: "\n\\0\n\\0", - expected: [][]types.Datum{{nullDatum}, {types.NewStringDatum("0\n"), types.NewStringDatum("0")}}, - }, - { - input: "\\0\r\\\v", - expected: [][]types.Datum{{types.NewStringDatum("0\r"), types.NewStringDatum("\v")}}, - }, - { - input: "\\0\\\n", - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: `\0\0`, - expected: [][]types.Datum{{types.NewStringDatum(`0\`)}}, - }, - { - input: "0\n\\0\\0", - expected: [][]types.Datum{{types.NewStringDatum("0")}, {types.NewStringDatum(`0\`)}}, - }, - } - s.runTestCases(c, &cfg, 1, testCases) } // errorReader implements the Reader interface which always returns an error. @@ -1127,6 +622,14 @@ func (*errorReader) Read(p []byte) (int, error) { return 0, errors.New("fake read error") } +func (*errorReader) Seek(offset int64, whence int) (int64, error) { + return 0, errors.New("fake seek error") +} + +func (*errorReader) Close() error { + return errors.New("fake close error") +} + func (s *testMydumpCSVParserSuite) TestReadError(c *C) { cfg := config.CSVConfig{ Separator: ",", @@ -1144,15 +647,103 @@ func (s *testMydumpCSVParserSuite) TestSyntaxErrorLog(c *C) { Delimiter: "'", } - tc := strings.NewReader("x'" + strings.Repeat("y", 50000)) + tc := mydump.NewStringReader("x'" + strings.Repeat("y", 50000)) parser := mydump.NewCSVParser(&cfg, tc, 50000, s.ioWorkers) - var buffer *zaptest.Buffer - parser.Logger, buffer = log.MakeTestLogger() + logger, buffer := log.MakeTestLogger() + parser.SetLogger(logger) c.Assert(parser.ReadRow(), ErrorMatches, "syntax error.*") - c.Assert(parser.Logger.Sync(), IsNil) + c.Assert(logger.Sync(), IsNil) c.Assert( buffer.Stripped(), Equals, `{"$lvl":"ERROR","$msg":"syntax error","pos":1,"content":"'`+strings.Repeat("y", 255)+`"}`, ) } + +// Run `go test github.com/pingcap/tidb-lightning/lightning/mydump -check.b -check.bmem -test.v` to get benchmark result. +// Please ensure your temporary storage has (c.N / 2) KiB of free space. 
+ +type benchCSVParserSuite struct { + csvPath string + ioWorkers *worker.Pool +} + +var _ = Suite(&benchCSVParserSuite{}) + +func (s *benchCSVParserSuite) setupTest(c *C) { + s.ioWorkers = worker.NewPool(context.Background(), 5, "bench_csv") + + dir := c.MkDir() + s.csvPath = filepath.Join(dir, "input.csv") + file, err := os.Create(s.csvPath) + c.Assert(err, IsNil) + defer func() { + c.Assert(file.Close(), IsNil) + }() + for i := 0; i < c.N; i++ { + _, err = file.WriteString("18,1,1,0.3650,GC,BARBARBAR,rw9AOV1AjoI1,50000.00,-10.00,10.00,1,1,djj3Q2XaIPoYVy1FuF,gc80Q2o82Au3C9xv,PYOolSxG3w,DI,265111111,7586538936787184,2020-02-26 20:06:00.193,OE,YCkSPBVqoJ2V5F8zWs87V5XzbaIY70aWCD4dgcB6bjUzCr5wOJCJ2TYH49J7yWyysbudJIxlTAEWSJahY7hswLtTsqyjEkrlsN8iDMAa9Poj29miJ08tnn2G8mL64IlyywvnRGbLbyGvWDdrOSF42RyUFTWVyqlDWc6Gr5wyMPYgvweKemzFDVD3kro5JsmBmJY08EK54nQoyfo2sScyb34zcM9GFo9ZQTwloINfPYQKXQm32m0XvU7jiNmYpFTFJQjdqA825SEvQqMMefG2WG4jVu9UPdhdUjRsFRd0Gw7YPKByOlcuY0eKxT7sAzMKXx2000RR6dqHNXe47oVYd\n") + c.Assert(err, IsNil) + } + c.ResetTimer() +} + +func (s *benchCSVParserSuite) BenchmarkReadRowUsingMydumpCSVParser(c *C) { + s.setupTest(c) + + file, err := os.Open(s.csvPath) + c.Assert(err, IsNil) + defer func() { + c.Assert(file.Close(), IsNil) + }() + + cfg := config.CSVConfig{Separator: ","} + parser := mydump.NewCSVParser(&cfg, file, 65536, s.ioWorkers) + parser.SetLogger(log.Logger{Logger: zap.NewNop()}) + + rowsCount := 0 + for { + err := parser.ReadRow() + if err == nil { + parser.RecycleRow(parser.LastRow()) + rowsCount++ + continue + } + if errors.Cause(err) == io.EOF { + break + } + c.Fatal(err) + } + c.Assert(rowsCount, Equals, c.N) +} + +func (s *benchCSVParserSuite) BenchmarkReadRowUsingEncodingCSV(c *C) { + s.setupTest(c) + + file, err := os.Open(s.csvPath) + c.Assert(err, IsNil) + defer func() { + c.Assert(file.Close(), IsNil) + }() + + csvParser := csv.NewReader(file) + + rowsCount := 0 + var datums []types.Datum + for { + records, err := csvParser.Read() + if err == nil { + // for fair comparison, we need to include the cost of conversion to Datum. + for _, record := range records { + datums = append(datums, types.NewStringDatum(record)) + } + datums = datums[:0] + rowsCount++ + continue + } + if errors.Cause(err) == io.EOF { + break + } + c.Fatal(err) + } + c.Assert(rowsCount, Equals, c.N) +} diff --git a/lightning/mydump/parser.go b/lightning/mydump/parser.go index a34179bf5..7749245bd 100644 --- a/lightning/mydump/parser.go +++ b/lightning/mydump/parser.go @@ -20,6 +20,7 @@ import ( "regexp" "strconv" "strings" + "sync" "time" "github.com/pingcap/errors" @@ -34,7 +35,7 @@ import ( type blockParser struct { // states for the lexer - reader io.Reader + reader PooledReader buf []byte blockBuf []byte isLastChunk bool @@ -42,6 +43,7 @@ type blockParser struct { // The list of column names of the last INSERT statement. columns []string + rowPool *sync.Pool lastRow Row // Current file offset. 
pos int64 @@ -49,20 +51,23 @@ type blockParser struct { // cache remainBuf *bytes.Buffer appendBuf *bytes.Buffer - ioWorkers *worker.Pool // the Logger associated with this parser for reporting failure Logger log.Logger } -func makeBlockParser(reader io.Reader, blockBufSize int64, ioWorkers *worker.Pool) blockParser { +func makeBlockParser(reader ReadSeekCloser, blockBufSize int64, ioWorkers *worker.Pool) blockParser { return blockParser{ - reader: reader, + reader: MakePooledReader(reader, ioWorkers), blockBuf: make([]byte, blockBufSize*config.BufferSizeScale), remainBuf: &bytes.Buffer{}, appendBuf: &bytes.Buffer{}, - ioWorkers: ioWorkers, Logger: log.L(), + rowPool: &sync.Pool{ + New: func() interface{} { + return make([]types.Datum, 0, 16) + }, + }, } } @@ -102,16 +107,19 @@ type Parser interface { Close() error ReadRow() error LastRow() Row + RecycleRow(row Row) // Columns returns the _lower-case_ column names corresponding to values in // the LastRow. Columns() []string + + SetLogger(log.Logger) } // NewChunkParser creates a new parser which can read chunks out of a file. func NewChunkParser( sqlMode mysql.SQLMode, - reader io.Reader, + reader ReadSeekCloser, blockBufSize int64, ioWorkers *worker.Pool, ) *ChunkParser { @@ -126,13 +134,9 @@ func NewChunkParser( } } -// Reader returns the underlying reader of this parser. -func (parser *blockParser) Reader() io.Reader { - return parser.reader -} - // SetPos changes the reported position and row ID. func (parser *blockParser) SetPos(pos int64, rowID int64) { + parser.reader.Seek(pos, io.SeekStart) parser.pos = pos parser.lastRow.RowID = rowID } @@ -143,10 +147,7 @@ func (parser *blockParser) Pos() (int64, int64) { } func (parser *blockParser) Close() error { - if closer, ok := parser.reader.(io.Closer); ok { - return closer.Close() - } - return errors.New("this parser is not created with a reader that can be closed") + return parser.reader.Close() } func (parser *blockParser) Columns() []string { @@ -164,6 +165,10 @@ func (parser *blockParser) logSyntaxError() { ) } +func (parser *blockParser) SetLogger(logger log.Logger) { + parser.Logger = logger +} + type token byte const ( @@ -216,10 +221,7 @@ func (tok token) String() string { func (parser *blockParser) readBlock() error { startTime := time.Now() - // limit IO concurrency - w := parser.ioWorkers.Apply() - n, err := io.ReadFull(parser.reader, parser.blockBuf) - parser.ioWorkers.Recycle(w) + n, err := parser.reader.ReadFull(parser.blockBuf) switch err { case io.ErrUnexpectedEOF, io.EOF: @@ -427,7 +429,7 @@ func (parser *ChunkParser) ReadRow() error { switch tok { case tokRowBegin: row.RowID++ - row.Row = make([]types.Datum, 0, len(row.Row)) + row.Row = parser.acquireDatumSlice() st = stateRow case tokUnquoted, tokDoubleQuoted, tokBackQuoted: parser.columns = nil @@ -499,6 +501,16 @@ func (parser *blockParser) LastRow() Row { return parser.lastRow } +// RecycleRow places the row object back into the allocation pool. +func (parser *blockParser) RecycleRow(row Row) { + parser.rowPool.Put(row.Row[:0]) +} + +// acquireDatumSlice allocates an empty []types.Datum +func (parser *blockParser) acquireDatumSlice() []types.Datum { + return parser.rowPool.Get().([]types.Datum) +} + // ReadChunks parses the entire file and splits it into continuous chunks of // size >= minSize. 
func ReadChunks(parser Parser, minSize int64) ([]Chunk, error) { diff --git a/lightning/mydump/parser_test.go b/lightning/mydump/parser_test.go index 826b603b4..6268e5dc8 100644 --- a/lightning/mydump/parser_test.go +++ b/lightning/mydump/parser_test.go @@ -16,7 +16,6 @@ package mydump_test import ( "context" "io" - "strings" . "github.com/pingcap/check" "github.com/pingcap/errors" @@ -40,10 +39,11 @@ func (s *testMydumpParserSuite) TearDownSuite(c *C) {} func (s *testMydumpParserSuite) runTestCases(c *C, mode mysql.SQLMode, blockBufSize int64, cases []testCase) { for _, tc := range cases { - parser := mydump.NewChunkParser(mode, strings.NewReader(tc.input), blockBufSize, s.ioWorkers) + parser := mydump.NewChunkParser(mode, mydump.NewStringReader(tc.input), blockBufSize, s.ioWorkers) for i, row := range tc.expected { - comment := Commentf("input = %q, row = %d", tc.input, i+1) - c.Assert(parser.ReadRow(), IsNil, comment) + e := parser.ReadRow() + comment := Commentf("input = %q, row = %d, err = %s", tc.input, i+1, errors.ErrorStack(e)) + c.Assert(e, IsNil, comment) c.Assert(parser.LastRow(), DeepEquals, mydump.Row{RowID: int64(i) + 1, Row: row}, comment) } c.Assert(errors.Cause(parser.ReadRow()), Equals, io.EOF, Commentf("input = %q", tc.input)) @@ -52,13 +52,13 @@ func (s *testMydumpParserSuite) runTestCases(c *C, mode mysql.SQLMode, blockBufS func (s *testMydumpParserSuite) runFailingTestCases(c *C, mode mysql.SQLMode, blockBufSize int64, cases []string) { for _, tc := range cases { - parser := mydump.NewChunkParser(mode, strings.NewReader(tc), blockBufSize, s.ioWorkers) + parser := mydump.NewChunkParser(mode, mydump.NewStringReader(tc), blockBufSize, s.ioWorkers) c.Assert(parser.ReadRow(), ErrorMatches, "syntax error.*", Commentf("input = %q", tc)) } } func (s *testMydumpParserSuite) TestReadRow(c *C) { - reader := strings.NewReader( + reader := mydump.NewStringReader( "/* whatever pragmas */;" + "INSERT INTO `namespaced`.`table` (columns, more, columns) VALUES (1,-2, 3),\n(4,5., 6);" + "INSERT `namespaced`.`table` (x,y,z) VALUES (7,8,9);" + @@ -129,7 +129,7 @@ func (s *testMydumpParserSuite) TestReadRow(c *C) { } func (s *testMydumpParserSuite) TestReadChunks(c *C) { - reader := strings.NewReader(` + reader := mydump.NewStringReader(` INSERT foo VALUES (1,2,3,4),(5,6,7,8),(9,10,11,12); INSERT foo VALUES (13,14,15,16),(17,18,19,20),(21,22,23,24),(25,26,27,28); INSERT foo VALUES (29,30,31,32),(33,34,35,36); @@ -174,7 +174,7 @@ func (s *testMydumpParserSuite) TestReadChunks(c *C) { } func (s *testMydumpParserSuite) TestNestedRow(c *C) { - reader := strings.NewReader(` + reader := mydump.NewStringReader(` INSERT INTO exam_detail VALUES ("123",CONVERT("{}" USING UTF8MB4)), ("456",CONVERT("{\"a\":4}" USING UTF8MB4)), @@ -351,8 +351,29 @@ func (s *testMydumpParserSuite) TestVariousSyntax(c *C) { s.runTestCases(c, mysql.ModeNone, config.ReadBlockSize, testCases) } +func (s *testMydumpParserSuite) TestContinuation(c *C) { + testCases := []testCase{ + { + input: ` + ('FUZNtcGYegeXwnMRKtYnXtFhgnAMTzQHEBUTBehAFBQdPsnjHhRwRZhZLtEBsIDUFduzftskgxkYkPmEgvoirfIZRsARXjsdKwOc') + `, + expected: [][]types.Datum{ + {types.NewStringDatum("FUZNtcGYegeXwnMRKtYnXtFhgnAMTzQHEBUTBehAFBQdPsnjHhRwRZhZLtEBsIDUFduzftskgxkYkPmEgvoirfIZRsARXjsdKwOc")}, + }, + }, + { + input: "INSERT INTO `report_case_high_risk` VALUES (2,'4','6',8,10);", + expected: [][]types.Datum{ + {types.NewUintDatum(2), types.NewStringDatum("4"), types.NewStringDatum("6"), types.NewUintDatum(8), types.NewUintDatum(10)}, + }, + }, + } + + 
s.runTestCases(c, mysql.ModeNone, 1, testCases) +} + func (s *testMydumpParserSuite) TestPseudoKeywords(c *C) { - reader := strings.NewReader(` + reader := mydump.NewStringReader(` INSERT INTO t ( c, C, co, CO, diff --git a/lightning/mydump/reader.go b/lightning/mydump/reader.go index a30e42461..66a9221e0 100644 --- a/lightning/mydump/reader.go +++ b/lightning/mydump/reader.go @@ -23,6 +23,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/tidb-lightning/lightning/log" + "github.com/pingcap/tidb-lightning/lightning/worker" "go.uber.org/zap" "golang.org/x/text/encoding/simplifiedchinese" ) @@ -112,3 +113,64 @@ func ExportStatement(sqlFile string, characterSet string) ([]byte, error) { } return data, nil } + +// ReadSeekCloser = Reader + Seeker + Closer +type ReadSeekCloser interface { + io.Reader + io.Seeker + io.Closer +} + +// StringReader is a wrapper around *strings.Reader with an additional Close() method +type StringReader struct{ *strings.Reader } + +// NewStringReader constructs a new StringReader +func NewStringReader(s string) StringReader { + return StringReader{Reader: strings.NewReader(s)} +} + +// Close implements io.Closer +func (sr StringReader) Close() error { + return nil +} + +// PooledReader is a throttled reader wrapper, where Read() calls have an upper limit of concurrency +// imposed by the given worker pool. +type PooledReader struct { + reader ReadSeekCloser + ioWorkers *worker.Pool +} + +// MakePooledReader constructs a new PooledReader. +func MakePooledReader(reader ReadSeekCloser, ioWorkers *worker.Pool) PooledReader { + return PooledReader{ + reader: reader, + ioWorkers: ioWorkers, + } +} + +// Read implements io.Reader +func (pr PooledReader) Read(p []byte) (n int, err error) { + w := pr.ioWorkers.Apply() + defer pr.ioWorkers.Recycle(w) + return pr.reader.Read(p) +} + +// Seek implements io.Seeker +func (pr PooledReader) Seek(offset int64, whence int) (int64, error) { + w := pr.ioWorkers.Apply() + defer pr.ioWorkers.Recycle(w) + return pr.reader.Seek(offset, whence) +} + +// Close implements io.Closer +func (pr PooledReader) Close() error { + return pr.reader.Close() +} + +// ReadFull is same as `io.ReadFull(pr)` with less worker recycling +func (pr PooledReader) ReadFull(buf []byte) (n int, err error) { + w := pr.ioWorkers.Apply() + defer pr.ioWorkers.Recycle(w) + return io.ReadFull(pr.reader, buf) +} diff --git a/lightning/mydump/region.go b/lightning/mydump/region.go index c1de2992b..95ca6d990 100644 --- a/lightning/mydump/region.go +++ b/lightning/mydump/region.go @@ -14,7 +14,6 @@ package mydump import ( - "io" "math" "os" "strings" @@ -221,10 +220,6 @@ func SplitLargeFile( } parser := NewCSVParser(&cfg.Mydumper.CSV, reader, cfg.Mydumper.ReadBlockSize, ioWorker) parser.SetPos(endOffset, prevRowIdMax) - _, err = reader.Seek(endOffset, io.SeekStart) - if err != nil { - return 0, nil, nil, err - } pos, err := parser.ReadUntilTokNewLine() if err != nil { return 0, nil, nil, err diff --git a/lightning/restore/restore.go b/lightning/restore/restore.go index 5da223946..350e1dc1c 100644 --- a/lightning/restore/restore.go +++ b/lightning/restore/restore.go @@ -1276,7 +1276,6 @@ func newChunkRestore( parser = mydump.NewChunkParser(cfg.TiDB.SQLMode, reader, blockBufSize, ioWorkers) } - reader.Seek(chunk.Chunk.Offset, io.SeekStart) parser.SetPos(chunk.Chunk.Offset, chunk.Chunk.PrevRowIDMax) return &chunkRestore{ @@ -1743,6 +1742,7 @@ func (cr *chunkRestore) encodeLoop( // sql -> kv kvs, encodeErr := kvEncoder.Encode(logger, lastRow.Row, lastRow.RowID, 
cr.chunk.ColumnPermutation)
encodeDur += time.Since(encodeDurStart)
+ cr.parser.RecycleRow(lastRow)
if encodeErr != nil {
err = errors.Annotatef(encodeErr, "in file %s at offset %d", &cr.chunk.Key, newOffset)
return