Skip to content

Commit

Permalink
Allow any whitespace after \ to escape newline in """ and disallow co…
Browse files Browse the repository at this point in the history
…ntrol characters

This changed in TOML 0.5:

	Allow accidental whitespace between backslash and newline in the
	line continuation operator in multi-line basic strings

In general it deals with CRLF inside """ strings better: it all gets
normalized to LF.

This also fixes the last of the crashes in #239; """\\r""" would panic.
Control characters aren't allowed in strings anyway, so error out on
that too.
  • Loading branch information
arp242 committed Jun 13, 2021
1 parent 720994e commit 35432de
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 76 deletions.
102 changes: 59 additions & 43 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -458,30 +458,6 @@ func TestDecodeBadDatetime(t *testing.T) {
}
}

// TestDecodeMultilineStrings decodes two multi-line basic strings and checks
// the resulting value: one spanning several lines with an embedded "\n"
// escape, and one whose only line ends in a line-continuation backslash.
func TestDecodeMultilineStrings(t *testing.T) {
	var got struct {
		S string
	}

	// The newline directly after the opening delimiter is not part of the
	// expected value; the remaining newlines are.
	const severalLines = `s = """
a b \n c
d e f
"""`
	_, err := Decode(severalLines, &got)
	if err != nil {
		t.Fatal(err)
	}
	if want := "a b \n c\nd e f\n"; got.S != want {
		t.Errorf("got: %q; want: %q", got.S, want)
	}

	// A trailing backslash is expected to swallow the newline that follows.
	const continued = `s = """a b c\
"""`
	_, err = Decode(continued, &got)
	if err != nil {
		t.Fatal(err)
	}
	if want := "a b c"; got.S != want {
		t.Errorf("got: %q; want: %q", got.S, want)
	}
}

type sphere struct {
Center [3]float64
Radius float64
Expand Down Expand Up @@ -1004,6 +980,16 @@ func TestDecodeErrors(t *testing.T) {
{`x = [{ key = 42 #`, "expected a comma or an inline table terminator", true}, // panic
{`x = {a = 42 #`, "expected a comma or an inline table terminator '}', but got end of file instead", true},
{`x = [42 #`, "expected a comma or array terminator ']', but got end of file instead", false},

// Literal control characters are not allowed in any kind of string
{`x = """` + "\r" + `"""`, `control characters are not allowed`, true},
{`x = """` + "\x01" + `"""`, `control characters are not allowed`, true},
{`x = '''` + "\r" + `'''`, `control characters are not allowed`, true},
{`x = '''` + "\x01" + `'''`, `control characters are not allowed`, true},
{`x = "` + "\r" + `"`, `control characters are not allowed`, true},
{`x = "` + "\x01" + `"`, `control characters are not allowed`, true},
{`x = '` + "\r" + `'`, `control characters are not allowed`, true},
{`x = '` + "\x01" + `'`, `control characters are not allowed`, true},
}

for _, tt := range tests {
Expand All @@ -1028,36 +1014,66 @@ func TestDecodeMultilineNewlines(t *testing.T) {
in string
want string
}{
// Note `NL` gets replaced by "\n"; this makes it easier to read and
// write these tests.
// Note "NL" gets replaced by "\n" and "\r\n" (the tests are run twice);
// this makes it easier to read and write these tests.

{`x = """"""`, ``},
{`x = """\NL"""`, ``}, // Empty string
{`x = """\NL\NL\NL"""`, ``}, // Empty string

{`x=""""""`, ``},
{`x="""\NL"""`, ``}, // Empty string
{`x="""\NL\NL\NL"""`, ``}, // Empty string
{`x = """a\NL u2222b"""`, `au2222b`}, // Remove all whitespace after \
{`x = """a\NLNLNLu2222b"""`, `au2222b`}, // Remove all newlines
{`x = """a \NL u2222b"""`, `a u2222b`}, // Don't remove whitespace before \

{`x="""a\NL u2222b"""`, `au2222b`}, // Remove all whitespace after \
{`x="""a\NLNLNLu2222b"""`, `au2222b`}, // Remove all newlines
{`x="""a \NL u2222b"""`, `a u2222b`}, // Don't remove whitespace before \
{`x = """a \ NLb"""`, `a b`}, // Allow any whitespace between \n and \
{`x = """a \ NL b"""`, `a b`},
{`x = """a \ NLb"""`, `a b`},

{`x="""a\NLu2222b"""`, `au2222b`}, // Ends in \ → remove
{`x="""a\\NLu2222b"""`, `a\NLu2222b`}, // Ends in \\ → literal backslash, so keep NL.
{`x="""a\\\NLu2222b"""`, `a\u2222b`}, // Ends in \\\ → backslash followed by NL escape, so remove.
{`x="""a\\\\NLu2222b"""`, `a\\NLu2222b`}, // Ends in \\\\ → two literal backslashes; keep NL

{`x = """NLa b \n cNLd e fNL"""`, "a b \n c\nd e f\n"},
{`x = """a b c\NL"""`, "a b c"},

{`x = """NLThe quick brown \NLNLNLfox jumps over \NL the lazy dog."""`,
`The quick brown fox jumps over the lazy dog.`},
{`x = """\NL The quick brown \NLNLNL fox jumps over \NL the lazy dog.\NL """`,
`The quick brown fox jumps over the lazy dog.`},
}

replUnix := strings.NewReplacer("NL", "\n")
replWin := strings.NewReplacer("NL", "\r\n")
for _, tt := range tests {
t.Run("", func(t *testing.T) {
tt.in = strings.ReplaceAll(tt.in, "NL", "\n")
tt.want = strings.ReplaceAll(tt.want, "NL", "\n")

var s struct{ X string }
_, err := Decode(tt.in, &s)
if err != nil {
t.Fatal(err)
}
if s.X != tt.want {
t.Errorf("\nhave: %s\nwant: %s", s.X, tt.want)
}
t.Run("unix", func(t *testing.T) {
in := replUnix.Replace(tt.in)
want := replUnix.Replace(tt.want)

var s struct{ X string }
_, err := Decode(in, &s)
if err != nil {
t.Fatal(err)
}
if s.X != want {
t.Errorf("\nhave: %q\nwant: %q", s.X, want)
}
})

t.Run("windows", func(t *testing.T) {
in := replWin.Replace(tt.in)
want := replWin.Replace(tt.want)

var s struct{ X string }
_, err := Decode(in, &s)
if err != nil {
t.Fatal(err)
}
if s.X != want {
t.Errorf("\nhave: %q\nwant: %q", s.X, want)
}
})
})
}
}
Expand Down
42 changes: 40 additions & 2 deletions lex.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,8 @@ func lexString(lx *lexer) stateFn {
switch {
case r == eof:
return lx.errorf(`unexpected EOF; expected '"'`)
case isControl(r) || r == '\r':
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
case isNL(r):
return lx.errorf("strings cannot contain newlines")
case r == '\\':
Expand All @@ -614,9 +616,15 @@ func lexString(lx *lexer) stateFn {
// lexMultilineString consumes the inner contents of a string. It assumes that
// the beginning '"""' has already been consumed and ignored.
func lexMultilineString(lx *lexer) stateFn {
switch lx.next() {
r := lx.next()
switch r {
case eof:
return lx.errorf(`unexpected EOF; expected '"""'`)
case '\r':
if lx.peek() != '\n' {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineString
case '\\':
return lexMultilineStringEscape
case stringEnd:
Expand All @@ -635,6 +643,10 @@ func lexMultilineString(lx *lexer) stateFn {
lx.backup()
}
}

if isControl(r) {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineString
}

Expand All @@ -645,6 +657,8 @@ func lexRawString(lx *lexer) stateFn {
switch {
case r == eof:
return lx.errorf(`unexpected EOF; expected "'"`)
case isControl(r) || r == '\r':
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
case isNL(r):
return lx.errorf("strings cannot contain newlines")
case r == rawStringEnd:
Expand All @@ -661,9 +675,15 @@ func lexRawString(lx *lexer) stateFn {
// a string. It assumes that the beginning "'''" has already been consumed and
// ignored.
func lexMultilineRawString(lx *lexer) stateFn {
switch lx.next() {
r := lx.next()
switch r {
case eof:
return lx.errorf(`unexpected EOF; expected "'''"`)
case '\r':
if lx.peek() != '\n' {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineString
case rawStringEnd:
if lx.accept(rawStringEnd) {
if lx.accept(rawStringEnd) {
Expand All @@ -680,6 +700,10 @@ func lexMultilineRawString(lx *lexer) stateFn {
lx.backup()
}
}

if isControl(r) {
return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
}
return lexMultilineRawString
}

Expand Down Expand Up @@ -710,6 +734,10 @@ func lexStringEscape(lx *lexer) stateFn {
fallthrough
case '"':
fallthrough
// Inside """ .. """ strings you can use \ to escape newlines, and any
// amount of whitespace can be between the \ and \n.
case ' ', '\t':
fallthrough
case '\\':
return lx.pop()
case 'u':
Expand Down Expand Up @@ -908,6 +936,16 @@ func isNL(r rune) bool {
return r == '\n' || r == '\r'
}

// isControl reports whether r is a control character, i.e. in the range
// U+0000 to U+001F or equal to U+007F. Tab, carriage return and line feed are
// deliberately excluded and always yield false.
func isControl(r rune) bool {
	if r == '\t' || r == '\r' || r == '\n' {
		return false
	}
	return r == 0x7f || (r >= 0x00 && r <= 0x1f)
}

// isDigit reports whether r is one of the ASCII decimal digits '0' to '9'.
func isDigit(r rune) bool {
	return '0' <= r && r <= '9'
}
Expand Down
69 changes: 38 additions & 31 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,47 +537,54 @@ func (p *parser) current() string {
}

// stripFirstNewline returns s without its leading line break, if it has one.
// Both "\n" and "\r\n" count as a line break; at most one is removed. A lone
// "\r" that is not followed by "\n" is left in place.
func stripFirstNewline(s string) string {
	if len(s) > 0 && s[0] == '\n' {
		return s[1:]
	}
	if len(s) > 1 && s[0] == '\r' && s[1] == '\n' {
		return s[2:]
	}
	return s
}

// stripEscapedNewlines removes line-ending backslashes inside a triple-quoted
// string, together with the newline and all whitespace (including blank
// lines) up to the next non-whitespace character.
//
//	\NL   → line continuation: remove
//	\\NL  → escaped backslash: keep the line and its newline untouched
//	\\\NL → escaped backslash followed by a continuation: remove the last "\"
//
// Any amount of spaces or tabs may sit between the "\" and the newline.
// Whitespace in front of the "\" is kept. On lines that do not end in a
// backslash a trailing "\r" is dropped, so "\r\n" becomes "\n" there. Escape
// sequences themselves are not decoded here.
func stripEscapedNewlines(s string) string {
	lines := strings.Split(s, "\n")

	// cont is true while we are still skipping whitespace that follows a
	// line-ending backslash; it is cleared at the first non-blank line so
	// that later newlines are preserved again.
	cont := false
	for i, line := range lines {
		last := i == len(lines)-1

		if cont {
			line = strings.TrimLeft(line, " \t\r")
			lines[i] = line
			if line == "" {
				// Whitespace-only line swallowed by the continuation.
				continue
			}
			cont = false
		}

		trimmed := strings.TrimRight(line, " \t\r")
		if len(trimmed) == 0 || trimmed[len(trimmed)-1] != '\\' {
			// Ordinary line: normalize the line ending and put back the
			// newline that Split removed (there is none after the last part).
			lines[i] = strings.TrimRight(line, "\r")
			if !last {
				lines[i] += "\n"
			}
			continue
		}

		// Count the trailing backslashes: an even number means they are all
		// escaped pairs, an odd number means the last one escapes the newline.
		n := 0
		for j := len(trimmed) - 1; j >= 0 && trimmed[j] == '\\'; j-- {
			n++
		}
		if n%2 == 0 {
			// Literal backslash at the end of the line; keep the line as it
			// is and only restore a newline that was actually in the input.
			if !last {
				lines[i] += "\n"
			}
			continue
		}

		lines[i] = trimmed[:len(trimmed)-1] // Remove the escaping "\".
		cont = true
	}
	return strings.Join(lines, "")
}

func (p *parser) replaceEscapes(str string) string {
Expand Down

0 comments on commit 35432de

Please sign in to comment.