syntax: strict string escapes

This change causes Starlark, like Go, to reject backslashes that are not part of an escape sequence. Previously they were treated literally, so "\(" would encode a two-character string, and much code relied on this, especially for regular expressions. This may break some programs, but the fix is simple: double each errant backslash. Python does not yet enforce this behavior, but since 3.6 has emitted a deprecation warning for it. Also, document string escapes. This is Google issue b/34519173. Change-Id: I5c9609a4e28d58593e9d6918757bca2cfd838d51
google · Mar 25, 2020 · 3d63113 · 3d63113
1 parent 8dd3e2e
commit 3d63113
Show file tree

Hide file tree

Showing 12 changed files with 202 additions and 70 deletions.
diff --git a/doc/spec.md b/doc/spec.md
@@ -321,6 +321,132 @@ hex_digit     = '0' … '9' | 'A' … 'F' | 'a' … 'f' .
 binary_digit  = '0' | '1' .
 ```
 
+### String literals
+
+A Starlark string literal denotes a string. In its simplest form, it
+consists of the desired text surrounded by matching single quotation
+marks or double quotation marks:
+
+```python
+"abc"
+'abc'
+```
+
+Literal occurrences of the chosen quotation mark character must be
+escaped by a preceding backslash. So, if a string contains several
+of one kind of quotation mark, it may be convenient to quote the string
+using the other kind, as in these examples:
+
+```python
+'Have you read "To Kill a Mockingbird?"'
+"Yes, it's a classic."
+
+"Have you read \"To Kill a Mockingbird?\""
+'Yes, it\'s a classic.'
+```
+
+#### String escapes
+
+Within a string literal, the backslash character `\` indicates the
+start of an _escape sequence_, a notation for expressing things that
+are impossible or awkward to write directly.
+
+The following *traditional escape sequences* represent the ASCII control
+codes 7-13:
+
+```
+\a   \x07 alert or bell
+\b   \x08 backspace
+\f   \x0C form feed
+\n   \x0A line feed
+\r   \x0D carriage return
+\t   \x09 horizontal tab
+\v   \x0B vertical tab
+```
+
+A *literal backslash* is written using the escape `\\`.
+
+An *escaped newline*---that is, a backslash at the end of a line---is
+ignored, allowing string literals to be conveniently split across
+multiple lines (though triple-quoted strings make this even more
+convenient).
+
+```python
+"abc\
+def"			# "abcdef"
+```
+
+An *octal escape* encodes a single byte using its octal value. It
+consists of a backslash followed by one, two, or three octal digits
+(0-7). It is an error if the value is greater than 255.
+
+```python
+'\0'			# "\x00"  a string containing a single NUL byte
+'\12'			# "\n"    octal 12 = decimal 10
+'\101-\132'		# "A-Z"
+'\119'			# "\t9"   = "\11" + "9"
+```
+
+The Java implementation encodes strings using UTF-16, 
+so an octal escape encodes a single UTF-16 code unit.
+Octal escapes for values above 127 are therefore not portable across implementations.
+There is little reason to use octal escapes in new code.
+
+A *hex escape* encodes a single byte using its hexadecimal value. It
+consists of `\x` followed by exactly two hexadecimal digits [0-9A-Fa-f].
+
+```python
+"\x00"			# "\x00"  a string containing a single NUL byte 
+"(\x20)"			# "( )"   ASCII 0x20 = 32 = space
+```
+<pre>
+red, reset = "\x1b[31m", "\x1b[0m"	# ANSI terminal control codes for color 
+"(" + red + "hello" + reset + ")"	# (<span color=red>hello</span>)
+</pre>
+
+The Java implementation does not support hex escapes.
+
+An ordinary string literal may not contain an unescaped newline, but a
+*multiline string literal* may spread over multiple source lines.
+It is denoted using three quotation marks at start and end.
+Within it, unescaped newlines and quotation marks (or even pairs of
+quotation marks) have their literal meaning, but three quotation marks
+end the literal. This makes it easy to quote large blocks of text with
+few escapes.
+
+```
+haiku = '''
+Yesterday it worked.
+Today it is not working.
+That's computers. Sigh.
+'''
+```
+
+Regardless of the platform's convention for text line endings---for
+example, a linefeed (\n) on UNIX, or a carriage return followed by a
+linefeed (\r\n) on Microsoft Windows---an unescaped line ending in a
+multiline string literal always denotes a line feed (\n).
+
+Starlark also supports *raw string literals*, which look like an
+ordinary single- or double-quotation preceded by `r`. Within a raw
+string literal, there is no special processing of backslash escapes,
+other than an escaped quotation mark (which denotes a literal
+quotation mark), or an escaped newline (which denotes a backslash
+followed by a newline). This form of quotation is typically used when
+writing strings that contain many quotation marks or backslashes (such
+as regular expressions or shell commands) to reduce the burden of
+escaping:
+
+```python
+"a\nb"		# "a\nb"  = 'a' + '\n' + 'b'
+r"a\nb"		# "a\\nb" = 'a' + '\\' + 'n' + 'b'
+
+"a\
+b"		# "ab"
+r"a\
+b"		# "a\\\nb"
+```
+
 TODO: define string_lit, indent, outdent, semicolon, newline, eof
 
 ## Data types

diff --git a/starlark/testdata/dict.star b/starlark/testdata/dict.star
@@ -13,7 +13,7 @@ assert.true({False: False})
 assert.true(not {})
 
 # dict + dict is no longer supported.
-assert.fails(lambda: {"a": 1} + {"b": 2}, 'unknown binary op: dict \+ dict')
+assert.fails(lambda: {"a": 1} + {"b": 2}, 'unknown binary op: dict \\+ dict')
 
 # dict comprehension
 assert.eq({x: x*x for x in range(3)}, {0: 0, 1: 1, 2: 4})

diff --git a/starlark/testdata/function.star b/starlark/testdata/function.star
@@ -173,7 +173,7 @@ assert.fails(lambda: f(
     33, 34, 35, 36, 37, 38, 39, 40,
     41, 42, 43, 44, 45, 46, 47, 48,
     49, 50, 51, 52, 53, 54, 55, 56,
-    57, 58, 59, 60, 61, 62, 63, 64), "missing 1 argument \(mm\)")
+    57, 58, 59, 60, 61, 62, 63, 64), "missing 1 argument \\(mm\\)")
 
 assert.fails(lambda: f(
     1, 2, 3, 4, 5, 6, 7, 8,

diff --git a/starlark/testdata/int.star b/starlark/testdata/int.star
@@ -176,8 +176,8 @@ assert.fails(lambda: int("-0123", 0), "invalid literal.*base 0")
 assert.fails(lambda: int("0Oxa", 8), "invalid literal with base 8: 0Oxa")
 # follow-on bugs to issue 108
 assert.fails(lambda: int("--4"), "invalid literal with base 10: --4")
-assert.fails(lambda: int("++4"), "invalid literal with base 10: \+\+4")
-assert.fails(lambda: int("+-4"), "invalid literal with base 10: \+-4")
+assert.fails(lambda: int("++4"), "invalid literal with base 10: \\+\\+4")
+assert.fails(lambda: int("+-4"), "invalid literal with base 10: \\+-4")
 assert.fails(lambda: int("0x-4", 16), "invalid literal with base 16: 0x-4")
 
 # bitwise union (int|int), intersection (int&int), XOR (int^int), unary not (~int),

diff --git a/starlark/testdata/list.star b/starlark/testdata/list.star
@@ -16,14 +16,14 @@ assert.true(not [])
 
 # indexing, x[i]
 abc = list("abc".elems())
-assert.fails(lambda : abc[-4], "list index -4 out of range \[-3:2]")
+assert.fails(lambda : abc[-4], "list index -4 out of range \\[-3:2]")
 assert.eq(abc[-3], "a")
 assert.eq(abc[-2], "b")
 assert.eq(abc[-1], "c")
 assert.eq(abc[0], "a")
 assert.eq(abc[1], "b")
 assert.eq(abc[2], "c")
-assert.fails(lambda : abc[3], "list index 3 out of range \[-3:2]")
+assert.fails(lambda : abc[3], "list index 3 out of range \\[-3:2]")
 
 # x[i] = ...
 x3 = [0, 1, 2]
@@ -45,8 +45,8 @@ assert.fails(x3.clear, "cannot clear frozen list")
 
 # list + list
 assert.eq([1, 2, 3] + [3, 4, 5], [1, 2, 3, 3, 4, 5])
-assert.fails(lambda : [1, 2] + (3, 4), "unknown.*list \+ tuple")
-assert.fails(lambda : (1, 2) + [3, 4], "unknown.*tuple \+ list")
+assert.fails(lambda : [1, 2] + (3, 4), "unknown.*list \\+ tuple")
+assert.fails(lambda : (1, 2) + [3, 4], "unknown.*tuple \\+ list")
 
 # list * int,  int * list
 assert.eq(abc * 0, [])
@@ -98,8 +98,8 @@ listcompblock()
 
 # list.pop
 x4 = [1, 2, 3, 4, 5]
-assert.fails(lambda : x4.pop(-6), "index -6 out of range \[-5:4]")
-assert.fails(lambda : x4.pop(6), "index 6 out of range \[-5:4]")
+assert.fails(lambda : x4.pop(-6), "index -6 out of range \\[-5:4]")
+assert.fails(lambda : x4.pop(6), "index 6 out of range \\[-5:4]")
 assert.eq(x4.pop(), 5)
 assert.eq(x4, [1, 2, 3, 4])
 assert.eq(x4.pop(1), 2)

diff --git a/starlark/testdata/module.star b/starlark/testdata/module.star
@@ -14,4 +14,4 @@ assert.fails(assignfield, "can't assign to .foo field of module")
 
 # no such field
 assert.fails(lambda : assert.nonesuch, "module has no .nonesuch field or method$")
-assert.fails(lambda : assert.falls, "module has no .falls field or method .did you mean .fails\?")
+assert.fails(lambda : assert.falls, "module has no .falls field or method .did you mean .fails\\?")
diff --git a/starlark/testdata/set.star b/starlark/testdata/set.star
@@ -33,9 +33,9 @@ assert.eq(list(set([1, 3, 2, 3])), [1, 3, 2])
 assert.eq(type(set("hello".elems())), "set")
 assert.eq(list(set("hello".elems())), ["h", "e", "l", "o"])
 assert.eq(list(set(range(3))), [0, 1, 2])
-assert.fails(lambda: set(1), "got int, want iterable")
-assert.fails(lambda: set(1, 2, 3), "got 3 arguments")
-assert.fails(lambda: set([1, 2, {}]), "unhashable type: dict")
+assert.fails(lambda : set(1), "got int, want iterable")
+assert.fails(lambda : set(1, 2, 3), "got 3 arguments")
+assert.fails(lambda : set([1, 2, {}]), "unhashable type: dict")
 
 # truth
 assert.true(not set())
@@ -46,12 +46,12 @@ x = set([1, 2, 3])
 y = set([3, 4, 5])
 
 # set + any is not defined
-assert.fails(lambda: x + y, "unknown.*: set \+ set")
+assert.fails(lambda : x + y, "unknown.*: set \\+ set")
 
 # set | set (use resolve.AllowBitwise to enable it)
 assert.eq(list(set("a".elems()) | set("b".elems())), ["a", "b"])
 assert.eq(list(set("ab".elems()) | set("bc".elems())), ["a", "b", "c"])
-assert.fails(lambda: set() | [], "unknown binary op: set | list")
+assert.fails(lambda : set() | [], "unknown binary op: set | list")
 assert.eq(type(x | y), "set")
 assert.eq(list(x | y), [1, 2, 3, 4, 5])
 assert.eq(list(x | set([5, 1])), [1, 2, 3, 5])
@@ -65,7 +65,7 @@ assert.eq(type(x.union(y)), "set")
 assert.eq(list(x.union(y)), [1, 2, 3, 4, 5])
 assert.eq(list(x.union([5, 1])), [1, 2, 3, 5])
 assert.eq(list(x.union((6, 5, 4))), [1, 2, 3, 6, 5, 4])
-assert.fails(lambda: x.union([1, 2, {}]), "unhashable type: dict")
+assert.fails(lambda : x.union([1, 2, {}]), "unhashable type: dict")
 
 # intersection, set & set (use resolve.AllowBitwise to enable it)
 assert.eq(list(set("a".elems()) & set("b".elems())), [])
@@ -75,13 +75,14 @@ assert.eq(list(set("ab".elems()) & set("bc".elems())), ["b"])
 assert.eq(set([1, 2, 3]) ^ set([4, 5, 3]), set([1, 2, 4, 5]))
 
 def test_set_augmented_assign():
-  x = set([1, 2, 3])
-  x &= set([2, 3])
-  assert.eq(x, set([2, 3]))
-  x |= set([1])
-  assert.eq(x, set([1, 2, 3]))
-  x ^= set([4, 5, 3])
-  assert.eq(x, set([1, 2, 4, 5]))
+    x = set([1, 2, 3])
+    x &= set([2, 3])
+    assert.eq(x, set([2, 3]))
+    x |= set([1])
+    assert.eq(x, set([1, 2, 3]))
+    x ^= set([4, 5, 3])
+    assert.eq(x, set([1, 2, 4, 5]))
+
 test_set_augmented_assign()
 
 # len
@@ -99,17 +100,19 @@ assert.eq(x, x)
 assert.eq(y, y)
 assert.true(x != y)
 assert.eq(set([1, 2, 3]), set([3, 2, 1]))
-assert.fails(lambda: x < y, "set < set not implemented")
+assert.fails(lambda : x < y, "set < set not implemented")
 
 # iteration
 assert.true(type([elem for elem in x]), "list")
 assert.true(list([elem for elem in x]), [1, 2, 3])
+
 def iter():
-  list = []
-  for elem in x:
-    list.append(elem)
-  return list
+    list = []
+    for elem in x:
+        list.append(elem)
+    return list
+
 assert.eq(iter(), [1, 2, 3])
 
 # sets are not indexable
-assert.fails(lambda: x[0], "unhandled.*operation")
+assert.fails(lambda : x[0], "unhandled.*operation")
diff --git a/starlark/testdata/string.star b/starlark/testdata/string.star
@@ -8,7 +8,7 @@ assert.eq(r'a\bc', "a\\bc")
 
 # truth
 assert.true("abc")
-assert.true("\0")
+assert.true(chr(0))
 assert.true(not "")
 
 # str + str
@@ -200,7 +200,7 @@ assert.eq("a{x!r}c".format(x='b'), r'a"b"c')
 assert.fails(lambda: "{x!}".format(x=1), "unknown conversion")
 assert.fails(lambda: "{x!:}".format(x=1), "unknown conversion")
 assert.fails(lambda: '{a.b}'.format(1), "syntax x.y is not supported")
-assert.fails(lambda: '{a[0]}'.format(1), "syntax a\[i\] is not supported")
+assert.fails(lambda: '{a[0]}'.format(1), "syntax a\\[i\\] is not supported")
 assert.fails(lambda: '{ {} }'.format(1), "nested replacement fields not supported")
 assert.fails(lambda: '{{}'.format(1), "single '}' in format")
 assert.fails(lambda: '{}}'.format(1), "single '}' in format")

diff --git a/starlarkstruct/testdata/struct.star b/starlarkstruct/testdata/struct.star
@@ -58,6 +58,6 @@ assert.eq(getattr(alice, "city"), "NYC")
 assert.eq(bob + bob, bob)
 assert.eq(bob + alice, person(age = 50, city = "NYC", name = "alice"))
 assert.eq(alice + bob, person(age = 50, city = "NYC", name = "bob"))  # not commutative! a misfeature
-assert.fails(lambda : alice + 1, "struct \+ int")
+assert.fails(lambda : alice + 1, "struct \\+ int")
 assert.eq(http + http, http)
-assert.fails(lambda : http + bob, "different constructors: hostport \+ person")
+assert.fails(lambda : http + bob, "different constructors: hostport \\+ person")
diff --git a/syntax/quote.go b/syntax/quote.go
@@ -40,13 +40,6 @@ var esc = [256]byte{
 	'"':  '"',
 }
 
-// notEsc is a list of characters that can follow a \ in a string value
-// without having to escape the \. That is, since ( is in this list, we
-// quote the Go string "foo\\(bar" as the Python literal "foo\(bar".
-// This really does happen in BUILD files, especially in strings
-// being used as shell arguments containing regular expressions.
-const notEsc = " !#$%&()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~"
-
 // unquote unquotes the quoted string, returning the actual
 // string value, whether the original was triple-quoted, and
 // an error describing invalid input.
@@ -127,17 +120,19 @@ func unquote(quoted string) (s string, triple bool, err error) {
 
 		switch quoted[1] {
 		default:
-			// In Python, if \z (for some byte z) is not a known escape sequence
-			// then it appears as literal text in the string.
-			buf.WriteString(quoted[:2])
-			quoted = quoted[2:]
+			// In Starlark, like Go, a backslash must escape something.
+			// (Python still treats unnecessary backslashes literally,
+			// but since 3.6 has emitted a deprecation warning.)
+			err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
+			return
 
 		case '\n':
 			// Ignore the escape and the line break.
 			quoted = quoted[2:]
 
-		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
-			// One-char escape
+		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
+			// One-char escape.
+			// We escape only the kind of quotation mark in use.
 			buf.WriteByte(unesc[quoted[1]])
 			quoted = quoted[2:]
 
@@ -227,18 +222,6 @@ func quote(unquoted string, triple bool) string {
 			buf.WriteByte(c)
 			continue
 		}
-		if c == '\\' {
-			if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 {
-				// Can pass \ through when followed by a byte that
-				// known not to be a valid escape sequence and also
-				// that does not trigger an escape sequence of its own.
-				// Use this, because various BUILD files do.
-				buf.WriteByte('\\')
-				buf.WriteByte(unquoted[i+1])
-				i++
-				continue
-			}
-		}
 		if esc[c] != 0 {
 			buf.WriteByte('\\')
 			buf.WriteByte(esc[c])