cmd/compile: accept new Go2 number literals

This CL introduces compiler support for the new binary and octal integer
literals, hexadecimal floats, and digit separators for all number literals.

The new Go 2 number literal scanner accepts the following liberal format:

	number   = [ prefix ] digits [ "." digits ] [ exponent ] [ "i" ] .
	prefix   = "0" [ "b" | "B" | "o" | "O" | "x" | "X" ] .
	digits   = { digit | "_" } .
	exponent = ( "e" | "E" | "p" | "P" ) [ "+" | "-" ] digits .

If the number starts with "0x" or "0X", digit is any hexadecimal digit;
otherwise, digit is any decimal digit. If the accepted number is not valid,
errors are reported accordingly.

See the new test cases in scanner_test.go for a selection of valid and
invalid numbers and the respective error messages.

R=Go1.13

Updates golang#12711.
Updates golang#19308.
Updates golang#28493.
Updates golang#29008.

Change-Id: Ic8febc7bd4dc5186b16a8c8897691e81125cf0ca
Reviewed-on: https://go-review.googlesource.com/c/157677
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
nebulabox · Feb 20, 2019 · 8f699f8 · 8f699f8
1 parent a069b1c
commit 8f699f8
Show file tree

Hide file tree

Showing 7 changed files with 487 additions and 88 deletions.
diff --git a/src/cmd/compile/internal/gc/mpfloat.go b/src/cmd/compile/internal/gc/mpfloat.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"math"
 	"math/big"
+	"strings"
 )
 
 // implements float arithmetic
@@ -177,11 +178,14 @@ func (a *Mpflt) Neg() {
 }
 
 func (a *Mpflt) SetString(as string) {
+	// TODO(gri) remove this code once math/big.Float.Parse can handle separators
+	as = strings.Replace(as, "_", "", -1) // strip separators
+
 	for len(as) > 0 && (as[0] == ' ' || as[0] == '\t') {
 		as = as[1:]
 	}
 
-	f, _, err := a.Val.Parse(as, 10)
+	f, _, err := a.Val.Parse(as, 0)
 	if err != nil {
 		yyerror("malformed constant: %s (%v)", as, err)
 		a.Val.SetFloat64(0)

diff --git a/src/cmd/compile/internal/gc/mpint.go b/src/cmd/compile/internal/gc/mpint.go
@@ -7,6 +7,7 @@ package gc
 import (
 	"fmt"
 	"math/big"
+	"strings"
 )
 
 // implements integer arithmetic
@@ -281,6 +282,12 @@ func (a *Mpint) SetInt64(c int64) {
 }
 
 func (a *Mpint) SetString(as string) {
+	// TODO(gri) remove this code once math/big.Int.SetString can handle 0o-octals and separators
+	as = strings.Replace(as, "_", "", -1) // strip separators
+	if len(as) >= 2 && as[0] == '0' && (as[1] == 'o' || as[1] == 'O') {
+		as = "0" + as[2:]
+	}
+
 	_, ok := a.Val.SetString(as, 0)
 	if !ok {
 		// required syntax is [+-][0[x]]d*

diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go
@@ -47,6 +47,10 @@ func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mod
 	s.nlsemi = false
 }
 
+func (s *scanner) errorf(format string, args ...interface{}) { // errorf formats the message with fmt.Sprintf and hands it to s.error
+	s.error(fmt.Sprintf(format, args...))
+}
+
 // next advances the scanner by reading the next token.
 //
 // If a read, source encoding, or lexical error occurs, next calls
@@ -149,8 +153,9 @@ redo:
 
 	case '.':
 		c = s.getr()
-		if isDigit(c) {
-			s.unread(1)
+		if isDecimal(c) {
+			s.ungetr()
+			s.unread(1) // correct position of '.' (needed by startLit in number)
 			s.number('.')
 			break
 		}
@@ -304,7 +309,7 @@ redo:
 
 	default:
 		s.tok = 0
-		s.error(fmt.Sprintf("invalid character %#U", c))
+		s.errorf("invalid character %#U", c)
 		goto redo
 	}
 
@@ -320,19 +325,15 @@ assignop:
 }
 
 func isLetter(c rune) bool {
-	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_'
-}
-
-func isDigit(c rune) bool {
-	return '0' <= c && c <= '9'
+	return 'a' <= lower(c) && lower(c) <= 'z' || c == '_' // ASCII letter of either case (folded via lower) or '_'
 }
 
 func (s *scanner) ident() {
 	s.startLit()
 
 	// accelerate common case (7bit ASCII)
 	c := s.getr()
-	for isLetter(c) || isDigit(c) {
+	for isLetter(c) || isDecimal(c) {
 		c = s.getr()
 	}
 
@@ -372,10 +373,10 @@ func (s *scanner) isIdentRune(c rune, first bool) bool {
 		// ok
 	case unicode.IsDigit(c):
 		if first {
-			s.error(fmt.Sprintf("identifier cannot begin with digit %#U", c))
+			s.errorf("identifier cannot begin with digit %#U", c)
 		}
 	case c >= utf8.RuneSelf:
-		s.error(fmt.Sprintf("invalid identifier character %#U", c))
+		s.errorf("invalid identifier character %#U", c)
 	default:
 		return false
 	}
@@ -401,86 +402,188 @@ func init() {
 	}
 }
 
+func lower(c rune) rune     { return ('a' - 'A') | c } // ORs in 'a'-'A' (0x20); the result is the lower-case letter iff c is an ASCII letter
+func isDecimal(c rune) bool { return '0' <= c && c <= '9' } // reports whether c is an ASCII decimal digit
+func isHex(c rune) bool     { return '0' <= c && c <= '9' || 'a' <= lower(c) && lower(c) <= 'f' } // reports whether c is an ASCII hexadecimal digit of either case
+
+// digits accepts the sequence { digit | '_' } starting with c0.
+// If base <= 10, digits accepts any decimal digit but records
+// the index (relative to the literal start) of a digit >= base
+// in *invalid, if *invalid < 0; otherwise it accepts hex digits.
+// invalid is dereferenced only when such a digit is seen, so it
+// may be nil for base 10. digits returns the first rune past the
+// sequence and a bitset: bit 0 = digits seen, bit 1 = '_' seen.
+func (s *scanner) digits(c0 rune, base int, invalid *int) (c rune, digsep int) {
+	c = c0
+	if base <= 10 {
+		max := rune('0' + base) // first rune past the digits that are valid for base
+		for isDecimal(c) || c == '_' {
+			ds := 1 // assume c is a digit
+			if c == '_' {
+				ds = 2 // c is a separator
+			} else if c >= max && *invalid < 0 {
+				*invalid = int(s.col0 - s.col) // record invalid rune index
+			}
+			digsep |= ds
+			c = s.getr()
+		}
+	} else { // base > 10: any hexadecimal digit is accepted
+		for isHex(c) || c == '_' {
+			ds := 1 // assume c is a digit
+			if c == '_' {
+				ds = 2 // c is a separator
+			}
+			digsep |= ds
+			c = s.getr()
+		}
+	}
+	return
+}
+
 func (s *scanner) number(c rune) {
 	s.startLit()
 
+	base := 10        // number base
+	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
+	digsep := 0       // bit 0: digit present, bit 1: '_' present
+	invalid := -1     // index of invalid digit in literal, or < 0
+
+	// integer part (skipped if the literal starts with '.')
+	var ds int
 	if c != '.' {
-		s.kind = IntLit // until proven otherwise
+		s.kind = IntLit // until a radix point, exponent, or 'i' suffix changes it below
 		if c == '0' {
 			c = s.getr()
-			if c == 'x' || c == 'X' {
-				// hex
+			switch lower(c) {
+			case 'x':
 				c = s.getr()
-				hasDigit := false
-				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
-					c = s.getr()
-					hasDigit = true
-				}
-				if !hasDigit {
-					s.error("malformed hex constant")
-				}
-				goto done
-			}
-
-			// decimal 0, octal, or float
-			has8or9 := false
-			for isDigit(c) {
-				if c > '7' {
-					has8or9 = true
-				}
+				base, prefix = 16, 'x'
+			case 'o':
 				c = s.getr()
-			}
-			if c != '.' && c != 'e' && c != 'E' && c != 'i' {
-				// octal
-				if has8or9 {
-					s.error("malformed octal constant")
-				}
-				goto done
-			}
-
-		} else {
-			// decimal or float
-			for isDigit(c) {
+				base, prefix = 8, 'o'
+			case 'b':
 				c = s.getr()
+				base, prefix = 2, 'b'
+			default:
+				base, prefix = 8, '0' // leading 0: treated as octal unless the literal turns out to be a float or imaginary
+				digsep = 1 // leading 0
 			}
 		}
+		c, ds = s.digits(c, base, &invalid)
+		digsep |= ds
 	}
 
-	// float
+	// fractional part
 	if c == '.' {
 		s.kind = FloatLit
-		c = s.getr()
-		for isDigit(c) {
-			c = s.getr()
+		if prefix == 'o' || prefix == 'b' { // a radix point is an error after a 0o or 0b prefix
+			s.error("invalid radix point in " + litname(prefix))
 		}
+		c, ds = s.digits(s.getr(), base, &invalid)
+		digsep |= ds
+	}
+
+	if digsep&1 == 0 { // bit 0 clear: neither the integer nor the fractional part had a digit
+		s.error(litname(prefix) + " has no digits")
 	}
 
 	// exponent
-	if c == 'e' || c == 'E' {
-		s.kind = FloatLit
+	if e := lower(c); e == 'e' || e == 'p' {
+		switch {
+		case e == 'e' && prefix != 0 && prefix != '0':
+			s.errorf("%q exponent requires decimal mantissa", c)
+		case e == 'p' && prefix != 'x':
+			s.errorf("%q exponent requires hexadecimal mantissa", c)
+		}
 		c = s.getr()
-		if c == '-' || c == '+' {
+		s.kind = FloatLit
+		if c == '+' || c == '-' {
 			c = s.getr()
 		}
-		if !isDigit(c) {
-			s.error("malformed floating-point constant exponent")
-		}
-		for isDigit(c) {
-			c = s.getr()
+		c, ds = s.digits(c, 10, nil) // exponent digits are always decimal; nil is safe because no decimal digit is >= 10
+		digsep |= ds
+		if ds&1 == 0 {
+			s.error("exponent has no digits")
 		}
+	} else if prefix == 'x' && s.kind == FloatLit { // hex literal with a radix point but no exponent
+		s.error("hexadecimal mantissa requires a 'p' exponent")
 	}
 
-	// complex
+	// suffix 'i'
 	if c == 'i' {
 		s.kind = ImagLit
-		s.getr()
+		if prefix != 0 && prefix != '0' { // 'i' is accepted only on unprefixed and 0-prefixed literals
+			s.error("invalid suffix 'i' on " + litname(prefix))
+		}
+		c = s.getr()
 	}
-
-done:
 	s.ungetr()
+
 	s.nlsemi = true
 	s.lit = string(s.stopLit())
 	s.tok = _Literal
+
+	if s.kind == IntLit && invalid >= 0 { // an out-of-range digit is reported only if the literal stayed an integer
+		s.errh(s.line, s.col+uint(invalid), fmt.Sprintf("invalid digit %q in %s", s.lit[invalid], litname(prefix)))
+	}
+
+	if digsep&2 != 0 { // check separator placement only if a '_' was seen
+		if i := invalidSep(s.lit); i >= 0 {
+			s.errh(s.line, s.col+uint(i), "'_' must separate successive digits")
+		}
+	}
+}
+
+func litname(prefix rune) string { // litname returns the literal description used in error messages for the given prefix
+	switch prefix {
+	case 'x':
+		return "hexadecimal literal"
+	case 'o', '0':
+		return "octal literal"
+	case 'b':
+		return "binary literal"
+	}
+	return "decimal literal" // prefix 0 (no prefix) and any other value
+}
+
+// invalidSep returns the byte index of the first '_' in x that is not between two digits (a base prefix counts as a digit), or -1.
+func invalidSep(x string) int {
+	x1 := ' ' // prefix char, we only care if it's 'x'
+	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
+	i := 0
+
+	// a prefix counts as a digit
+	if len(x) >= 2 && x[0] == '0' {
+		x1 = lower(rune(x[1]))
+		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
+			d = '0'
+			i = 2
+		}
+	}
+
+	// mantissa and exponent
+	for ; i < len(x); i++ {
+		p := d // previous digit
+		d = rune(x[i])
+		switch {
+		case d == '_':
+			if p != '0' { // '_' must directly follow a digit or the base prefix
+				return i
+			}
+		case isDecimal(d) || x1 == 'x' && isHex(d): // hex letters count as digits only after an 0x prefix
+			d = '0'
+		default:
+			if p == '_' { // '_' must be followed by a digit; report the position of the '_'
+				return i - 1
+			}
+			d = '.'
+		}
+	}
+	if d == '_' { // trailing '_'
+		return len(x) - 1
+	}
+
+	return -1
 }
 
 func (s *scanner) rune() {
@@ -713,12 +816,10 @@ func (s *scanner) escape(quote rune) bool {
 	for i := n; i > 0; i-- {
 		d := base
 		switch {
-		case isDigit(c):
+		case isDecimal(c):
 			d = uint32(c) - '0'
-		case 'a' <= c && c <= 'f':
-			d = uint32(c) - ('a' - 10)
-		case 'A' <= c && c <= 'F':
-			d = uint32(c) - ('A' - 10)
+		case 'a' <= lower(c) && lower(c) <= 'f':
+			d = uint32(lower(c)) - ('a' - 10)
 		}
 		if d >= base {
 			if c < 0 {
@@ -728,7 +829,7 @@ func (s *scanner) escape(quote rune) bool {
 			if base == 8 {
 				kind = "octal"
 			}
-			s.error(fmt.Sprintf("non-%s character in escape sequence: %c", kind, c))
+			s.errorf("non-%s character in escape sequence: %c", kind, c)
 			s.ungetr()
 			return false
 		}
@@ -739,7 +840,7 @@ func (s *scanner) escape(quote rune) bool {
 	s.ungetr()
 
 	if x > max && base == 8 {
-		s.error(fmt.Sprintf("octal escape value > 255: %d", x))
+		s.errorf("octal escape value > 255: %d", x)
 		return false
 	}