Rollup merge of rust-lang#62963 - estebank:homoglyph-recovery, r=petr…

…ochenkov Allow lexer to recover from some homoglyphs
Centril · Jul 26, 2019 · 1aae896 · 1aae896
2 parents 58fa03a + 6844976
commit 1aae896
Show file tree

Hide file tree

Showing 6 changed files with 89 additions and 36 deletions.
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
@@ -389,8 +389,18 @@ impl<'a> StringReader<'a> {
                                                           self.pos,
                                                           "unknown start of token",
                                                           c);
-                unicode_chars::check_for_substitution(self, start, c, &mut err);
-                return Err(err)
+                // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
+                // instead of keeping a table in `check_for_substitution`into the token. Ideally,
+                // this should be inside `rustc_lexer`. However, we should first remove compound
+                // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
+                // as there will be less overall work to do this way.
+                return match unicode_chars::check_for_substitution(self, start, c, &mut err) {
+                    Some(token) => {
+                        err.emit();
+                        Ok(token)
+                    }
+                    None => Err(err),
+                }
             }
         };
         Ok(kind)

diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs
@@ -3,7 +3,8 @@
 
 use super::StringReader;
 use errors::{Applicability, DiagnosticBuilder};
-use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
+use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION, symbol::kw};
+use crate::parse::token;
 
 #[rustfmt::skip] // for line breaks
 const UNICODE_ARRAY: &[(char, &str, char)] = &[
@@ -297,53 +298,59 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
     ('＞', "Fullwidth Greater-Than Sign", '>'),
 ];
 
-const ASCII_ARRAY: &[(char, &str)] = &[
-    (' ', "Space"),
-    ('_', "Underscore"),
-    ('-', "Minus/Hyphen"),
-    (',', "Comma"),
-    (';', "Semicolon"),
-    (':', "Colon"),
-    ('!', "Exclamation Mark"),
-    ('?', "Question Mark"),
-    ('.', "Period"),
-    ('\'', "Single Quote"),
-    ('"', "Quotation Mark"),
-    ('(', "Left Parenthesis"),
-    (')', "Right Parenthesis"),
-    ('[', "Left Square Bracket"),
-    (']', "Right Square Bracket"),
-    ('{', "Left Curly Brace"),
-    ('}', "Right Curly Brace"),
-    ('*', "Asterisk"),
-    ('/', "Slash"),
-    ('\\', "Backslash"),
-    ('&', "Ampersand"),
-    ('+', "Plus Sign"),
-    ('<', "Less-Than Sign"),
-    ('=', "Equals Sign"),
-    ('>', "Greater-Than Sign"),
+// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of
+// keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`.
+// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
+// fancier error recovery to it, as there will be less overall work to do this way.
+const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
+    (' ', "Space", Some(token::Whitespace)),
+    ('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
+    ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
+    (',', "Comma", Some(token::Comma)),
+    (';', "Semicolon", Some(token::Semi)),
+    (':', "Colon", Some(token::Colon)),
+    ('!', "Exclamation Mark", Some(token::Not)),
+    ('?', "Question Mark", Some(token::Question)),
+    ('.', "Period", Some(token::Dot)),
+    ('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
+    (')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
+    ('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
+    (']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
+    ('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
+    ('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
+    ('*', "Asterisk", Some(token::BinOp(token::Star))),
+    ('/', "Slash", Some(token::BinOp(token::Slash))),
+    ('\\', "Backslash", None),
+    ('&', "Ampersand", Some(token::BinOp(token::And))),
+    ('+', "Plus Sign", Some(token::BinOp(token::Plus))),
+    ('<', "Less-Than Sign", Some(token::Lt)),
+    ('=', "Equals Sign", Some(token::Eq)),
+    ('>', "Greater-Than Sign", Some(token::Gt)),
+    // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
+    // spitting the correct token out.
+    ('\'', "Single Quote", None),
+    ('"', "Quotation Mark", None),
 ];
 
 crate fn check_for_substitution<'a>(
     reader: &StringReader<'a>,
     pos: BytePos,
     ch: char,
     err: &mut DiagnosticBuilder<'a>,
-) -> bool {
+) -> Option<token::TokenKind> {
     let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
         Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
-        None => return false,
+        None => return None,
     };
 
     let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
 
-    let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
-        Some((_ascii_char, ascii_name)) => ascii_name,
+    let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
+        Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
         None => {
             let msg = format!("substitution character not found for '{}'", ch);
             reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
-            return false;
+            return None;
         }
     };
 
@@ -371,7 +378,7 @@ crate fn check_for_substitution<'a>(
         );
         err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
     }
-    true
+    token.clone()
 }
 
 /// Extract string if found at current position with given delimiters

diff --git a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.rs
@@ -1,5 +1,6 @@
 const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
 //~^ ERROR expected at least one digit in exponent
 //~| ERROR unknown start of token: \u{2212}
+//~| ERROR cannot subtract `{integer}` from `{float}`
 
 fn main() {}
diff --git a/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr b/src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr
@@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
 LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
    |                                                     ^
 
-error: aborting due to 2 previous errors
+error[E0277]: cannot subtract `{integer}` from `{float}`
+  --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
+   |
+LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
+   |                                                     ^ no implementation for `{float} - {integer}`
+   |
+   = help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
+
+error: aborting due to 3 previous errors
 
+For more information about this error, try `rustc --explain E0277`.
diff --git a/src/test/ui/parser/recover-from-homoglyph.rs b/src/test/ui/parser/recover-from-homoglyph.rs
@@ -0,0 +1,4 @@
+fn main() {
+    println!(""); //~ ERROR unknown start of token: \u{37e}
+    let x: usize = (); //~ ERROR mismatched types
+}
diff --git a/src/test/ui/parser/recover-from-homoglyph.stderr b/src/test/ui/parser/recover-from-homoglyph.stderr
@@ -0,0 +1,22 @@
+error: unknown start of token: \u{37e}
+  --> $DIR/recover-from-homoglyph.rs:2:17
+   |
+LL |     println!("");
+   |                 ^
+help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
+   |
+LL |     println!("");
+   |                 ^
+
+error[E0308]: mismatched types
+  --> $DIR/recover-from-homoglyph.rs:3:20
+   |
+LL |     let x: usize = ();
+   |                    ^^ expected usize, found ()
+   |
+   = note: expected type `usize`
+              found type `()`
+
+error: aborting due to 2 previous errors
+
+For more information about this error, try `rustc --explain E0308`.