diff --git a/compiler/rustc_lint/messages.ftl b/compiler/rustc_lint/messages.ftl index 785895e0ab823..c08ff0fef3c26 100644 --- a/compiler/rustc_lint/messages.ftl +++ b/compiler/rustc_lint/messages.ftl @@ -241,9 +241,28 @@ lint_hidden_unicode_codepoints = unicode codepoint changing visible direction of lint_identifier_non_ascii_char = identifier contains non-ASCII characters lint_identifier_uncommon_codepoints = identifier contains {$codepoints_len -> - [one] an uncommon Unicode codepoint - *[other] uncommon Unicode codepoints + [one] { $identifier_type -> + [Exclusion] a character from an archaic script + [Technical] a character that is for non-linguistic, specialized usage + [Limited_Use] a character from a script in limited use + [Not_NFKC] a non normalized (NFKC) character + *[other] an uncommon character + } + *[other] { $identifier_type -> + [Exclusion] {$codepoints_len} characters from archaic scripts + [Technical] {$codepoints_len} characters that are for non-linguistic, specialized usage + [Limited_Use] {$codepoints_len} characters from scripts in limited use + [Not_NFKC] {$codepoints_len} non normalized (NFKC) characters + *[other] uncommon characters + } }: {$codepoints} + .note = {$codepoints_len -> + [one] this character is + *[other] these characters are + } included in the{$identifier_type -> + [Restricted] {""} + *[other] {" "}{$identifier_type} + } Unicode general security profile lint_ignored_unless_crate_specified = {$level}({$name}) is ignored unless specified at crate level diff --git a/compiler/rustc_lint/src/lib.rs b/compiler/rustc_lint/src/lib.rs index 85f9d3bd63ec7..6c4e717faa6c4 100644 --- a/compiler/rustc_lint/src/lib.rs +++ b/compiler/rustc_lint/src/lib.rs @@ -31,6 +31,7 @@ #![feature(array_windows)] #![feature(box_patterns)] #![feature(control_flow_enum)] +#![feature(extract_if)] #![feature(generic_nonzero)] #![feature(if_let_guard)] #![feature(iter_order_by)] diff --git a/compiler/rustc_lint/src/lints.rs 
b/compiler/rustc_lint/src/lints.rs index c204c67fc1f7c..70d30611e8fcf 100644 --- a/compiler/rustc_lint/src/lints.rs +++ b/compiler/rustc_lint/src/lints.rs @@ -1098,9 +1098,11 @@ pub struct IdentifierNonAsciiChar; #[derive(LintDiagnostic)] #[diag(lint_identifier_uncommon_codepoints)] +#[note] pub struct IdentifierUncommonCodepoints { pub codepoints: Vec<char>, pub codepoints_len: usize, + pub identifier_type: &'static str, } #[derive(LintDiagnostic)] diff --git a/compiler/rustc_lint/src/non_ascii_idents.rs b/compiler/rustc_lint/src/non_ascii_idents.rs index e112cd6915c35..5e66ade035774 100644 --- a/compiler/rustc_lint/src/non_ascii_idents.rs +++ b/compiler/rustc_lint/src/non_ascii_idents.rs @@ -7,6 +7,7 @@ use rustc_ast as ast; use rustc_data_structures::fx::FxIndexMap; use rustc_data_structures::unord::UnordMap; use rustc_span::symbol::Symbol; +use unicode_security::general_security_profile::IdentifierType; declare_lint! { /// The `non_ascii_idents` lint detects non-ASCII identifiers. @@ -189,17 +190,47 @@ impl EarlyLintPass for NonAsciiIdents { if check_uncommon_codepoints && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed) { - let codepoints: Vec<_> = symbol_str + let mut chars: Vec<_> = symbol_str .chars() - .filter(|c| !GeneralSecurityProfile::identifier_allowed(*c)) + .map(|c| (c, GeneralSecurityProfile::identifier_type(c))) .collect(); - let codepoints_len = codepoints.len(); - cx.emit_span_lint( - UNCOMMON_CODEPOINTS, - sp, - IdentifierUncommonCodepoints { codepoints, codepoints_len }, - ); + for (id_ty, id_ty_descr) in [ + (IdentifierType::Exclusion, "Exclusion"), + (IdentifierType::Technical, "Technical"), + (IdentifierType::Limited_Use, "Limited_Use"), + (IdentifierType::Not_NFKC, "Not_NFKC"), + ] { + let codepoints: Vec<_> = + chars.extract_if(|(_, ty)| *ty == Some(id_ty)).collect(); + if codepoints.is_empty() { + continue; + } + cx.emit_span_lint( + UNCOMMON_CODEPOINTS, + sp, + IdentifierUncommonCodepoints { + codepoints_len: 
codepoints.len(), + codepoints: codepoints.into_iter().map(|(c, _)| c).collect(), + identifier_type: id_ty_descr, + }, + ); + } + + let remaining = chars + .extract_if(|(c, _)| !GeneralSecurityProfile::identifier_allowed(*c)) + .collect::<Vec<_>>(); + if !remaining.is_empty() { + cx.emit_span_lint( + UNCOMMON_CODEPOINTS, + sp, + IdentifierUncommonCodepoints { + codepoints_len: remaining.len(), + codepoints: remaining.into_iter().map(|(c, _)| c).collect(), + identifier_type: "Restricted", + }, + ); + } } } diff --git a/tests/ui/lexer/lex-emoji-identifiers.rs b/tests/ui/lexer/lex-emoji-identifiers.rs index bbc088521b7bd..4fcd102018beb 100644 --- a/tests/ui/lexer/lex-emoji-identifiers.rs +++ b/tests/ui/lexer/lex-emoji-identifiers.rs @@ -4,7 +4,7 @@ fn invalid_emoji_usages() { let wireless🛜 = "basic emoji"; //~ ERROR: identifiers cannot contain emoji // FIXME let key1️⃣ = "keycap sequence"; //~ ERROR: unknown start of token - //~^ WARN: identifier contains an uncommon Unicode codepoint + //~^ WARN: identifier contains an uncommon character: '\u{fe0f}' let flag🇺🇳 = "flag sequence"; //~ ERROR: identifiers cannot contain emoji let wales🏴󠁧󠁢󠁷󠁬󠁳󠁿 = "tag sequence"; //~ ERROR: identifiers cannot contain emoji let folded🙏🏿 = "modifier sequence"; //~ ERROR: identifiers cannot contain emoji diff --git a/tests/ui/lexer/lex-emoji-identifiers.stderr b/tests/ui/lexer/lex-emoji-identifiers.stderr index 679b7422bc150..8e2daa6d1d38f 100644 --- a/tests/ui/lexer/lex-emoji-identifiers.stderr +++ b/tests/ui/lexer/lex-emoji-identifiers.stderr @@ -40,12 +40,13 @@ error: identifiers cannot contain emoji: `folded🙏🏿` LL | let folded🙏🏿 = "modifier sequence"; | ^^^^^^^^^^ -warning: identifier contains an uncommon Unicode codepoint: '\u{fe0f}' +warning: identifier contains an uncommon character: '\u{fe0f}' --> $DIR/lex-emoji-identifiers.rs:6:9 | LL | let key1️⃣ = "keycap sequence"; | ^^^^ | + = note: this character is included in the Unicode general security profile = note: `#[warn(uncommon_codepoints)]` on by 
default error: aborting due to 7 previous errors; 1 warning emitted diff --git a/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs b/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs index c3459930a94c0..a51452f069590 100644 --- a/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs +++ b/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.rs @@ -1,12 +1,13 @@ #![deny(uncommon_codepoints)] -const µ: f64 = 0.000001; //~ ERROR identifier contains an uncommon Unicode codepoint +const µ: f64 = 0.000001; //~ ERROR identifier contains a non normalized (NFKC) character: 'µ' //~| WARNING should have an upper case name -fn dĳkstra() {} //~ ERROR identifier contains an uncommon Unicode codepoint +fn dĳkstra() {} +//~^ ERROR identifier contains a non normalized (NFKC) character: 'ĳ' fn main() { - let ㇻㇲㇳ = "rust"; //~ ERROR identifier contains uncommon Unicode codepoints + let ㇻㇲㇳ = "rust"; //~ ERROR identifier contains uncommon characters: 'ㇻ', 'ㇲ', and 'ㇳ' // using the same identifier the second time won't trigger the lint. 
println!("{}", ㇻㇲㇳ); diff --git a/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr b/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr index bae5ac654d354..000545a060075 100644 --- a/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr +++ b/tests/ui/lint/rfc-2457-non-ascii-idents/lint-uncommon-codepoints.stderr @@ -1,26 +1,31 @@ -error: identifier contains an uncommon Unicode codepoint: 'µ' +error: identifier contains a non normalized (NFKC) character: 'µ' --> $DIR/lint-uncommon-codepoints.rs:3:7 | LL | const µ: f64 = 0.000001; | ^ | + = note: this character is included in the Not_NFKC Unicode general security profile note: the lint level is defined here --> $DIR/lint-uncommon-codepoints.rs:1:9 | LL | #![deny(uncommon_codepoints)] | ^^^^^^^^^^^^^^^^^^^ -error: identifier contains an uncommon Unicode codepoint: 'ĳ' +error: identifier contains a non normalized (NFKC) character: 'ĳ' --> $DIR/lint-uncommon-codepoints.rs:6:4 | LL | fn dĳkstra() {} | ^^^^^^^ + | + = note: this character is included in the Not_NFKC Unicode general security profile -error: identifier contains uncommon Unicode codepoints: 'ㇻ', 'ㇲ', and 'ㇳ' - --> $DIR/lint-uncommon-codepoints.rs:9:9 +error: identifier contains uncommon characters: 'ㇻ', 'ㇲ', and 'ㇳ' + --> $DIR/lint-uncommon-codepoints.rs:10:9 | LL | let ㇻㇲㇳ = "rust"; | ^^^^^^ + | + = note: these characters are included in the Unicode general security profile warning: constant `µ` should have an upper case name --> $DIR/lint-uncommon-codepoints.rs:3:7