rust-lang · oli-obk · Jul 25, 2024
diff --git a/Cargo.lock b/Cargo.lock
@@ -3882,6 +3882,7 @@ dependencies = [
  "termcolor",
  "termize",
  "tracing",
+ "unicode-width",
  "windows",
 ]
 

diff --git a/compiler/rustc_errors/Cargo.toml b/compiler/rustc_errors/Cargo.toml
@@ -26,6 +26,7 @@ serde_json = "1.0.59"
 termcolor = "1.2.0"
 termize = "0.1.1"
 tracing = "0.1"
+unicode-width = "0.1.4"
 # tidy-alphabetical-end
 
 [target.'cfg(windows)'.dependencies.windows]

diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs
@@ -8,7 +8,7 @@
 //! The output types are defined in `rustc_session::config::ErrorOutputType`.
 
 use rustc_span::source_map::SourceMap;
-use rustc_span::{char_width, FileLines, FileName, SourceFile, Span};
+use rustc_span::{FileLines, FileName, SourceFile, Span};
 
 use crate::snippet::{
     Annotation, AnnotationColumn, AnnotationType, Line, MultilineAnnotation, Style, StyledString,
@@ -677,7 +677,10 @@ impl HumanEmitter {
             .skip(left)
             .take_while(|ch| {
                 // Make sure that the trimming on the right will fall within the terminal width.
-                let next = char_width(*ch);
+                // FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
+                // is. For now, just accept that sometimes the code line will be longer than
+                // desired.
+                let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
                 if taken + next > right - left {
                     return false;
                 }
@@ -739,7 +742,11 @@ impl HumanEmitter {
         let left = margin.left(source_string.len());
 
         // Account for unicode characters of width !=0 that were removed.
-        let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();
+        let left = source_string
+            .chars()
+            .take(left)
+            .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
+            .sum();
 
         self.draw_line(
             buffer,
@@ -2032,7 +2039,7 @@ impl HumanEmitter {
                     let sub_len: usize =
                         if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
                             .chars()
-                            .map(|ch| char_width(ch))
+                            .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
                             .sum();
 
                     let offset: isize = offsets
@@ -2069,8 +2076,11 @@ impl HumanEmitter {
                     }
 
                     // length of the code after substitution
-                    let full_sub_len =
-                        part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;
+                    let full_sub_len = part
+                        .snippet
+                        .chars()
+                        .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
+                        .sum::<usize>() as isize;
 
                     // length of the code to be substituted
                     let snippet_len = span_end_pos as isize - span_start_pos as isize;
@@ -2558,53 +2568,18 @@ fn num_decimal_digits(num: usize) -> usize {
 }
 
 // We replace some characters so the CLI output is always consistent and underlines aligned.
-// Keep the following list in sync with `rustc_span::char_width`.
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
-    ('\t', "    "),    // We do our own tab replacement
+    ('\t', "    "),   // We do our own tab replacement
     ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
-    ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
-    ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
-    ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
-    ('\u{202E}', "�"),
-    ('\u{2066}', "�"),
-    ('\u{2067}', "�"),
-    ('\u{2068}', "�"),
-    ('\u{202C}', "�"),
-    ('\u{2069}', "�"),
-    // In terminals without Unicode support the following will be garbled, but in *all* terminals
-    // the underlying codepoint will be as well. We could gate this replacement behind a "unicode
-    // support" gate.
-    ('\u{0000}', "␀"),
-    ('\u{0001}', "␁"),
-    ('\u{0002}', "␂"),
-    ('\u{0003}', "␃"),
-    ('\u{0004}', "␄"),
-    ('\u{0005}', "␅"),
-    ('\u{0006}', "␆"),
-    ('\u{0007}', "␇"),
-    ('\u{0008}', "␈"),
-    ('\u{000B}', "␋"),
-    ('\u{000C}', "␌"),
-    ('\u{000D}', "␍"),
-    ('\u{000E}', "␎"),
-    ('\u{000F}', "␏"),
-    ('\u{0010}', "␐"),
-    ('\u{0011}', "␑"),
-    ('\u{0012}', "␒"),
-    ('\u{0013}', "␓"),
-    ('\u{0014}', "␔"),
-    ('\u{0015}', "␕"),
-    ('\u{0016}', "␖"),
-    ('\u{0017}', "␗"),
-    ('\u{0018}', "␘"),
-    ('\u{0019}', "␙"),
-    ('\u{001A}', "␚"),
-    ('\u{001B}', "␛"),
-    ('\u{001C}', "␜"),
-    ('\u{001D}', "␝"),
-    ('\u{001E}', "␞"),
-    ('\u{001F}', "␟"),
-    ('\u{007F}', "␡"),
+    ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
+    ('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
+    ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
+    ('\u{202E}', ""),
+    ('\u{2066}', ""),
+    ('\u{2067}', ""),
+    ('\u{2068}', ""),
+    ('\u{202C}', ""),
+    ('\u{2069}', ""),
 ];
 
 fn normalize_whitespace(str: &str) -> String {

diff --git a/compiler/rustc_metadata/src/rmeta/decoder.rs b/compiler/rustc_metadata/src/rmeta/decoder.rs
@@ -1728,6 +1728,7 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
                     source_len,
                     lines,
                     multibyte_chars,
+                    non_narrow_chars,
                     normalized_pos,
                     stable_id,
                     ..
@@ -1779,6 +1780,7 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
                     self.cnum,
                     lines,
                     multibyte_chars,
+                    non_narrow_chars,
                     normalized_pos,
                     source_file_index,
                 );

diff --git a/compiler/rustc_query_system/src/ich/impls_syntax.rs b/compiler/rustc_query_system/src/ich/impls_syntax.rs
@@ -73,6 +73,7 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
             source_len: _,
             lines: _,
             ref multibyte_chars,
+            ref non_narrow_chars,
             ref normalized_pos,
         } = *self;
 
@@ -97,6 +98,11 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
             char_pos.hash_stable(hcx, hasher);
         }
 
+        non_narrow_chars.len().hash_stable(hcx, hasher);
+        for &char_pos in non_narrow_chars.iter() {
+            char_pos.hash_stable(hcx, hasher);
+        }
+
         normalized_pos.len().hash_stable(hcx, hasher);
         for &char_pos in normalized_pos.iter() {
             char_pos.hash_stable(hcx, hasher);

diff --git a/compiler/rustc_span/src/analyze_source_file.rs b/compiler/rustc_span/src/analyze_source_file.rs
@@ -1,4 +1,5 @@
 use super::*;
+use unicode_width::UnicodeWidthChar;
 
 #[cfg(test)]
 mod tests;
@@ -8,12 +9,15 @@ mod tests;
 ///
 /// This function will use an SSE2 enhanced implementation if hardware support
 /// is detected at runtime.
-pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
+pub fn analyze_source_file(
+    src: &str,
+) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
     let mut lines = vec![RelativeBytePos::from_u32(0)];
     let mut multi_byte_chars = vec![];
+    let mut non_narrow_chars = vec![];
 
     // Calls the right implementation, depending on hardware support available.
-    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
+    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars);
 
     // The code above optimistically registers a new line *after* each \n
     // it encounters. If that point is already outside the source_file, remove
@@ -26,7 +30,7 @@ pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteCha
         }
     }
 
-    (lines, multi_byte_chars)
+    (lines, multi_byte_chars, non_narrow_chars)
 }
 
 cfg_match! {
@@ -35,10 +39,11 @@ cfg_match! {
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
+            non_narrow_chars: &mut Vec<NonNarrowChar>,
         ) {
             if is_x86_feature_detected!("sse2") {
                 unsafe {
-                    analyze_source_file_sse2(src, lines, multi_byte_chars);
+                    analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars);
                 }
             } else {
                 analyze_source_file_generic(
@@ -47,6 +52,7 @@ cfg_match! {
                     RelativeBytePos::from_u32(0),
                     lines,
                     multi_byte_chars,
+                    non_narrow_chars,
                 );
             }
         }
@@ -60,6 +66,7 @@ cfg_match! {
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
+            non_narrow_chars: &mut Vec<NonNarrowChar>,
         ) {
             #[cfg(target_arch = "x86")]
             use std::arch::x86::*;
@@ -152,6 +159,7 @@ cfg_match! {
                     RelativeBytePos::from_usize(scan_start),
                     lines,
                     multi_byte_chars,
+                    non_narrow_chars,
                 );
             }
 
@@ -164,6 +172,7 @@ cfg_match! {
                     RelativeBytePos::from_usize(tail_start),
                     lines,
                     multi_byte_chars,
+                    non_narrow_chars,
                 );
             }
         }
@@ -174,13 +183,15 @@ cfg_match! {
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
+            non_narrow_chars: &mut Vec<NonNarrowChar>,
         ) {
             analyze_source_file_generic(
                 src,
                 src.len(),
                 RelativeBytePos::from_u32(0),
                 lines,
                 multi_byte_chars,
+                non_narrow_chars,
             );
         }
     }
@@ -194,6 +205,7 @@ fn analyze_source_file_generic(
     output_offset: RelativeBytePos,
     lines: &mut Vec<RelativeBytePos>,
     multi_byte_chars: &mut Vec<MultiByteChar>,
+    non_narrow_chars: &mut Vec<NonNarrowChar>,
 ) -> usize {
     assert!(src.len() >= scan_len);
     let mut i = 0;
@@ -215,8 +227,16 @@ fn analyze_source_file_generic(
 
             let pos = RelativeBytePos::from_usize(i) + output_offset;
 
-            if let b'\n' = byte {
-                lines.push(pos + RelativeBytePos(1));
+            match byte {
+                b'\n' => {
+                    lines.push(pos + RelativeBytePos(1));
+                }
+                b'\t' => {
+                    non_narrow_chars.push(NonNarrowChar::Tab(pos));
+                }
+                _ => {
+                    non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
+                }
             }
         } else if byte >= 127 {
             // The slow path:
@@ -232,6 +252,14 @@ fn analyze_source_file_generic(
                 let mbc = MultiByteChar { pos, bytes: char_len as u8 };
                 multi_byte_chars.push(mbc);
             }
+
+            // Assume control characters are zero width.
+            // FIXME: How can we decide between `width` and `width_cjk`?
+            let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
+
+            if char_width != 1 {
+                non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
+            }
         }
 
         i += char_len;