From ae031ef8e87e5456e9c3812b488f25c18a0fa35e Mon Sep 17 00:00:00 2001
From: Zalathar <Zalathar@users.noreply.github.com>
Date: Fri, 15 Dec 2023 14:12:46 +1100
Subject: [PATCH] coverage: `llvm-cov` expects column numbers to be bytes, not
 code points

---
 .../rustc_mir_transform/src/coverage/mod.rs   | 59 +++++++++++++++----
 compiler/rustc_mir_transform/src/lib.rs       |  1 +
 tests/coverage/unicode.cov-map                | 18 +++---
 tests/coverage/unicode.coverage               |  4 +-
 4 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/compiler/rustc_mir_transform/src/coverage/mod.rs b/compiler/rustc_mir_transform/src/coverage/mod.rs
index 5f361fa09465b..0854f6d677223 100644
--- a/compiler/rustc_mir_transform/src/coverage/mod.rs
+++ b/compiler/rustc_mir_transform/src/coverage/mod.rs
@@ -23,7 +23,7 @@ use rustc_middle::mir::{
 use rustc_middle::ty::TyCtxt;
 use rustc_span::def_id::LocalDefId;
 use rustc_span::source_map::SourceMap;
-use rustc_span::{ExpnKind, Span, Symbol};
+use rustc_span::{BytePos, ExpnKind, Pos, RelativeBytePos, Span, Symbol};
 
 /// Inserts `StatementKind::Coverage` statements that either instrument the binary with injected
 /// counters, via intrinsic `llvm.instrprof.increment`, and/or inject metadata used during codegen
@@ -258,6 +258,10 @@ fn inject_statement(mir_body: &mut mir::Body<'_>, counter_kind: CoverageKind, bb
 
 /// Convert the Span into its file name, start line and column, and end line and column.
 ///
+/// Line numbers and column numbers are 1-based. Unlike most column numbers emitted by
+/// the compiler, these column numbers are measured in **bytes**, because that's what
+/// LLVM's `llvm-cov` tool expects to see in coverage maps.
+///
 /// Returns `None` if the conversion failed for some reason. There is no known example
 /// of code that would cause this to happen, but it's hard to rule out entirely
 /// (especially in the presence of complex macros or other expansions), and if it does
@@ -276,20 +280,49 @@ fn make_code_region(
         source_map.span_to_diagnostic_string(body_span)
     );
 
-    let (file, mut start_line, mut start_col, mut end_line, mut end_col) =
-        source_map.span_to_location_info(span);
-    if span.hi() == span.lo() {
-        // Extend an empty span by one character so the region will be counted.
-        if span.hi() == body_span.hi() {
-            start_col = start_col.saturating_sub(1);
-        } else {
-            end_col = start_col + 1;
-        }
+    let lo = span.lo();
+    let hi = span.hi();
+
+    let file = source_map.lookup_source_file(lo);
+
+    // Column numbers need to be in bytes, so we can't use the more convenient
+    // `SourceMap` methods for looking up file coordinates.
+    let rpos_and_line_and_byte_column = |pos: BytePos| -> Option<(RelativeBytePos, usize, usize)> {
+        let rpos = file.relative_position(pos);
+        let line_index = file.lookup_line(rpos)?;
+        let line_start = file.lines()[line_index];
+        // Line numbers and column numbers are 1-based, so add 1 to each.
+        Some((rpos, line_index + 1, (rpos - line_start).to_usize() + 1))
     };
-    if let Some(file) = file {
-        start_line = source_map.doctest_offset_line(&file.name, start_line);
-        end_line = source_map.doctest_offset_line(&file.name, end_line);
+
+    let (lo_rpos, mut start_line, mut start_col) = rpos_and_line_and_byte_column(lo)?;
+    let (hi_rpos, mut end_line, mut end_col) = rpos_and_line_and_byte_column(hi)?;
+
+    // If the span is empty, try to expand it by one character so that it is
+    // more visible in `llvm-cov` reports. (LLVM measures columns in bytes,
+    // so "one character" might be multiple bytes.)
+    if lo == hi
+        && let Some(src) = &file.src
+    {
+        // Prefer to expand the end position, if it won't go outside the body span.
+        if hi < body_span.hi() {
+            let hi_rpos = hi_rpos.to_usize();
+            let nudge_bytes = src.ceil_char_boundary(hi_rpos + 1) - hi_rpos;
+            end_col += nudge_bytes;
+        } else if lo > body_span.lo() {
+            let lo_rpos = lo_rpos.to_usize();
+            let nudge_bytes = lo_rpos - src.floor_char_boundary(lo_rpos - 1);
+            // Subtract the nudge, but don't go below column 1.
+            start_col = start_col.saturating_sub(nudge_bytes).max(1);
+        }
+        // If neither nudge could be applied, stick with the empty span coordinates.
     }
+
+    // Apply an offset so that code in doctests has correct line numbers.
+    // FIXME(#79417): Currently we have no way to offset doctest _columns_.
+    start_line = source_map.doctest_offset_line(&file.name, start_line);
+    end_line = source_map.doctest_offset_line(&file.name, end_line);
+
     Some(CodeRegion {
         file_name,
         start_line: start_line as u32,
diff --git a/compiler/rustc_mir_transform/src/lib.rs b/compiler/rustc_mir_transform/src/lib.rs
index 89e897191e852..654d5dcc23f99 100644
--- a/compiler/rustc_mir_transform/src/lib.rs
+++ b/compiler/rustc_mir_transform/src/lib.rs
@@ -10,6 +10,7 @@
 #![feature(min_specialization)]
 #![feature(never_type)]
 #![feature(option_get_or_insert_default)]
+#![feature(round_char_boundary)]
 #![feature(trusted_step)]
 #![feature(try_blocks)]
 #![feature(yeet_expr)]
diff --git a/tests/coverage/unicode.cov-map b/tests/coverage/unicode.cov-map
index 241ef2d44d0c7..0cbc066de14ac 100644
--- a/tests/coverage/unicode.cov-map
+++ b/tests/coverage/unicode.cov-map
@@ -1,5 +1,5 @@
 Function name: unicode::main
-Raw bytes (67): 0x[01, 01, 09, 01, 05, 03, 05, 1e, 0d, 22, 09, 03, 05, 11, 1b, 1e, 0d, 22, 09, 03, 05, 09, 01, 0c, 01, 00, 0b, 05, 01, 09, 00, 0b, 03, 00, 0f, 00, 18, 05, 00, 19, 00, 24, 22, 02, 08, 00, 13, 09, 00, 17, 00, 22, 11, 00, 23, 02, 06, 1b, 02, 06, 00, 07, 17, 02, 05, 01, 02]
+Raw bytes (67): 0x[01, 01, 09, 01, 05, 03, 05, 1e, 0d, 22, 09, 03, 05, 11, 1b, 1e, 0d, 22, 09, 03, 05, 09, 01, 0c, 01, 00, 0b, 05, 01, 09, 00, 0c, 03, 00, 10, 00, 1b, 05, 00, 1c, 00, 28, 22, 02, 08, 00, 25, 09, 00, 29, 00, 46, 11, 00, 47, 02, 06, 1b, 02, 06, 00, 07, 17, 02, 05, 01, 02]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 9
@@ -14,26 +14,26 @@ Number of expressions: 9
 - expression 8 operands: lhs = Expression(0, Add), rhs = Counter(1)
 Number of file 0 mappings: 9
 - Code(Counter(0)) at (prev + 12, 1) to (start + 0, 11)
-- Code(Counter(1)) at (prev + 1, 9) to (start + 0, 11)
-- Code(Expression(0, Add)) at (prev + 0, 15) to (start + 0, 24)
+- Code(Counter(1)) at (prev + 1, 9) to (start + 0, 12)
+- Code(Expression(0, Add)) at (prev + 0, 16) to (start + 0, 27)
     = (c0 + c1)
-- Code(Counter(1)) at (prev + 0, 25) to (start + 0, 36)
-- Code(Expression(8, Sub)) at (prev + 2, 8) to (start + 0, 19)
+- Code(Counter(1)) at (prev + 0, 28) to (start + 0, 40)
+- Code(Expression(8, Sub)) at (prev + 2, 8) to (start + 0, 37)
     = ((c0 + c1) - c1)
-- Code(Counter(2)) at (prev + 0, 23) to (start + 0, 34)
-- Code(Counter(4)) at (prev + 0, 35) to (start + 2, 6)
+- Code(Counter(2)) at (prev + 0, 41) to (start + 0, 70)
+- Code(Counter(4)) at (prev + 0, 71) to (start + 2, 6)
 - Code(Expression(6, Add)) at (prev + 2, 6) to (start + 0, 7)
     = ((((c0 + c1) - c1) - c2) + c3)
 - Code(Expression(5, Add)) at (prev + 2, 5) to (start + 1, 2)
     = (c4 + ((((c0 + c1) - c1) - c2) + c3))
 
 Function name: unicode::サビ
-Raw bytes (9): 0x[01, 01, 00, 01, 01, 1c, 12, 00, 14]
+Raw bytes (9): 0x[01, 01, 00, 01, 01, 1c, 14, 00, 18]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 0
 Number of file 0 mappings: 1
-- Code(Counter(0)) at (prev + 28, 18) to (start + 0, 20)
+- Code(Counter(0)) at (prev + 28, 20) to (start + 0, 24)
 
 Function name: unicode::申し訳ございません
 Raw bytes (9): 0x[01, 01, 00, 01, 01, 16, 01, 02, 02]
diff --git a/tests/coverage/unicode.coverage b/tests/coverage/unicode.coverage
index 7d7d4889bcc23..edc7114f79450 100644
--- a/tests/coverage/unicode.coverage
+++ b/tests/coverage/unicode.coverage
@@ -11,10 +11,10 @@
    LL|       |
    LL|      1|fn main() {
    LL|     33|    for _İ in 'А'..='Я' { /* Я */ }
-                      ^32             ^32
+                      ^32                ^32
    LL|       |
    LL|      1|    if 申し訳ございません() && 申し訳ございません() {
-                                    ^0
+                                                      ^0
    LL|      0|        println!("true");
    LL|      1|    }
    LL|       |