[WIP] Overhaul lcov reports #1847

Closed · wants to merge 3 commits
2 changes: 2 additions & 0 deletions coverage/config.py
@@ -250,6 +250,7 @@ def __init__(self) -> None:

# Defaults for [lcov]
self.lcov_output = "coverage.lcov"
self.lcov_checksums = "off"

# Defaults for [paths]
self.paths: dict[str, list[str]] = {}
@@ -428,6 +429,7 @@ def copy(self) -> CoverageConfig:

# [lcov]
("lcov_output", "lcov:output"),
("lcov_checksums", "lcov:checksums")
]

def _set_attr_from_config_option(
168 changes: 97 additions & 71 deletions coverage/lcovreport.py
@@ -22,22 +22,43 @@

def line_hash(line: str) -> str:
"""Produce a hash of a source line for use in the LCOV file."""
# The LCOV file format requires MD5 as a fingerprint of the file. This is
# not a security use. Some security scanners raise alarms about the use of
# MD5 here, but it is a false positive. This is not a security concern.
# The LCOV file format optionally allows each line to be MD5ed as a
# fingerprint of the file. This is not a security use. Some security
# scanners raise alarms about the use of MD5 here, but it is a false
# positive. This is not a security concern.
# The unusual encoding of the MD5 hash, as a base64 sequence with the
# trailing = signs stripped, is specified by the LCOV file format.
hashed = hashlib.md5(line.encode("utf-8")).digest()
return base64.b64encode(hashed).decode("ascii").rstrip("=")


def file_hash(file: str) -> str:
"""Produce a hash of an entire source file for use in the LCOV file."""
# The LCOV file format optionally allows each entire file to be
# fingerprinted, using a hash algorithm and format of the generator's
# choice. We use sha256 (unlike line hashes), with the result written out
# in base64 with trailing = signs stripped (like line hashes). See the
# documentation of the 'checksums' option for how to tell the LCOV tools
# to check these hashes.
hashed = hashlib.sha256(file.encode("utf-8")).digest()
return base64.b64encode(hashed).decode("ascii").rstrip("=")
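
# A quick sanity check of the two encodings above, using the well-known
# digests of the empty string (base64 "=" padding stripped in both cases):
#   line_hash("") == "1B2M2Y8AsgTpgAmY7PhCfg"
#   file_hash("") == "47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU"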


class LcovReporter:
"""A reporter for writing LCOV coverage reports."""

report_type = "LCOV report"

def __init__(self, coverage: Coverage) -> None:
self.coverage = coverage
self.config = coverage.config
self.checksum_mode = self.config.lcov_checksums.lower().strip()
self.total = Numbers(self.coverage.config.precision)

if self.checksum_mode not in ("file", "line", "off"):
raise ValueError(f"invalid configuration, checksums = {self.checksum_mode!r}"
" not understood")

def report(self, morfs: Iterable[TMorf] | None, outfile: IO[str]) -> float:
"""Renders the full lcov report.

@@ -49,85 +70,90 @@ def report(self, morfs: Iterable[TMorf] | None, outfile: IO[str]) -> float:
self.coverage.get_data()
outfile = outfile or sys.stdout

for fr, analysis in get_analysis_to_report(self.coverage, morfs):
self.get_lcov(fr, analysis, outfile)
# ensure file records are sorted by the _relative_ filename, not the full path
to_report = [(fr.relative_filename(), fr, analysis)
for fr, analysis in get_analysis_to_report(self.coverage, morfs)]
to_report.sort()

for fname, fr, analysis in to_report:
self.total += analysis.numbers
self.lcov_file(fname, fr, analysis, outfile)

return self.total.n_statements and self.total.pc_covered

def get_lcov(self, fr: FileReporter, analysis: Analysis, outfile: IO[str]) -> None:
def lcov_file(self, rel_fname: str,
fr: FileReporter, analysis: Analysis,
outfile: IO[str]) -> None:
"""Produces the lcov data for a single file.

This currently supports line and branch coverage; function
coverage is not supported.
"""
self.total += analysis.numbers

outfile.write("TN:\n")
outfile.write(f"SF:{fr.relative_filename()}\n")
source_lines = fr.source().splitlines()
for covered in sorted(analysis.executed):
if covered in analysis.excluded:
# Do not report excluded as executed
continue
# Note: Coverage.py currently only supports checking *if* a line
# has been executed, not how many times, so we set this to 1 for
# nice output even if it's technically incorrect.

# The lines below calculate a 64-bit encoded md5 hash of the line
# corresponding to the DA lines in the lcov file, for either case
# of the line being covered or missed in coverage.py. The final two
# characters of the encoding ("==") are removed from the hash to
# allow genhtml to run on the resulting lcov file.
if source_lines:
if covered-1 >= len(source_lines):
break
line = source_lines[covered-1]
else:
line = ""
outfile.write(f"DA:{covered},1,{line_hash(line)}\n")

for missed in sorted(analysis.missing):
# We don't have to skip excluded lines here, because `missing`
# already doesn't have them.
assert source_lines
line = source_lines[missed-1]
outfile.write(f"DA:{missed},0,{line_hash(line)}\n")

outfile.write(f"LF:{analysis.numbers.n_statements}\n")
outfile.write(f"LH:{analysis.numbers.n_executed}\n")

# More information dense branch coverage data.
missing_arcs = analysis.missing_branch_arcs()
executed_arcs = analysis.executed_branch_arcs()
for block_number, block_line_number in enumerate(
sorted(analysis.branch_stats().keys()),
):
for branch_number, line_number in enumerate(
sorted(missing_arcs[block_line_number]),
):
# The exit branches have a negative line number,
# this will not produce valid lcov. Setting
# the line number of the exit branch to 0 will allow
# for valid lcov, while preserving the data.
line_number = max(line_number, 0)
outfile.write(f"BRDA:{line_number},{block_number},{branch_number},-\n")

# The start value below allows for the block number to be
# preserved between these two for loops (stopping the loop from
# resetting the value of the block number to 0).
for branch_number, line_number in enumerate(
sorted(executed_arcs[block_line_number]),
start=len(missing_arcs[block_line_number]),
):
line_number = max(line_number, 0)
outfile.write(f"BRDA:{line_number},{block_number},{branch_number},1\n")

# Summary of the branch coverage.
if analysis.numbers.n_statements == 0:
if self.config.skip_empty:
return

outfile.write(f"SF:{rel_fname}\n")

source_lines = None
if self.checksum_mode == "line":
source_lines = fr.source().splitlines()
elif self.checksum_mode == "file":
outfile.write(f"VER:{file_hash(fr.source())}\n")

# Emit a DA: record for each line of the file.
lines = sorted(analysis.statements)
hash_suffix = ""
for line in lines:
if self.checksum_mode == "line":
hash_suffix = "," + line_hash(source_lines[line-1])
# Q: can we get info about the number of times a statement is
# executed? If so, that should be recorded here.
hit = int(line not in analysis.missing)
outfile.write(f"DA:{line},{hit}{hash_suffix}\n")

if analysis.numbers.n_statements > 0:
outfile.write(f"LF:{analysis.numbers.n_statements}\n")
outfile.write(f"LH:{analysis.numbers.n_executed}\n")

# More information dense branch coverage data, if available.
if analysis.has_arcs:
branch_stats = analysis.branch_stats()
executed_arcs = analysis.executed_branch_arcs()
missing_arcs = analysis.missing_branch_arcs()

for line in lines:
if line in branch_stats:
# In our data, exit branches have negative destination line numbers.
# The lcov tools will reject these - but the lcov tools consider the
# destinations of branches to be opaque tokens. Use the absolute
# value of the destination line number as the destination block
# number, and its sign as the destination branch number. This will
# ensure destinations are unique and stable, source line numbers are
# always positive, and destination block and branch numbers are always
# nonnegative, which are the properties we need.

# The data we have does not permit us to identify branches that were
# never *reached*, which is what "-" in the hit column means. Such
# branches aren't in either executed_arcs or missing_arcs - we don't
# even know they exist.

# Q: can we get counts of the number of times each arc was executed?
# branch_stats has "total" and "taken" counts but it doesn't have
# "taken" broken down by destination.
arcs = []
arcs.extend((abs(l), int(l <= 0), 1) for l in executed_arcs[line])
arcs.extend((abs(l), int(l <= 0), 0) for l in missing_arcs[line])
arcs.sort()

for block, branch, hit in arcs:
outfile.write(f"BRDA:{line},{block},{branch},{hit}\n")

# Summary of the branch coverage.
brf = sum(t for t, k in branch_stats.values())
brh = brf - sum(t - k for t, k in branch_stats.values())
outfile.write(f"BRF:{brf}\n")
outfile.write(f"BRH:{brh}\n")
if brf > 0:
outfile.write(f"BRF:{brf}\n")
outfile.write(f"BRH:{brh}\n")

outfile.write("end_of_record\n")
54 changes: 54 additions & 0 deletions doc/config.rst
@@ -890,3 +890,57 @@ Settings particular to LCOV reporting (see :ref:`cmd_lcov`).
.............

(string, default "coverage.lcov") Where to write the LCOV file.

[lcov] checksums
................

(one of "off", "file", or "line"; default "off") What kind of checksums to
write as part of the LCOV file. The default is to not write any checksums.
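
For example, to enable whole-file checksums from a ``.coveragerc``-style
configuration file::

    [lcov]
    output = coverage.lcov
    checksums = file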

"line" gives the historical behavior, in which an MD5 checksum is computed of
each line *containing code subject to analysis*, separately. Because the
checksums do not cover the entire file and do not verify the ordering of
lines, this mode provides only a weak assurance that the source code
available to an analysis tool (e.g. ``genhtml``) matches the code that was
used to generate the coverage data. It also produces larger reports
than either of the other two modes.

"file" computes an SHA-256 hash of each file, as a whole; this gives a
stronger assurance that the source code has not changed. To validate
the hashes emitted by "file" mode, use the following script as the
``--version-script`` when running ``genhtml`` etc.::

#! /usr/bin/env python3
import argparse
import base64
import hashlib
import sys

def main():
ap = argparse.ArgumentParser()
ap.add_argument("--compare", action="store_true")
ap.add_argument("source_file_name")
ap.add_argument("source_file_id", nargs="?", default=None)
ap.add_argument("info_file_id", nargs="?", default=None)
args = ap.parse_args()

if args.compare:
if args.source_file_id is None or args.info_file_id is None:
ap.error("--compare mode requires source_file_id and info_file_id")
sys.exit(0 if args.source_file_id == args.info_file_id else 1)
else:
if args.source_file_id is not None or args.info_file_id is not None:
ap.error("determine mode does not use source_file_id and info_file_id")
with sys.stdout as ofp, \
open(args.source_file_name, "rb") as ifp:
digest = hashlib.sha256(ifp.read()).digest()
file_id = base64.b64encode(digest).decode("ascii").rstrip("=")
ofp.write(file_id + "\n")
sys.exit(0)

main()
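
Assuming the script above is saved as ``sha256-version.py`` (a name chosen
here for illustration) and marked executable, it can be supplied to
``genhtml`` like so::

    genhtml --version-script ./sha256-version.py --output-directory htmlcov coverage.lcov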

Note that for either "file" or "line" mode to work correctly, all of your
source files must be encoded using UTF-8.

.. versionadded:: 7.6.2