[WIP] Overhaul lcov reports #1847

Closed · wants to merge 3 commits
2 changes: 2 additions & 0 deletions coverage/config.py
@@ -250,6 +250,7 @@ def __init__(self) -> None:

# Defaults for [lcov]
self.lcov_output = "coverage.lcov"
self.lcov_checksums = "off"

# Defaults for [paths]
self.paths: dict[str, list[str]] = {}
@@ -428,6 +429,7 @@ def copy(self) -> CoverageConfig:

# [lcov]
("lcov_output", "lcov:output"),
("lcov_checksums", "lcov:checksums")
]

def _set_attr_from_config_option(
168 changes: 97 additions & 71 deletions coverage/lcovreport.py
@@ -22,22 +22,43 @@

def line_hash(line: str) -> str:
"""Produce a hash of a source line for use in the LCOV file."""
# The LCOV file format requires MD5 as a fingerprint of the file. This is
# not a security use. Some security scanners raise alarms about the use of
# MD5 here, but it is a false positive. This is not a security concern.
# The LCOV file format optionally allows each line to be MD5ed as a
# fingerprint of the file. This is not a security use. Some security
# scanners raise alarms about the use of MD5 here, but it is a false
# positive. This is not a security concern.
# The unusual encoding of the MD5 hash, as a base64 sequence with the
# trailing = signs stripped, is specified by the LCOV file format.
hashed = hashlib.md5(line.encode("utf-8")).digest()
return base64.b64encode(hashed).decode("ascii").rstrip("=")


def file_hash(file: str) -> str:
"""Produce a hash of an entire source file for use in the LCOV file."""
# The LCOV file format optionally allows each entire file to be
# fingerprinted, using a hash algorithm and format of the generator's
# choice. We use sha256 (unlike line hashes), with the result written out
# in base64 with trailing = signs stripped (like line hashes). See the
# documentation of the 'checksums' option for how to tell the LCOV tools
# to check these hashes.
hashed = hashlib.sha256(file.encode("utf-8")).digest()
return base64.b64encode(hashed).decode("ascii").rstrip("=")
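
# A quick sanity check of the two encodings above, using the well-known
# digests of the empty string (base64 "=" padding stripped in both cases):
#   line_hash("") == "1B2M2Y8AsgTpgAmY7PhCfg"
#   file_hash("") == "47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU"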


class LcovReporter:
"""A reporter for writing LCOV coverage reports."""

report_type = "LCOV report"

def __init__(self, coverage: Coverage) -> None:
self.coverage = coverage
self.config = coverage.config
self.checksum_mode = self.config.lcov_checksums.lower().strip()
self.total = Numbers(self.coverage.config.precision)

if self.checksum_mode not in ("file", "line", "off"):
raise ValueError(f"invalid configuration, checksums = {self.checksum_mode!r}"
" not understood")

def report(self, morfs: Iterable[TMorf] | None, outfile: IO[str]) -> float:
"""Renders the full lcov report.

@@ -49,85 +70,90 @@ def report(self, morfs: Iterable[TMorf] | None, outfile: IO[str]) -> float:
self.coverage.get_data()
outfile = outfile or sys.stdout

for fr, analysis in get_analysis_to_report(self.coverage, morfs):
self.get_lcov(fr, analysis, outfile)
# ensure file records are sorted by the _relative_ filename, not the full path
to_report = [(fr.relative_filename(), fr, analysis)
for fr, analysis in get_analysis_to_report(self.coverage, morfs)]
to_report.sort()

for fname, fr, analysis in to_report:
self.total += analysis.numbers
self.lcov_file(fname, fr, analysis, outfile)

return self.total.n_statements and self.total.pc_covered

def get_lcov(self, fr: FileReporter, analysis: Analysis, outfile: IO[str]) -> None:
def lcov_file(self, rel_fname: str,
fr: FileReporter, analysis: Analysis,
outfile: IO[str]) -> None:
"""Produces the lcov data for a single file.

This currently supports line and branch coverage; function
coverage is not supported.
"""
self.total += analysis.numbers

outfile.write("TN:\n")
outfile.write(f"SF:{fr.relative_filename()}\n")
source_lines = fr.source().splitlines()
for covered in sorted(analysis.executed):
if covered in analysis.excluded:
# Do not report excluded as executed
continue
# Note: Coverage.py currently only supports checking *if* a line
# has been executed, not how many times, so we set this to 1 for
# nice output even if it's technically incorrect.

# The lines below calculate a 64-bit encoded md5 hash of the line
# corresponding to the DA lines in the lcov file, for either case
# of the line being covered or missed in coverage.py. The final two
# characters of the encoding ("==") are removed from the hash to
# allow genhtml to run on the resulting lcov file.
if source_lines:
if covered-1 >= len(source_lines):
break
line = source_lines[covered-1]
else:
line = ""
outfile.write(f"DA:{covered},1,{line_hash(line)}\n")

for missed in sorted(analysis.missing):
# We don't have to skip excluded lines here, because `missing`
# already doesn't have them.
assert source_lines
line = source_lines[missed-1]
outfile.write(f"DA:{missed},0,{line_hash(line)}\n")

outfile.write(f"LF:{analysis.numbers.n_statements}\n")
outfile.write(f"LH:{analysis.numbers.n_executed}\n")

# More information dense branch coverage data.
missing_arcs = analysis.missing_branch_arcs()
executed_arcs = analysis.executed_branch_arcs()
for block_number, block_line_number in enumerate(
sorted(analysis.branch_stats().keys()),
):
for branch_number, line_number in enumerate(
sorted(missing_arcs[block_line_number]),
):
# The exit branches have a negative line number,
# this will not produce valid lcov. Setting
# the line number of the exit branch to 0 will allow
# for valid lcov, while preserving the data.
line_number = max(line_number, 0)
outfile.write(f"BRDA:{line_number},{block_number},{branch_number},-\n")

# The start value below allows for the block number to be
# preserved between these two for loops (stopping the loop from
# resetting the value of the block number to 0).
for branch_number, line_number in enumerate(
sorted(executed_arcs[block_line_number]),
start=len(missing_arcs[block_line_number]),
):
line_number = max(line_number, 0)
outfile.write(f"BRDA:{line_number},{block_number},{branch_number},1\n")

# Summary of the branch coverage.
if analysis.numbers.n_statements == 0:
if self.config.skip_empty:
return

outfile.write(f"SF:{rel_fname}\n")

source_lines = None
if self.checksum_mode == "line":
source_lines = fr.source().splitlines()
elif self.checksum_mode == "file":
outfile.write(f"VER:{file_hash(fr.source())}\n")

# Emit a DA: record for each line of the file.
lines = sorted(analysis.statements)
hash_suffix = ""
for line in lines:
if self.checksum_mode == "line":
hash_suffix = "," + line_hash(source_lines[line-1])
# Q: can we get info about the number of times a statement is
# executed? If so, that should be recorded here.
hit = int(line not in analysis.missing)
outfile.write(f"DA:{line},{hit}{hash_suffix}\n")

if analysis.numbers.n_statements > 0:
outfile.write(f"LF:{analysis.numbers.n_statements}\n")
outfile.write(f"LH:{analysis.numbers.n_executed}\n")

# More information dense branch coverage data, if available.
if analysis.has_arcs:
branch_stats = analysis.branch_stats()
executed_arcs = analysis.executed_branch_arcs()
missing_arcs = analysis.missing_branch_arcs()

for line in lines:
if line in branch_stats:
# In our data, exit branches have negative destination line numbers.
# The lcov tools will reject these - but the lcov tools consider the
# destinations of branches to be opaque tokens. Use the absolute
# value of the destination line number as the destination block
# number, and its sign as the destination branch number. This will
# ensure destinations are unique and stable, source line numbers are
# always positive, and destination block and branch numbers are always
# nonnegative, which are the properties we need.

# The data we have does not permit us to identify branches that were
# never *reached*, which is what "-" in the hit column means. Such
# branches aren't in either executed_arcs or missing_arcs - we don't
# even know they exist.

# Q: can we get counts of the number of times each arc was executed?
# branch_stats has "total" and "taken" counts but it doesn't have
# "taken" broken down by destination.
arcs = []
arcs.extend((abs(l), int(l <= 0), 1) for l in executed_arcs[line])
arcs.extend((abs(l), int(l <= 0), 0) for l in missing_arcs[line])
arcs.sort()

for block, branch, hit in arcs:
outfile.write(f"BRDA:{line},{block},{branch},{hit}\n")

# Summary of the branch coverage.
brf = sum(t for t, k in branch_stats.values())
brh = brf - sum(t - k for t, k in branch_stats.values())
outfile.write(f"BRF:{brf}\n")
outfile.write(f"BRH:{brh}\n")
if brf > 0:
outfile.write(f"BRF:{brf}\n")
outfile.write(f"BRH:{brh}\n")

outfile.write("end_of_record\n")
54 changes: 54 additions & 0 deletions doc/config.rst
@@ -890,3 +890,57 @@ Settings particular to LCOV reporting (see :ref:`cmd_lcov`).
.............

(string, default "coverage.lcov") Where to write the LCOV file.

[lcov] checksums
................

(one of "off", "file", or "line"; default "off") What kind of checksums to
write as part of the LCOV file. The default is to not write any checksums.
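
For example, to enable whole-file checksums from a ``.coveragerc``-style
configuration file::

    [lcov]
    output = coverage.lcov
    checksums = file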

"line" gives the historical behavior, in which an MD5 checksum is computed of
each line *containing code subject to analysis*, separately. Because the
checksums do not cover the entire file and do not verify the ordering of
lines, this mode provides only a weak assurance that the source code
available to an analysis tool (e.g. ``genhtml``) matches the code that was
used to generate the coverage data. It also produces larger reports
than either of the other two modes.

"file" computes an SHA-256 hash of each file, as a whole; this gives a
stronger assurance that the source code has not changed. To validate
the hashes emitted by "file" mode, use the following script as the
``--version-script`` when running ``genhtml`` etc.::

#! /usr/bin/env python3
import argparse
import base64
import hashlib
import sys

def main():
ap = argparse.ArgumentParser()
ap.add_argument("--compare", action="store_true")
ap.add_argument("source_file_name")
ap.add_argument("source_file_id", nargs="?", default=None)
ap.add_argument("info_file_id", nargs="?", default=None)
args = ap.parse_args()

if args.compare:
if args.source_file_id is None or args.info_file_id is None:
ap.error("--compare mode requires source_file_id and info_file_id")
sys.exit(0 if args.source_file_id == args.info_file_id else 1)
else:
if args.source_file_id is not None or args.info_file_id is not None:
ap.error("determine mode does not use source_file_id and info_file_id")
with sys.stdout as ofp, \
open(args.source_file_name, "rb") as ifp:
digest = hashlib.sha256(ifp.read()).digest()
file_id = base64.b64encode(digest).decode("ascii").rstrip("=")
ofp.write(file_id + "\n")
sys.exit(0)

main()
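
Assuming the script above is saved as ``sha256-version.py`` (a name chosen
here for illustration) and marked executable, it can be supplied to
``genhtml`` like so::

    genhtml --version-script ./sha256-version.py --output-directory htmlcov coverage.lcov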

Note that for either "file" or "line" mode to work correctly, all of your
source files must be encoded using UTF-8.

.. versionadded:: 7.6.2