[Issue #1018] sprint burnup metric (#1371)
- Adds `SprintBurnup` metric sub-class
- Abstracts calculation logic shared by burnup and burndown into `utils.py`
- Adds CLI entrypoint for sprint burnup
AlexanderStephensonUSDS authored Mar 20, 2024
1 parent 74c9480 commit 8af7708
Showing 9 changed files with 1,717 additions and 220 deletions.
732 changes: 514 additions & 218 deletions analytics/reporting.ipynb

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions analytics/src/analytics/cli.py
@@ -10,6 +10,7 @@
from analytics.integrations import github, slack
from analytics.metrics.base import BaseMetric, Unit
from analytics.metrics.burndown import SprintBurndown
from analytics.metrics.burnup import SprintBurnup
from analytics.metrics.percent_complete import DeliverablePercentComplete

# fmt: off
@@ -89,6 +90,30 @@ def calculate_sprint_burndown(
post_results=post_results,
)

@metrics_app.command(name="sprint_burnup")
def calculate_sprint_burnup(
sprint_file: Annotated[str, SPRINT_FILE_ARG],
issue_file: Annotated[str, ISSUE_FILE_ARG],
sprint: Annotated[str, SPRINT_ARG],
unit: Annotated[Unit, UNIT_ARG] = Unit.points.value, # type: ignore[assignment]
*, # makes the following args keyword only
show_results: Annotated[bool, SHOW_RESULTS_ARG] = False,
post_results: Annotated[bool, POST_RESULTS_ARG] = False,
) -> None:
"""Calculate the burnup of a particular sprint"""
# load the input data
sprint_data = SprintBoard.load_from_json_files(
sprint_file=sprint_file,
issue_file=issue_file,
)
# calculate burnup
burnup = SprintBurnup(sprint_data, sprint=sprint, unit=unit)
show_and_or_post_results(
metric=burnup,
show_results=show_results,
post_results=post_results,
)
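
For reference, the new command can be exercised in-process with Typer's test runner. A minimal sketch: it assumes the `*_ARG` constants are `typer.Option` definitions (so the flags mirror the parameter names), and the JSON file paths are placeholders.

```python
# Minimal sketch of invoking the new sub-command in-process.
# Flag names are assumptions derived from the parameter names above;
# the file paths are placeholders, not files from this repo.
from typer.testing import CliRunner

from analytics.cli import metrics_app

runner = CliRunner()
result = runner.invoke(
    metrics_app,
    [
        "sprint_burnup",
        "--sprint-file", "data/sprint-data.json",
        "--issue-file", "data/issue-data.json",
        "--sprint", "@current",
        "--unit", "points",
        "--show-results",
    ],
)
print(result.output)
```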


@metrics_app.command(name="deliverable_percent_complete")
def calculate_deliverable_percent_complete(
3 changes: 3 additions & 0 deletions analytics/src/analytics/datasets/sprint_board.py
@@ -4,6 +4,8 @@
This is a sub-class of BaseDataset that stores the tickets and metadata
set for each ticket in the Sprint Planning Board
"""
from __future__ import annotations

from typing import Self

import pandas as pd
@@ -109,6 +111,7 @@ def load_from_json_files(
-------
Self:
An instance of the SprintBoard dataset class
"""
# load and merge input datasets
df_sprints = load_json_data_as_df(
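For context, this loader is the shared entry point for both sprint metrics. A hypothetical interactive use, where the file paths are placeholders for GitHub project-board exports:

```python
# Hypothetical use of the loader shown above; file paths are placeholders.
from analytics.datasets.sprint_board import SprintBoard

board = SprintBoard.load_from_json_files(
    sprint_file="data/sprint-data.json",
    issue_file="data/issue-data.json",
)
print(board.df.head())  # the merged DataFrame consumed by the metric classes
```
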
7 changes: 7 additions & 0 deletions analytics/src/analytics/metrics/burndown.py
@@ -4,6 +4,8 @@
This is a subclass of the BaseMetric class that calculates the running total of
open issues for each day in a sprint
"""
from __future__ import annotations

from typing import Literal

import pandas as pd
@@ -47,6 +49,7 @@ def calculate(self) -> pd.DataFrame:
3. Count the number of issues opened and closed on each day of that range
4. Calculate the delta between opened and closed issues per day
5. Cumulatively sum those deltas to get the running total of open tix
"""
# make a copy of columns and rows we need to calculate burndown for this sprint
burndown_cols = [self.opened_col, self.closed_col, self.points_col]
@@ -91,6 +94,7 @@ def get_stats(self) -> dict[str, Statistic]:
Notes
-----
TODO(@widal001): 2023-12-04 - Should stats be calculated in separate private methods?
"""
df = self.results
# get sprint start and end dates
@@ -160,6 +164,7 @@ def _get_daily_tix_counts_by_status(
It does this by:
- Grouping on the created_date or opened_date column, depending on status
- Counting the total number of rows per group
"""
# create local copies of the key column names
agg_col = self.opened_col if status == "opened" else self.closed_col
@@ -185,6 +190,7 @@ def _get_tix_date_range(self, df: pd.DataFrame) -> pd.DataFrame:
- Creating a row for each day between the earliest date a ticket was opened
and either the sprint end _or_ the latest date an issue was closed,
whichever is the later date.
"""
# get earliest date an issue was opened and latest date one was closed
sprint_end = self.dataset.sprint_end(self.sprint)
@@ -214,6 +220,7 @@ def _get_cum_sum_of_open_tix(
opened and a column for tix closed on that day
- Subtracting closed from opened to get the "delta" on each day in the range
- Cumulatively summing the deltas to get the running total of open tix
"""
# left join the full date range to open and closed counts
df = (
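To make the algorithm in the docstrings above concrete, here is a toy, self-contained version of the date-range join and running-total step, using synthetic counts rather than real sprint data:

```python
# Toy illustration of the running-total algorithm described above.
import pandas as pd

dates = pd.DataFrame({"date": pd.date_range("2024-03-01", "2024-03-05")})
opened = pd.DataFrame({"date": [pd.Timestamp("2024-03-01")], "opened": [5]})
closed = pd.DataFrame({"date": [pd.Timestamp("2024-03-03")], "closed": [2]})

# left join the full date range to the daily opened and closed counts
df = (
    dates.merge(opened, on="date", how="left")
    .merge(closed, on="date", how="left")
    .fillna(0)
)
# subtract closed from opened to get the daily delta, then cumulatively sum
df["delta"] = df["opened"] - df["closed"]
df["total_open"] = df["delta"].cumsum()
print(df[["date", "opened", "closed", "total_open"]])
```
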
220 changes: 220 additions & 0 deletions analytics/src/analytics/metrics/burnup.py
@@ -0,0 +1,220 @@
"""
Calculates burnup for sprints.
This is a subclass of the BaseMetric class that calculates the running totals of
open and closed issues (or points) for each day in a sprint
"""
from __future__ import annotations

import pandas as pd
import plotly.express as px
from plotly.graph_objects import Figure

from analytics.datasets.sprint_board import SprintBoard
from analytics.metrics.base import BaseMetric, Statistic, Unit
from analytics.metrics.utils import (
get_cum_sum_of_tix,
get_daily_tix_counts_by_status,
get_tix_date_range,
)


class SprintBurnup(BaseMetric[SprintBoard]):
"""Calculates the running total of open issues per day in the sprint."""

def __init__(
self,
dataset: SprintBoard,
sprint: str,
unit: Unit,
) -> None:
"""Initialize the SprintBurnup metric."""
self.dataset = dataset
self.sprint = self._get_and_validate_sprint_name(sprint)
self.sprint_data = self._isolate_data_for_this_sprint()
self.date_col = "date"
self.points_col = "points"
self.opened_col = dataset.opened_col # type: ignore[attr-defined]
self.closed_col = dataset.closed_col # type: ignore[attr-defined]
self.unit = unit
super().__init__(dataset)

def calculate(self) -> pd.DataFrame:
"""
Calculate the sprint burnup.
Notes
-----
Sprint burnup is calculated with the following algorithm:
1. Isolate Sprint records
2. Create date range for burnup
3. Group issues/points by date opened and date closed
4. Join on date
"""
# make a copy of columns and rows we need to calculate burnup for this sprint
burnup_cols = [self.opened_col, self.closed_col, self.points_col]
df_sprint = self.sprint_data[burnup_cols].copy()
# get the date range over which tix were created and closed
df_tix_range = get_tix_date_range(
df_sprint,
self.opened_col,
self.closed_col,
self.dataset.sprint_end(self.sprint),
)
# get the number of tix opened and closed each day
df_opened = get_daily_tix_counts_by_status(df_sprint, "opened", self.unit)
df_closed = get_daily_tix_counts_by_status(df_sprint, "closed", self.unit)
# combine the daily opened and closed counts to get total open and closed per day
return get_cum_sum_of_tix(df_tix_range, df_opened, df_closed)

def plot_results(self) -> Figure:
"""Plot the sprint burnup using a plotly area chart."""
# Limit the data in the area chart to dates within the sprint
# or through today, if the sprint hasn't yet ended
# NOTE: This will *not* affect the running totals on those days
sprint_start = self.dataset.sprint_start(self.sprint)
sprint_end = self.dataset.sprint_end(self.sprint)
date_mask = self.results[self.date_col].between(
sprint_start,
min(sprint_end, pd.Timestamp.today(tz="utc")),
)
df = self.results[date_mask].melt(
id_vars=self.date_col,
value_vars=["total_closed", "total_open"],
var_name="cols",
)

# create an area chart from the data in self.results
chart = px.area(
data_frame=df,
x=self.date_col,
y="value",
color="cols",
color_discrete_sequence=["#EFE0FC", "#2DA34D"],
markers=True,
title=f"{self.sprint} Burnup by {self.unit.value}",
template="none",
)
# set the scale of the y axis to start at 0
chart.update_yaxes(range=[0, df["value"].max() + 10])
chart.update_xaxes(range=[sprint_start, sprint_end])
chart.update_layout(
xaxis_title="Date",
yaxis_title=f"Total {self.unit.value.capitalize()}",
legend_title=f"{self.unit.value.capitalize()}",
)
return chart
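
Outside the CLI, the chart can be produced and saved like any plotly figure. A minimal sketch; the file paths and sprint name are placeholders:

```python
# Hypothetical interactive use; paths and the sprint name are placeholders.
from analytics.datasets.sprint_board import SprintBoard
from analytics.metrics.base import Unit
from analytics.metrics.burnup import SprintBurnup

board = SprintBoard.load_from_json_files(
    sprint_file="data/sprint-data.json",
    issue_file="data/issue-data.json",
)
burnup = SprintBurnup(board, sprint="Sprint 10", unit=Unit.points)
fig = burnup.plot_results()
fig.write_html("sprint_burnup.html")  # or fig.show() in a notebook
```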

def get_stats(self) -> dict[str, Statistic]:
"""Calculate summary statistics for this metric."""
df = self.results
# get sprint start and end dates
sprint_start = self.dataset.sprint_start(self.sprint).strftime("%Y-%m-%d")
sprint_end = self.dataset.sprint_end(self.sprint).strftime("%Y-%m-%d")
# get open and closed counts and percentages
total_opened = int(df["opened"].sum())
total_closed = int(df["closed"].sum())
pct_closed = round(total_closed / total_opened * 100, 2)
# For burnup, we want to know at a glance the pct_remaining
pct_remaining = round(100 - pct_closed, 2)
# get the percentage of issues that were pointed
is_pointed = self.sprint_data[Unit.points.value] >= 1
issues_pointed = len(self.sprint_data[is_pointed])
issues_total = len(self.sprint_data)
pct_pointed = round(issues_pointed / issues_total * 100, 2)
# format and return stats
return {
"Sprint start date": Statistic(value=sprint_start),
"Sprint end date": Statistic(value=sprint_end),
"Total opened": Statistic(total_opened, suffix=f" {self.unit.value}"),
"Total closed": Statistic(total_closed, suffix=f" {self.unit.value}"),
"Percent closed": Statistic(value=pct_closed, suffix="%"),
"Percent remaining": Statistic(value=pct_remaining, suffix="%"),
"Percent pointed": Statistic(
value=pct_pointed,
suffix=f"% of {Unit.issues.value}",
),
}

def format_slack_message(self) -> str:
"""Format the message that will be included with the charts posted to slack."""
message = f"*:github: Burnup summary for {self.sprint} by {self.unit.value}*\n"
for label, stat in self.stats.items():
message += f"• *{label}:* {stat.value}{stat.suffix}\n"
return message

def _get_and_validate_sprint_name(self, sprint: str | None) -> str:
"""Get the name of the sprint we're using to calculate burndown or raise an error."""
# save dataset to local variable for brevity
dataset = self.dataset
# update sprint name if calculating burnup for the current sprint
if sprint == "@current":
sprint = dataset.current_sprint
# check that the sprint name matches one of the sprints in the dataset
valid_sprint = sprint in list(dataset.sprints[dataset.sprint_col])
if not sprint or not valid_sprint: # needs `not sprint` for mypy checking
msg = "Sprint value doesn't match one of the available sprints"
raise ValueError(msg)
# return the sprint name if it's valid
return sprint

def _isolate_data_for_this_sprint(self) -> pd.DataFrame:
"""Filter out issues that are not assigned to the current sprint."""
sprint_filter = self.dataset.df[self.dataset.sprint_col] == self.sprint
return self.dataset.df[sprint_filter]
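
The `utils.py` module that now hosts the shared calculation helpers is one of the nine changed files, but its diff is not rendered on this page. Based on the burndown methods it replaces and the call sites in `calculate` above, a plausible sketch follows; the column-naming convention is an assumption:

```python
# Plausible sketch of the shared helpers in analytics/metrics/utils.py,
# adapted from the burndown methods they replace. Signatures are inferred
# from the call sites in SprintBurnup.calculate; column names are assumptions.
from typing import Literal

import pandas as pd
from numpy import nan

from analytics.metrics.base import Unit


def get_daily_tix_counts_by_status(
    df: pd.DataFrame,
    status: Literal["opened", "closed"],
    unit: Unit,
) -> pd.DataFrame:
    """Count the number of issues or points opened or closed by date."""
    agg_col = f"{status}_date"  # assumed naming for the date columns
    unit_col = unit.value
    # create a dummy column to sum per row if the unit is issues
    if unit == Unit.issues:
        df[unit_col] = 1
    # group by open or closed date, then sum the units per day
    df_agg = df[[agg_col, unit_col]].groupby(agg_col, as_index=False).agg({unit_col: "sum"})
    return df_agg.rename(columns={agg_col: "date", unit_col: status})


def get_tix_date_range(
    df: pd.DataFrame,
    opened_col: str,
    closed_col: str,
    sprint_end: pd.Timestamp,
) -> pd.DataFrame:
    """Create one row per day, from the earliest date a ticket was opened to
    the later of the sprint end or the latest date a ticket was closed."""
    opened_min = df[opened_col].min()
    closed_max = df[closed_col].max()
    closed_max = sprint_end if closed_max is nan else max(sprint_end, closed_max)
    return pd.DataFrame(pd.date_range(opened_min, closed_max), columns=["date"])


def get_cum_sum_of_tix(
    dates: pd.DataFrame,
    opened: pd.DataFrame,
    closed: pd.DataFrame,
) -> pd.DataFrame:
    """Join the daily counts onto the date range and compute the running
    totals consumed by plot_results() and get_stats()."""
    df = (
        dates.merge(opened, on="date", how="left")
        .merge(closed, on="date", how="left")
        .fillna(0)
    )
    df["total_open"] = (df["opened"] - df["closed"]).cumsum()
    df["total_closed"] = df["closed"].cumsum()
    return df
```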

