Json output #21

Merged: 7 commits, May 22, 2024
28 changes: 17 additions & 11 deletions orion.py
@@ -32,9 +32,16 @@ def cli(max_content_width=120):
@click.option("--uuid", default="", help="UUID to use as base for comparisons")
@click.option("--baseline", default="", help="Baseline UUID(s) to to compare against uuid")
@click.option("--config", default="config.yaml", help="Path to the configuration file")
@click.option("--output", default="output.csv", help="Path to save the output csv file")
@click.option("--output-path", default="output.csv", help="Path to save the output csv file")
@click.option("--debug", is_flag=True, help="log level ")
@click.option("--hunter-analyze",is_flag=True, help="run hunter analyze")
@click.option(
"-o",
"--output",
type=click.Choice(["json", "text"]),
default="text",
help="Choose output format (json or text)",
)
def orion(**kwargs):
"""Orion is the cli tool to detect regressions over the runs

@@ -67,8 +74,8 @@ def orion(**kwargs):
benchmarkIndex=test['benchmarkIndex']
uuid = kwargs["uuid"]
baseline = kwargs["baseline"]
index = "ospst-perf-scale-ci-*"
match = Matcher(index=index,
fingerprint_index = test["index"]
match = Matcher(index=fingerprint_index,
level=level, ES_URL=ES_URL, verify_certs=False)
if uuid == "":
metadata = orion_funcs.get_metadata(test, logger)
@@ -86,23 +93,22 @@ def orion(**kwargs):
else:
uuids = [uuid for uuid in re.split(' |,',baseline) if uuid]
uuids.append(uuid)
buildUrls = orion_funcs.get_build_urls(index, uuids,match)
buildUrls = orion_funcs.get_build_urls(fingerprint_index, uuids,match)

index=benchmarkIndex
fingerprint_index=benchmarkIndex
if metadata["benchmark.keyword"] in ["ingress-perf","k8s-netperf"] :
ids = uuids
else:
if baseline == "":
runs = match.match_kube_burner(uuids, index)
runs = match.match_kube_burner(uuids, fingerprint_index)
ids = match.filter_runs(runs, runs)
else:
ids = uuids

metrics = test["metrics"]
dataframe_list = orion_funcs.get_metric_data(ids, index, metrics, match, logger)
dataframe_list = orion_funcs.get_metric_data(ids, fingerprint_index, metrics, match, logger)

for i, df in enumerate(dataframe_list):
if i != 0:
if i != 0 and ('timestamp' in df.columns):
dataframe_list[i] = df.drop(columns=['timestamp'])

merged_df = reduce(
@@ -113,13 +119,13 @@ def orion(**kwargs):
shortener = pyshorteners.Shortener()
merged_df["buildUrl"] = merged_df["uuid"].apply(
lambda uuid: shortener.tinyurl.short(buildUrls[uuid])) #pylint: disable = cell-var-from-loop
csv_name = kwargs["output"].split(".")[0]+"-"+test['name']+".csv"
csv_name = kwargs["output_path"].split(".")[0]+"-"+test['name']+".csv"
match.save_results(
merged_df, csv_file_path=csv_name
)

if kwargs["hunter_analyze"]:
_ = orion_funcs.run_hunter_analyze(merged_df,test)
orion_funcs.run_hunter_analyze(merged_df,test,kwargs["output"])


if __name__ == "__main__":
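For context on the new flag wiring above, here is a minimal standalone click sketch; the option names mirror the diff, while the command body is purely illustrative and not part of the PR:

```python
import click

@click.command()
@click.option("--output-path", default="output.csv", help="Path to save the output csv file")
@click.option(
    "-o",
    "--output",
    type=click.Choice(["json", "text"]),
    default="text",
    help="Choose output format (json or text)",
)
def orion(**kwargs):
    # click maps --output-path to kwargs["output_path"] and -o/--output to kwargs["output"];
    # in the real command these feed save_results and run_hunter_analyze respectively.
    click.echo(f"format={kwargs['output']} csv={kwargs['output_path']}")

if __name__ == "__main__":
    orion()  # e.g. `python sketch.py -o json --output-path results.csv`
```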
106 changes: 75 additions & 31 deletions utils/orion_funcs.py
@@ -4,6 +4,7 @@
"""
# pylint: disable = import-error

import json
import logging
import sys

@@ -14,7 +15,7 @@
from hunter.series import Metric, Series


def run_hunter_analyze(merged_df,test):
def run_hunter_analyze(merged_df, test, output):
"""Start hunter analyze function

Args:
@@ -23,27 +24,71 @@ def run_hunter_analyze(merged_df,test):
"""
merged_df["timestamp"] = pd.to_datetime(merged_df["timestamp"])
merged_df["timestamp"] = merged_df["timestamp"].astype(int) // 10**9
metrics = {column: Metric(1, 1.0)
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]}
data = {column: merged_df[column]
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]}
metrics = {
column: Metric(1, 1.0)
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]
}
data = {
column: merged_df[column]
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]
}
attributes={column: merged_df[column]
for column in merged_df.columns if column in ["uuid","buildUrl"]}
series=Series(
series = Series(
test_name=test["name"],
branch=None,
time=list(merged_df["timestamp"]),
metrics=metrics,
data=data,
attributes=attributes
attributes=attributes,
)
change_points=series.analyze().change_points_by_time
report=Report(series,change_points)
output = report.produce_report(test_name="test",report_type=ReportType.LOG)
print(output)
return change_points
change_points = series.analyze().change_points_by_time
report = Report(series, change_points)
if output == "text":
output_table = report.produce_report(
test_name="test", report_type=ReportType.LOG
)
print(output_table)
Review comment (Collaborator): can we use a logger here?

elif output == "json":
change_points_by_metric = series.analyze().change_points
Review comment (Collaborator, @vishnuchalla, Mar 25, 2024):
I think we don't need this custom logic at all. We can just use the same Report class to get change points reported in json using inbuilt functionality.

output = report.produce_report(test_name="test",report_type=ReportType.JSON)
print(output)

{"test": [{"time": 1703075837, "changes": [{"metric": "etcdDisk_duration_avg", "forward_change_percent": "43"}]}, {"time": 1704890296, "changes": [{"metric": "ovnCPU_cpu_avg", "forward_change_percent": "26"}, {"metric": "etcdCPU_cpu_avg", "forward_change_percent": "13"}]}, {"time": 1704994226, "changes": [{"metric": "apiserverCPU_cpu_avg", "forward_change_percent": "20"}]}]}

This will give the exact timestamps of all the change points along with the metric and their percentage-change attributes. If we want the value, we can get it from the merged dataframe by doing something like below.

index = df[df['timestamp'] == given_timestamp].index.item()
value = df.loc[index, 'value']

Also, instead of an entire data dump, having only change points in the JSON output would be cleaner. Just my two cents.
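A self-contained version of that lookup idea might look like the sketch below; the column names and timestamp are illustrative, not taken from the PR:

```python
import pandas as pd

# Hypothetical merged dataframe with one row per run.
df = pd.DataFrame({
    "timestamp": [1703075837, 1704890296, 1704994226],
    "etcdDisk_duration_avg": [0.012, 0.017, 0.018],
})

given_timestamp = 1704890296  # a change-point time reported by hunter
index = df[df["timestamp"] == given_timestamp].index.item()
value = df.loc[index, "etcdDisk_duration_avg"]  # metric value at the change point
print(value)  # 0.017
```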

Reply (Contributor, author):
Thank you for the comment @vishnuchalla. The main reason we want JSON is reusability of the data and logic in other tools. If we emit only the change points, the output is constrained to changepoint detection; if we emit the whole data, the same output can be used in CPT dashboards so we can view the changepoints in graphs too. I think this is the major reason we went ahead with implementing custom logic. The custom logic also allows us to add the buildUrl and other fields to the JSON document if needed. Thoughts?

Reply (Collaborator, @vishnuchalla, Mar 25, 2024):
Understood. So can we have both, regulated with a toggle between just the changepoints and the entire list of docs with changepoints? The entire data set can get very large sometimes. Also, can we have a follow-up story to only detect regressions within a given time range, to have something like this?

Reply (Contributor, author):
I'll have this toggle and the time range implemented along with the next PR, as the current PR #26 already has too many tasks going on.

output_json = parse_json_output(merged_df, change_points_by_metric)
print(json.dumps(output_json, indent=4))


def parse_json_output(merged_df, change_points_by_metric):
Review comment (Collaborator): This can be removed based on the logic suggested in the above comment.

"""json output generator function

Args:
merged_df (pd.DataFrame): the dataframe to be converted to json
change_points_by_metric (dict): change points grouped by metric name

Returns:
list: list of per-run records annotated with changepoint data
"""
df_json = merged_df.to_json(orient="records")
df_json = json.loads(df_json)

for index, entry in enumerate(df_json):
entry["metrics"] = {
key: {"value": entry.pop(key), "percentage_change": 0}
for key in entry.keys() - {"uuid", "timestamp", "buildUrl"}
}
entry["is_changepoint"] = False

for key in change_points_by_metric.keys():
for change_point in change_points_by_metric[key]:
index = change_point.index
percentage_change = (
(change_point.stats.mean_2 - change_point.stats.mean_1)
/ change_point.stats.mean_1
) * 100
df_json[index]["metrics"][key]["percentage_change"] = percentage_change
df_json[index]["is_changepoint"] = True

return df_json
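To make the shape of parse_json_output concrete: each record keeps uuid, timestamp and buildUrl, nests every metric under "metrics" with a value and a percentage_change, and flags changepoint rows with "is_changepoint". The percentage change is the relative shift between the two segment means, e.g. mean_1 = 100 and mean_2 = 126 gives (126 - 100) / 100 * 100 = 26. A hedged sketch of a downstream consumer, with made-up values:

```python
import json

# Example record shaped like parse_json_output's result (values are illustrative).
sample = json.loads("""
[
  {
    "uuid": "abc-123",
    "timestamp": 1704890296,
    "buildUrl": "https://tinyurl.com/example",
    "metrics": {
      "ovnCPU_cpu_avg": {"value": 1.9, "percentage_change": 26.0}
    },
    "is_changepoint": true
  }
]
""")

# A dashboard or other tool can keep the full data but filter on changepoints.
for run in (r for r in sample if r["is_changepoint"]):
    for metric, info in run["metrics"].items():
        if info["percentage_change"] != 0:
            print(run["uuid"], metric, info["percentage_change"])
```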


# pylint: disable=too-many-locals
def get_metric_data(ids, index, metrics, match, logger):
@@ -61,22 +106,19 @@ def get_metric_data(ids, index, metrics, match, logger):
"""
dataframe_list = []
for metric in metrics:
metric_name = metric['name']
metric_name = metric["name"]
logger.info("Collecting %s", metric_name)
metric_of_interest = metric['metric_of_interest']
metric_of_interest = metric["metric_of_interest"]

if "agg" in metric.keys():
try:
cpu = match.get_agg_metric_query(
ids, index, metric
)
agg_value = metric['agg']['value']
agg_type = metric['agg']['agg_type']
cpu = match.get_agg_metric_query(ids, index, metric)
agg_value = metric["agg"]["value"]
agg_type = metric["agg"]["agg_type"]
agg_name = agg_value + "_" + agg_type
cpu_df = match.convert_to_df(cpu, columns=["uuid","timestamp", agg_name])
cpu_df = cpu_df.rename(
columns={agg_name: metric_name+ "_" + agg_name}
)
cpu_df = match.convert_to_df(cpu, columns=["uuid", "timestamp", agg_name])
cpu_df= cpu_df.drop_duplicates(subset=['uuid'],keep='first')
cpu_df = cpu_df.rename(columns={agg_name: metric_name + "_" + agg_type})
dataframe_list.append(cpu_df)
logger.debug(cpu_df)

@@ -92,6 +134,9 @@
podl_df = match.convert_to_df(
podl, columns=["uuid", "timestamp", metric_of_interest]
)
podl_df= podl_df.drop_duplicates(subset=['uuid'],keep='first')
podl_df = podl_df.rename(columns={metric_of_interest:
metric_name + "_" + metric_of_interest})
dataframe_list.append(podl_df)
logger.debug(podl_df)
except Exception as e: # pylint: disable=broad-exception-caught
@@ -103,7 +148,7 @@ def get_metadata(test,logger):
return dataframe_list
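The new drop_duplicates and rename calls in get_metric_data follow a standard pandas pattern: keep one row per uuid, then prefix the metric column with the metric name so merged columns stay unique. A small sketch with assumed column names:

```python
import pandas as pd

# Two rows for the same uuid: keep only the first, then prefix the metric column.
podl_df = pd.DataFrame({
    "uuid": ["run-1", "run-1", "run-2"],
    "timestamp": [1, 2, 3],
    "P99": [10.0, 11.0, 12.0],  # "P99" stands in for metric_of_interest
})
podl_df = podl_df.drop_duplicates(subset=["uuid"], keep="first")
podl_df = podl_df.rename(columns={"P99": "podLatency_P99"})  # metric_name + "_" + metric_of_interest
print(podl_df)
```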


def get_metadata(test,logger):
def get_metadata(test, logger):
"""Gets metadata of the run from each test

Args:
@@ -112,12 +157,11 @@
Returns:
dict: dictionary of the metadata
"""
metadata=test['metadata']
metadata = test["metadata"]
metadata["ocpVersion"] = str(metadata["ocpVersion"])
logger.debug('metadata' + str(metadata))
logger.debug("metadata" + str(metadata))
return metadata


def get_build_urls(index, uuids,match):
"""Gets metadata of the run from each test
to get the build url
@@ -135,7 +179,6 @@
buildUrls = {run["uuid"]: run["buildUrl"] for run in test}
return buildUrls


def filter_metadata(uuid,match,logger):
"""Gets metadata of the run from each test

@@ -202,7 +245,8 @@ def set_logging(level, logger):
logger.addHandler(handler)
return logger

def load_config(config,logger):

def load_config(config, logger):
"""Loads config file

Args: