Json output #21
Changes from all commits
@@ -4,6 +4,7 @@
 """
 # pylint: disable = import-error

+import json
 import logging
 import sys
@@ -14,7 +15,7 @@
 from hunter.series import Metric, Series


-def run_hunter_analyze(merged_df,test):
+def run_hunter_analyze(merged_df, test, output):
     """Start hunter analyze function

     Args:
@@ -23,27 +24,71 @@ def run_hunter_analyze(merged_df,test):
     """
     merged_df["timestamp"] = pd.to_datetime(merged_df["timestamp"])
     merged_df["timestamp"] = merged_df["timestamp"].astype(int) // 10**9
-    metrics = {column: Metric(1, 1.0)
-               for column in merged_df.columns
-               if column not in ["uuid","timestamp","buildUrl"]}
-    data = {column: merged_df[column]
-            for column in merged_df.columns
-            if column not in ["uuid","timestamp","buildUrl"]}
+    metrics = {
+        column: Metric(1, 1.0)
+        for column in merged_df.columns
+        if column not in ["uuid","timestamp","buildUrl"]
+    }
+    data = {
+        column: merged_df[column]
+        for column in merged_df.columns
+        if column not in ["uuid","timestamp","buildUrl"]
+    }
     attributes={column: merged_df[column]
                 for column in merged_df.columns if column in ["uuid","buildUrl"]}
-    series=Series(
+    series = Series(
         test_name=test["name"],
         branch=None,
         time=list(merged_df["timestamp"]),
         metrics=metrics,
         data=data,
-        attributes=attributes
+        attributes=attributes,
     )
-    change_points=series.analyze().change_points_by_time
-    report=Report(series,change_points)
-    output = report.produce_report(test_name="test",report_type=ReportType.LOG)
-    print(output)
-    return change_points
+    change_points = series.analyze().change_points_by_time
+    report = Report(series, change_points)
+    if output == "text":
+        output_table = report.produce_report(
+            test_name="test", report_type=ReportType.LOG
+        )
+        print(output_table)
+    elif output == "json":
+        change_points_by_metric = series.analyze().change_points
+        output_json = parse_json_output(merged_df, change_points_by_metric)
+        print(json.dumps(output_json, indent=4))

Review thread on the change_points_by_metric line:

Reviewer (@vishnuchalla): I think we don't need this custom logic at all. We can just use the same Report class to get the change points reported in JSON using inbuilt functionality. This gives the exact timestamps of all the change points along with each metric and its percentage-change attributes. If we want the raw value, we can get it from the merged dataframe by doing something like below. Also, instead of an entire data dump, having only the change points in the JSON output would be cleaner. Just my .02 cents.

Author: Thank you for the comment @vishnuchalla. The main reason we want JSON output is reusability of the data and logic in other tools. If the output holds only change points, it is constrained to changepoint detection; if it holds the whole data, the same output can feed CPT dashboards so the changepoints can be viewed in graphs too. That is the major reason we went ahead with custom logic. It also lets us add buildUrl and anything else we need to the JSON document. Thoughts?

Reviewer (@vishnuchalla): Understood. So can we have both, regulated with a toggle between just the changepoints and the entire list of docs with changepoints? Getting the entire data might be very large sometimes. Also, can we have a follow-up story to detect regressions only within a given time range?

Author: I'll have this toggle and time range implemented along with the next PR, as the current PR #26 already has too many tasks going on.
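The snippet the first comment points at with "like below" did not survive extraction. Assuming hunter's change point objects expose the same attributes the PR itself uses (change_point.index, per-metric grouping), the lookup was presumably along these lines (a sketch, not the reviewer's actual snippet):

    # Sketch (assumed API shape): read the raw metric value at each detected
    # change point back out of the merged dataframe by row position.
    for metric, points in series.analyze().change_points.items():
        for cp in points:
            raw_value = merged_df.iloc[cp.index][metric]
            print(metric, cp.index, raw_value)

For context, callers select the format through the parameter added in this PR, e.g. run_hunter_analyze(merged_df, test, output="json").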
+def parse_json_output(merged_df, change_points_by_metric):

Review comment on the line above: This can be removed based on the logic suggested in the above comment.
"""json output generator function | ||
|
||
Args: | ||
merged_df (pd.Dataframe): the dataframe to be converted to json | ||
change_points_by_metric (_type_): different change point | ||
|
||
Returns: | ||
_type_: _description_ | ||
""" | ||
df_json = merged_df.to_json(orient="records") | ||
df_json = json.loads(df_json) | ||
|
||
for index, entry in enumerate(df_json): | ||
entry["metrics"] = { | ||
key: {"value": entry.pop(key), "percentage_change": 0} | ||
for key in entry.keys() - {"uuid", "timestamp", "buildUrl"} | ||
} | ||
entry["is_changepoint"] = False | ||
|
||
for key in change_points_by_metric.keys(): | ||
for change_point in change_points_by_metric[key]: | ||
index = change_point.index | ||
percentage_change = ( | ||
(change_point.stats.mean_2 - change_point.stats.mean_1) | ||
/ change_point.stats.mean_1 | ||
) * 100 | ||
df_json[index]["metrics"][key]["percentage_change"] = percentage_change | ||
df_json[index]["is_changepoint"] = True | ||
|
||
return df_json | ||
|
||
|
||
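For reference, each record emitted by parse_json_output would look roughly like the following; the uuid, buildUrl, and metric name are made-up placeholders, not output from a real run:

    {
        "uuid": "some-uuid",
        "timestamp": 1700000000,
        "buildUrl": "https://jenkins.example/job/123",
        "metrics": {
            "podReadyLatency_avg": {
                "value": 1234.5,
                "percentage_change": 25.0
            }
        },
        "is_changepoint": true
    }

percentage_change is the relative shift between the window means hunter computes on either side of the change point: with mean_1 = 100 and mean_2 = 125, (125 - 100) / 100 * 100 = 25.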

 # pylint: disable=too-many-locals
 def get_metric_data(ids, index, metrics, match, logger):
@@ -61,22 +106,19 @@ def get_metric_data(ids, index, metrics, match, logger):
     """
     dataframe_list = []
     for metric in metrics:
-        metric_name = metric['name']
+        metric_name = metric["name"]
         logger.info("Collecting %s", metric_name)
-        metric_of_interest = metric['metric_of_interest']
+        metric_of_interest = metric["metric_of_interest"]

         if "agg" in metric.keys():
             try:
-                cpu = match.get_agg_metric_query(
-                    ids, index, metric
-                )
-                agg_value = metric['agg']['value']
-                agg_type = metric['agg']['agg_type']
+                cpu = match.get_agg_metric_query(ids, index, metric)
+                agg_value = metric["agg"]["value"]
+                agg_type = metric["agg"]["agg_type"]
                 agg_name = agg_value + "_" + agg_type
-                cpu_df = match.convert_to_df(cpu, columns=["uuid","timestamp", agg_name])
-                cpu_df = cpu_df.rename(
-                    columns={agg_name: metric_name+ "_" + agg_name}
-                )
+                cpu_df = match.convert_to_df(cpu, columns=["uuid", "timestamp", agg_name])
+                cpu_df= cpu_df.drop_duplicates(subset=['uuid'],keep='first')
+                cpu_df = cpu_df.rename(columns={agg_name: metric_name + "_" + agg_type})
                 dataframe_list.append(cpu_df)
                 logger.debug(cpu_df)
@@ -92,6 +134,9 @@ def get_metric_data(ids, index, metrics, match, logger):
                 podl_df = match.convert_to_df(
                     podl, columns=["uuid", "timestamp", metric_of_interest]
                 )
+                podl_df= podl_df.drop_duplicates(subset=['uuid'],keep='first')
+                podl_df = podl_df.rename(columns={metric_of_interest:
+                                                  metric_name + "_" + metric_of_interest})
                 dataframe_list.append(podl_df)
                 logger.debug(podl_df)
             except Exception as e:  # pylint: disable=broad-exception-caught
@@ -103,7 +148,7 @@ def get_metric_data(ids, index, metrics, match, logger):
     return dataframe_list


-def get_metadata(test,logger):
+def get_metadata(test, logger):
     """Gets metadata of the run from each test

     Args:
@@ -112,12 +157,11 @@ def get_metadata(test, logger):
     Returns:
         dict: dictionary of the metadata
     """
-    metadata=test['metadata']
+    metadata = test["metadata"]
     metadata["ocpVersion"] = str(metadata["ocpVersion"])
-    logger.debug('metadata' + str(metadata))
+    logger.debug("metadata" + str(metadata))
     return metadata


 def get_build_urls(index, uuids,match):
     """Gets metadata of the run from each test
     to get the build url
@@ -135,7 +179,6 @@ def get_build_urls(index, uuids,match):
     buildUrls = {run["uuid"]: run["buildUrl"] for run in test}
     return buildUrls


 def filter_metadata(uuid,match,logger):
     """Gets metadata of the run from each test
@@ -202,7 +245,8 @@ def set_logging(level, logger):
     logger.addHandler(handler)
     return logger

-def load_config(config,logger):
+
+def load_config(config, logger):
     """Loads config file

     Args:
Review comment on load_config: Can we use a logger here?
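The body of load_config is not shown in this diff, so the following is illustration only. If the function currently reports status with print(), the reviewer's suggestion would look roughly like this; the yaml parsing and error handling are assumptions, not the PR's actual code:

    import yaml

    def load_config(config, logger):
        # Hypothetical body: report through the passed-in logger instead of print()
        try:
            with open(config, "r", encoding="utf-8") as file:
                data = yaml.safe_load(file)
            logger.debug("config: %s", data)
            return data
        except yaml.YAMLError as exc:
            logger.error("Unable to parse config %s: %s", config, exc)
            raise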