Json output #21

Merged: 7 commits, May 22, 2024
28 changes: 17 additions & 11 deletions orion.py
@@ -32,9 +32,16 @@ def cli(max_content_width=120):
@click.option("--uuid", default="", help="UUID to use as base for comparisons")
@click.option("--baseline", default="", help="Baseline UUID(s) to to compare against uuid")
@click.option("--config", default="config.yaml", help="Path to the configuration file")
@click.option("--output", default="output.csv", help="Path to save the output csv file")
@click.option("--output-path", default="output.csv", help="Path to save the output csv file")
@click.option("--debug", is_flag=True, help="log level ")
@click.option("--hunter-analyze",is_flag=True, help="run hunter analyze")
@click.option(
"-o",
"--output",
type=click.Choice(["json", "text"]),
default="text",
help="Choose output format (json or text)",
)
def orion(**kwargs):
"""Orion is the cli tool to detect regressions over the runs

@@ -67,8 +74,8 @@ def orion(**kwargs):
benchmarkIndex=test['benchmarkIndex']
uuid = kwargs["uuid"]
baseline = kwargs["baseline"]
index = "ospst-perf-scale-ci-*"
match = Matcher(index=index,
fingerprint_index = test["index"]
match = Matcher(index=fingerprint_index,
level=level, ES_URL=ES_URL, verify_certs=False)
if uuid == "":
metadata = orion_funcs.get_metadata(test, logger)
@@ -86,23 +93,22 @@ def orion(**kwargs):
else:
uuids = [uuid for uuid in re.split(' |,',baseline) if uuid]
uuids.append(uuid)
buildUrls = orion_funcs.get_build_urls(index, uuids,match)
buildUrls = orion_funcs.get_build_urls(fingerprint_index, uuids,match)

index=benchmarkIndex
fingerprint_index=benchmarkIndex
if metadata["benchmark.keyword"] in ["ingress-perf","k8s-netperf"] :
ids = uuids
else:
if baseline == "":
runs = match.match_kube_burner(uuids, index)
runs = match.match_kube_burner(uuids, fingerprint_index)
ids = match.filter_runs(runs, runs)
else:
ids = uuids

metrics = test["metrics"]
dataframe_list = orion_funcs.get_metric_data(ids, index, metrics, match, logger)
dataframe_list = orion_funcs.get_metric_data(ids, fingerprint_index, metrics, match, logger)

for i, df in enumerate(dataframe_list):
if i != 0:
if i != 0 and ('timestamp' in df.columns):
dataframe_list[i] = df.drop(columns=['timestamp'])

merged_df = reduce(
@@ -113,13 +119,13 @@ def orion(**kwargs):
shortener = pyshorteners.Shortener()
merged_df["buildUrl"] = merged_df["uuid"].apply(
lambda uuid: shortener.tinyurl.short(buildUrls[uuid])) #pylint: disable = cell-var-from-loop
csv_name = kwargs["output"].split(".")[0]+"-"+test['name']+".csv"
csv_name = kwargs["output_path"].split(".")[0]+"-"+test['name']+".csv"
match.save_results(
merged_df, csv_file_path=csv_name
)

if kwargs["hunter_analyze"]:
_ = orion_funcs.run_hunter_analyze(merged_df,test)
orion_funcs.run_hunter_analyze(merged_df,test,kwargs["output"])


if __name__ == "__main__":
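For context on the new flag wiring above, here is a minimal standalone click sketch; the option names mirror the diff, while the command body is purely illustrative and not part of the PR:

```python
import click

@click.command()
@click.option("--output-path", default="output.csv", help="Path to save the output csv file")
@click.option(
    "-o",
    "--output",
    type=click.Choice(["json", "text"]),
    default="text",
    help="Choose output format (json or text)",
)
def orion(**kwargs):
    # click maps --output-path to kwargs["output_path"] and -o/--output to kwargs["output"];
    # in the real command these feed save_results and run_hunter_analyze respectively.
    click.echo(f"format={kwargs['output']} csv={kwargs['output_path']}")

if __name__ == "__main__":
    orion()  # e.g. `python sketch.py -o json --output-path results.csv`
```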
106 changes: 75 additions & 31 deletions utils/orion_funcs.py
@@ -4,6 +4,7 @@
"""
# pylint: disable = import-error

import json
import logging
import sys

@@ -14,7 +15,7 @@
from hunter.series import Metric, Series


def run_hunter_analyze(merged_df,test):
def run_hunter_analyze(merged_df, test, output):
"""Start hunter analyze function

Args:
@@ -23,27 +24,71 @@ def run_hunter_analyze(merged_df,test):
"""
merged_df["timestamp"] = pd.to_datetime(merged_df["timestamp"])
merged_df["timestamp"] = merged_df["timestamp"].astype(int) // 10**9
metrics = {column: Metric(1, 1.0)
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]}
data = {column: merged_df[column]
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]}
metrics = {
column: Metric(1, 1.0)
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]
}
data = {
column: merged_df[column]
for column in merged_df.columns
if column not in ["uuid","timestamp","buildUrl"]
}
attributes={column: merged_df[column]
for column in merged_df.columns if column in ["uuid","buildUrl"]}
series=Series(
series = Series(
test_name=test["name"],
branch=None,
time=list(merged_df["timestamp"]),
metrics=metrics,
data=data,
attributes=attributes
attributes=attributes,
)
change_points=series.analyze().change_points_by_time
report=Report(series,change_points)
output = report.produce_report(test_name="test",report_type=ReportType.LOG)
print(output)
return change_points
change_points = series.analyze().change_points_by_time
report = Report(series, change_points)
if output == "text":
output_table = report.produce_report(
test_name="test", report_type=ReportType.LOG
)
print(output_table)
Review comment (Collaborator): can we use a logger here?

elif output == "json":
change_points_by_metric = series.analyze().change_points
Review comment (Collaborator, @vishnuchalla, Mar 25, 2024):
I think we don't need this custom logic at all. We can just use the same Report class to get change points reported in json using inbuilt functionality.

output = report.produce_report(test_name="test",report_type=ReportType.JSON)
print(output)

{"test": [{"time": 1703075837, "changes": [{"metric": "etcdDisk_duration_avg", "forward_change_percent": "43"}]}, {"time": 1704890296, "changes": [{"metric": "ovnCPU_cpu_avg", "forward_change_percent": "26"}, {"metric": "etcdCPU_cpu_avg", "forward_change_percent": "13"}]}, {"time": 1704994226, "changes": [{"metric": "apiserverCPU_cpu_avg", "forward_change_percent": "20"}]}]}

This will give the exact timestamps of all the change points along with the metric and their percentage-change attributes. If we want the value, we can get it from the merged dataframe by doing something like below.

index = df[df['timestamp'] == given_timestamp].index.item()
value = df.loc[index, 'value']

Also, instead of an entire data dump, having only change points in the JSON output would be cleaner. Just my two cents.
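A self-contained version of that lookup idea might look like the sketch below; the column names and timestamp are illustrative, not taken from the PR:

```python
import pandas as pd

# Hypothetical merged dataframe with one row per run.
df = pd.DataFrame({
    "timestamp": [1703075837, 1704890296, 1704994226],
    "etcdDisk_duration_avg": [0.012, 0.017, 0.018],
})

given_timestamp = 1704890296  # a change-point time reported by hunter
index = df[df["timestamp"] == given_timestamp].index.item()
value = df.loc[index, "etcdDisk_duration_avg"]  # metric value at the change point
print(value)  # 0.017
```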

Reply (Contributor, author):
Thank you for the comment @vishnuchalla. The main reason we want JSON is reusability of the data and logic in other tools. If we emit only the change points, the output is constrained to changepoint detection; if we emit the whole data, the same output can be used in CPT dashboards so we can view the changepoints in graphs too. I think this is the major reason we went ahead with implementing custom logic. The custom logic also allows us to add the buildUrl and other fields to the JSON document if needed. Thoughts?

Reply (Collaborator, @vishnuchalla, Mar 25, 2024):
Understood. So can we have both, regulated with a toggle between just the changepoints and the entire list of docs with changepoints? The entire data set can get very large sometimes. Also, can we have a follow-up story to only detect regressions within a given time range, to have something like this?

Reply (Contributor, author):
I'll have this toggle and the time range implemented along with the next PR, as the current PR #26 already has too many tasks going on.

output_json = parse_json_output(merged_df, change_points_by_metric)
print(json.dumps(output_json, indent=4))


def parse_json_output(merged_df, change_points_by_metric):
Review comment (Collaborator): This can be removed based on the logic suggested in the above comment.

"""json output generator function

Args:
merged_df (pd.DataFrame): the dataframe to be converted to json
change_points_by_metric (dict): change points grouped by metric name

Returns:
list: list of per-run records annotated with changepoint data
"""
df_json = merged_df.to_json(orient="records")
df_json = json.loads(df_json)

for index, entry in enumerate(df_json):
entry["metrics"] = {
key: {"value": entry.pop(key), "percentage_change": 0}
for key in entry.keys() - {"uuid", "timestamp", "buildUrl"}
}
entry["is_changepoint"] = False

for key in change_points_by_metric.keys():
for change_point in change_points_by_metric[key]:
index = change_point.index
percentage_change = (
(change_point.stats.mean_2 - change_point.stats.mean_1)
/ change_point.stats.mean_1
) * 100
df_json[index]["metrics"][key]["percentage_change"] = percentage_change
df_json[index]["is_changepoint"] = True

return df_json
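To make the shape of parse_json_output concrete: each record keeps uuid, timestamp and buildUrl, nests every metric under "metrics" with a value and a percentage_change, and flags changepoint rows with "is_changepoint". The percentage change is the relative shift between the two segment means, e.g. mean_1 = 100 and mean_2 = 126 gives (126 - 100) / 100 * 100 = 26. A hedged sketch of a downstream consumer, with made-up values:

```python
import json

# Example record shaped like parse_json_output's result (values are illustrative).
sample = json.loads("""
[
  {
    "uuid": "abc-123",
    "timestamp": 1704890296,
    "buildUrl": "https://tinyurl.com/example",
    "metrics": {
      "ovnCPU_cpu_avg": {"value": 1.9, "percentage_change": 26.0}
    },
    "is_changepoint": true
  }
]
""")

# A dashboard or other tool can keep the full data but filter on changepoints.
for run in (r for r in sample if r["is_changepoint"]):
    for metric, info in run["metrics"].items():
        if info["percentage_change"] != 0:
            print(run["uuid"], metric, info["percentage_change"])
```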


# pylint: disable=too-many-locals
def get_metric_data(ids, index, metrics, match, logger):
@@ -61,22 +106,19 @@ def get_metric_data(ids, index, metrics, match, logger):
"""
dataframe_list = []
for metric in metrics:
metric_name = metric['name']
metric_name = metric["name"]
logger.info("Collecting %s", metric_name)
metric_of_interest = metric['metric_of_interest']
metric_of_interest = metric["metric_of_interest"]

if "agg" in metric.keys():
try:
cpu = match.get_agg_metric_query(
ids, index, metric
)
agg_value = metric['agg']['value']
agg_type = metric['agg']['agg_type']
cpu = match.get_agg_metric_query(ids, index, metric)
agg_value = metric["agg"]["value"]
agg_type = metric["agg"]["agg_type"]
agg_name = agg_value + "_" + agg_type
cpu_df = match.convert_to_df(cpu, columns=["uuid","timestamp", agg_name])
cpu_df = cpu_df.rename(
columns={agg_name: metric_name+ "_" + agg_name}
)
cpu_df = match.convert_to_df(cpu, columns=["uuid", "timestamp", agg_name])
cpu_df= cpu_df.drop_duplicates(subset=['uuid'],keep='first')
cpu_df = cpu_df.rename(columns={agg_name: metric_name + "_" + agg_type})
dataframe_list.append(cpu_df)
logger.debug(cpu_df)

@@ -92,6 +134,9 @@
podl_df = match.convert_to_df(
podl, columns=["uuid", "timestamp", metric_of_interest]
)
podl_df= podl_df.drop_duplicates(subset=['uuid'],keep='first')
podl_df = podl_df.rename(columns={metric_of_interest:
metric_name + "_" + metric_of_interest})
dataframe_list.append(podl_df)
logger.debug(podl_df)
except Exception as e: # pylint: disable=broad-exception-caught
@@ -103,7 +148,7 @@ def get_metadata(test,logger):
return dataframe_list
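The new drop_duplicates and rename calls in get_metric_data follow a standard pandas pattern: keep one row per uuid, then prefix the metric column with the metric name so merged columns stay unique. A small sketch with assumed column names:

```python
import pandas as pd

# Two rows for the same uuid: keep only the first, then prefix the metric column.
podl_df = pd.DataFrame({
    "uuid": ["run-1", "run-1", "run-2"],
    "timestamp": [1, 2, 3],
    "P99": [10.0, 11.0, 12.0],  # "P99" stands in for metric_of_interest
})
podl_df = podl_df.drop_duplicates(subset=["uuid"], keep="first")
podl_df = podl_df.rename(columns={"P99": "podLatency_P99"})  # metric_name + "_" + metric_of_interest
print(podl_df)
```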


def get_metadata(test,logger):
def get_metadata(test, logger):
"""Gets metadata of the run from each test

Args:
@@ -112,12 +157,11 @@
Returns:
dict: dictionary of the metadata
"""
metadata=test['metadata']
metadata = test["metadata"]
metadata["ocpVersion"] = str(metadata["ocpVersion"])
logger.debug('metadata' + str(metadata))
logger.debug("metadata" + str(metadata))
return metadata


def get_build_urls(index, uuids,match):
"""Gets metadata of the run from each test
to get the build url
@@ -135,7 +179,6 @@
buildUrls = {run["uuid"]: run["buildUrl"] for run in test}
return buildUrls


def filter_metadata(uuid,match,logger):
"""Gets metadata of the run from each test

@@ -202,7 +245,8 @@ def set_logging(level, logger):
logger.addHandler(handler)
return logger

def load_config(config,logger):

def load_config(config, logger):
"""Loads config file

Args: