diff --git a/autorag/evaluator.py b/autorag/evaluator.py
index 4c1f5c6b4..26d1227be 100644
--- a/autorag/evaluator.py
+++ b/autorag/evaluator.py
@@ -52,6 +52,7 @@ def start_trial(self, yaml_path: str):
         node_lines = self._load_node_lines(yaml_path)
         self.__ingest(node_lines)
 
+        trial_summary_df = pd.DataFrame(columns=['node_line_name', 'node_type', 'best_module_filename'])
         for i, (node_line_name, node_line) in enumerate(node_lines.items()):
             logger.info(f'Running node line {node_line_name}...')
             node_line_dir = os.path.join(self.project_dir, trial_name, node_line_name)
@@ -60,7 +61,12 @@ def start_trial(self, yaml_path: str):
                 previous_result = self.qa_data
             previous_result = run_node_line(node_line, node_line_dir, previous_result)
 
-            # TODO: record summary of each node line to trial summary
+            summary_df = pd.read_csv(os.path.join(node_line_dir, 'summary.csv'))
+            summary_df = summary_df.assign(node_line_name=node_line_name)
+            summary_df = summary_df[list(trial_summary_df.columns)]
+            trial_summary_df = pd.concat([trial_summary_df, summary_df], ignore_index=True)
+
+        trial_summary_df.to_csv(os.path.join(self.project_dir, trial_name, 'summary.csv'), index=False)
 
     def __ingest(self, node_lines: Dict[str, List[Node]]):
         if any(list(map(lambda nodes: module_type_exists(nodes, 'bm25'), node_lines.values()))):
diff --git a/autorag/node_line.py b/autorag/node_line.py
index 99feda281..56c2e87c2 100644
--- a/autorag/node_line.py
+++ b/autorag/node_line.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from autorag.schema import Node
+from autorag.utils.util import find_best_result_path
 
 
 def make_node_lines(node_line_dict: Dict) -> List[Node]:
@@ -39,8 +40,10 @@ def run_node_line(nodes: List[Node],
         raise ValueError(f"qa.parquet does not exist in {qa_path}.")
     previous_result = pd.read_parquet(qa_path)
 
+    summary_lst = []
     for node in nodes:
         previous_result = node.run(previous_result, node_line_dir)
-        # TODO: record summary of each node to node_line summary
-
+        best_module_filename = os.path.basename(find_best_result_path(os.path.join(node_line_dir, node.node_type)))
+        summary_lst.append({'node_type': node.node_type, 'best_module_filename': best_module_filename})
+    pd.DataFrame(summary_lst).to_csv(os.path.join(node_line_dir, 'summary.csv'), index=False)
     return previous_result
diff --git a/autorag/nodes/retrieval/run.py b/autorag/nodes/retrieval/run.py
index 8f35d9b25..07bb3136a 100644
--- a/autorag/nodes/retrieval/run.py
+++ b/autorag/nodes/retrieval/run.py
@@ -49,16 +49,23 @@ def run_retrieval_node(modules: List[Callable],
                                         zip(modules, module_params)))
     list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)))  # execute save to parquet
 
-    # TODO: make summary and save it to summary.parquet
+    summary_df = pd.DataFrame({
+        'filename': list(map(lambda x: os.path.basename(x), filepaths)),
+        **{metric: list(map(lambda result: result[metric].mean(), results)) for metric in strategies.get('metrics')},
+    })
+    summary_df.to_csv(os.path.join(save_dir, 'summary.csv'), index=False)
 
     # filter by strategies
+    module_filenames = list(map(lambda x: os.path.splitext(os.path.basename(x))[0], filepaths))
     if strategies.get('speed_threshold') is not None:
-        results = filter_by_threshold(results, average_times, strategies['speed_threshold'])
-    selected_result = select_best_average(results, strategies.get('metrics'))
+        results, module_filenames = filter_by_threshold(results, average_times, strategies['speed_threshold'],
+                                                        module_filenames)
+    selected_result, selected_module_filename = select_best_average(results, strategies.get('metrics'),
+                                                                    module_filenames)
     best_result = pd.concat([previous_result, selected_result], axis=1)
 
     # save the best result to best.parquet
-    best_result.to_parquet(os.path.join(save_dir, 'best.parquet'), index=False)
+    best_result.to_parquet(os.path.join(save_dir, f'best_{selected_module_filename}.parquet'), index=False)
 
     return best_result
diff --git a/autorag/strategy.py b/autorag/strategy.py
index c0192e46e..fda06316a 100644
--- a/autorag/strategy.py
+++ b/autorag/strategy.py
@@ -1,6 +1,6 @@
 import functools
 import time
-from typing import List, Iterable
+from typing import List, Iterable, Tuple
 
 import pandas as pd
 
@@ -36,7 +36,7 @@ def wrapper(*args, **kwargs) -> List:
 
 
 @avoid_empty_result
-def filter_by_threshold(results, value, threshold) -> List:
+def filter_by_threshold(results, value, threshold, module_filename: Iterable[str]) -> Tuple[List, List[str]]:
     """
     Filter results by value's threshold.
 
@@ -44,14 +44,18 @@
     :param value: The value list to be filtered.
         It must have the same length with results.
     :param threshold: The threshold value.
+    :param module_filename: The module filename list.
+        It uses to recognize which module is filtered or not.
     :return: Filtered list of results.
     """
     assert len(results) == len(value), "results and value must have the same length."
-    filtered_results, _ = zip(*filter(lambda x: x[1] <= threshold, zip(results, value)))
-    return list(filtered_results)
+    filtered_results, _, filtered_module_filename = zip(*filter(lambda x: x[1] <= threshold,
+                                                                zip(results, value, module_filename)))
+    return list(filtered_results), list(filtered_module_filename)
 
 
-def select_best_average(results: List[pd.DataFrame], columns=Iterable[str]) -> pd.DataFrame:
+def select_best_average(results: List[pd.DataFrame], columns: Iterable[str],
+                        module_filename: List[str]) -> Tuple[pd.DataFrame, str]:
     """
     Select the best result by average value among given columns.
 
@@ -59,12 +63,15 @@ def select_best_average(results: List[pd.DataFrame], columns=Iterable[str]) -> pd.DataFrame:
         Each result must be pd.DataFrame.
     :param columns: Column names to be averaged.
         Standard to select the best result.
+    :param module_filename: The module filename list.
+        It uses to recognize which module is selected.
     :return: The best result.
     """
+    assert len(results) == len(module_filename), "results and module_filename must have the same length."
     assert all([isinstance(result, pd.DataFrame) for result in results]), \
         "results must be pd.DataFrame."
     assert all([column in result.columns for result in results for column in columns]), \
         "columns must be in the columns of results."
     each_average = [df[columns].mean(axis=1).mean() for df in results]
     best_index = each_average.index(max(each_average))
-    return results[best_index]
+    return results[best_index], module_filename[best_index]
diff --git a/autorag/utils/util.py b/autorag/utils/util.py
index f806cb9d5..98e22dadc 100644
--- a/autorag/utils/util.py
+++ b/autorag/utils/util.py
@@ -1,4 +1,5 @@
 import functools
+import os
 from typing import List, Callable, Dict
 
 import pandas as pd
@@ -44,3 +45,12 @@ def make_module_file_name(module_name: str, module_params: Dict) -> str:
     if len(module_params_str) <= 0:
         return f"{module_name}.parquet"
     return f"{module_name}=>{module_params_str}.parquet"
+
+
+def find_best_result_path(node_dir: str) -> str:
+    """
+    Find the best result filepath from node directory.
+    :param node_dir: The directory of the node.
+    :return: The filepath of the best result.
+    """
+    return list(filter(lambda x: x.endswith(".parquet") and x.startswith("best_"), os.listdir(node_dir)))[0]
diff --git a/tests/autorag/nodes/retrieval/test_run_retrieval_node.py b/tests/autorag/nodes/retrieval/test_run_retrieval_node.py
index be74f385a..d545c7053 100644
--- a/tests/autorag/nodes/retrieval/test_run_retrieval_node.py
+++ b/tests/autorag/nodes/retrieval/test_run_retrieval_node.py
@@ -40,7 +40,21 @@ def test_run_retrieval_node(node_line_dir):
     previous_result = pd.read_parquet(qa_path)
     best_result = run_retrieval_node(modules, module_params, previous_result, node_line_dir, strategies)
     assert os.path.exists(os.path.join(node_line_dir, "retrieval"))
-    assert os.path.exists(os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet"))
     expect_columns = ['qid', 'query', 'retrieval_gt', 'generation_gt',
                       'retrieved_contents', 'retrieved_ids', 'retrieve_scores', 'retrieval_f1', 'retrieval_recall']
     assert all([expect_column in best_result.columns for expect_column in expect_columns])
+    # test summary feature
+    summary_path = os.path.join(node_line_dir, "retrieval", "summary.csv")
+    bm25_top_k_path = os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet")
+    assert os.path.exists(os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet"))
+    bm25_top_k_df = pd.read_parquet(bm25_top_k_path)
+    assert os.path.exists(summary_path)
+    summary_df = pd.read_csv(summary_path)
+    assert ['filename', 'retrieval_f1', 'retrieval_recall'] == summary_df.columns.tolist()
+    assert len(summary_df) == 1
+    assert summary_df['filename'][0] == "bm25=>top_k_4.parquet"
+    assert summary_df['retrieval_f1'][0] == bm25_top_k_df['retrieval_f1'].mean()
+    assert summary_df['retrieval_recall'][0] == bm25_top_k_df['retrieval_recall'].mean()
+    # test the best file is saved properly
+    best_path = os.path.join(node_line_dir, "retrieval", "best_bm25=>top_k_4.parquet")
+    assert os.path.exists(best_path)
diff --git a/tests/autorag/test_evaluator.py b/tests/autorag/test_evaluator.py
index 9ec17d291..cb3c9afd6 100644
--- a/tests/autorag/test_evaluator.py
+++ b/tests/autorag/test_evaluator.py
@@ -73,5 +73,25 @@ def test_start_trial(evaluator):
         assert all([expect_column in each_result.columns for expect_column in expect_each_result_columns])
     expect_best_result_columns = ['qid', 'query', 'retrieval_gt', 'generation_gt', 'retrieved_contents',
                                   'retrieved_ids', 'retrieve_scores', 'retrieval_f1', 'retrieval_recall']
-    best_result = pd.read_parquet(os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'retrieval', 'best.parquet'))
+    best_result = pd.read_parquet(os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'retrieval',
+                                               'best_bm25=>top_k_50.parquet'))
     assert all([expect_column in best_result.columns for expect_column in expect_best_result_columns])
+
+    # test node line summary
+    node_line_summary_path = os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'summary.csv')
+    assert os.path.exists(node_line_summary_path)
+    node_line_summary_df = pd.read_csv(node_line_summary_path)
+    assert len(node_line_summary_df) == 1
+    assert set(node_line_summary_df.columns) == {'node_type', 'best_module_filename'}
+    assert node_line_summary_df['node_type'][0] == 'retrieval'
+    assert node_line_summary_df['best_module_filename'][0] == 'best_bm25=>top_k_50.parquet'
+
+    # test trial summary
+    trial_summary_path = os.path.join(os.getcwd(), '0', 'summary.csv')
+    assert os.path.exists(trial_summary_path)
+    trial_summary_df = pd.read_csv(trial_summary_path)
+    assert len(trial_summary_df) == 1
+    assert set(trial_summary_df.columns) == {'node_line_name', 'node_type', 'best_module_filename'}
+    assert trial_summary_df['node_line_name'][0] == 'retrieve_node_line'
+    assert trial_summary_df['node_type'][0] == 'retrieval'
+    assert trial_summary_df['best_module_filename'][0] == 'best_bm25=>top_k_50.parquet'
diff --git a/tests/autorag/test_strategy.py b/tests/autorag/test_strategy.py
index 94d737c32..525d6505b 100644
--- a/tests/autorag/test_strategy.py
+++ b/tests/autorag/test_strategy.py
@@ -16,16 +16,20 @@ def test_filter_by_threshold():
     results = [1, 2, 3, 4]
     values = [1, 2, 3, 4]
     threshold = 3
-    filtered_results = filter_by_threshold(results, values, threshold)
+    filename = ['a', 'b', 'c', 'd']
+    filtered_results, filtered_filenames = filter_by_threshold(results, values, threshold, filename)
     assert filtered_results == [1, 2, 3]
+    assert filtered_filenames == ['a', 'b', 'c']
 
 
 def test_avoid_empty_result():
     results = [1, 2, 3, 4]
     values = [1, 2, 3, 4]
     threshold = 5
-    filtered_results = filter_by_threshold(results, values, threshold)
+    filenames = ['a', 'b', 'c', 'd']
+    filtered_results, filtered_filenames = filter_by_threshold(results, values, threshold, filenames)
     assert filtered_results == [1, 2, 3, 4]
+    assert filtered_filenames == ['a', 'b', 'c', 'd']
 
 
 def test_select_best_average():
@@ -34,7 +38,9 @@ def test_select_best_average():
         pd.DataFrame({'content': ['d', 'e', 'f'], 'retrieval_f1': [0.2, 0.3, 0.4], 'retrieval_recall': [0.2, 0.3, 0.4]}),
         pd.DataFrame({'content': ['g', 'h', 'i'], 'retrieval_f1': [0.3, 0.4, 0.5], 'retrieval_recall': [0.3, 0.4, 0.5]}),
     ]
-    best_df = select_best_average(sample_dfs, ['retrieval_f1', 'retrieval_recall'])
+    sample_filenames = ['a', 'b', 'c']
+    best_df, best_filename = select_best_average(sample_dfs, ['retrieval_f1', 'retrieval_recall'], sample_filenames)
     assert best_df['content'].tolist() == ['g', 'h', 'i']
     assert best_df['retrieval_f1'].tolist() == [0.3, 0.4, 0.5]
     assert best_df['retrieval_recall'].tolist() == [0.3, 0.4, 0.5]
+    assert best_filename == 'c'
diff --git a/tests/autorag/utils/test_util.py b/tests/autorag/utils/test_util.py
index 9d0178aad..7779dc14e 100644
--- a/tests/autorag/utils/test_util.py
+++ b/tests/autorag/utils/test_util.py
@@ -1,10 +1,12 @@
 import itertools
 import os
 import pathlib
+import tempfile
 
 import pandas as pd
 
 from autorag.utils import fetch_contents
+from autorag.utils.util import find_best_result_path
 
 root_dir = pathlib.PurePath(os.path.dirname(os.path.realpath(__file__))).parent.parent
 
@@ -16,3 +18,24 @@ def test_fetch_contents():
     find_contents = fetch_contents(corpus_data, list(map(lambda x: [x], search_rows['doc_id'].tolist())))
     assert len(find_contents) == len(search_rows)
     assert list(itertools.chain.from_iterable(find_contents)) == search_rows['contents'].tolist()
+
+
+def test_find_best_result_path():
+    # Create a temporary directory
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Set up the test files
+        paths = [
+            "best_result.parquet",
+            "average_result.parquet",
+            "worst_result.parquet",
+            "best_other.txt"
+        ]
+        for file_name in paths:
+            with open(os.path.join(tmpdirname, file_name), 'w') as f:
+                f.write("test data")
+
+        # Run the function under test
+        best_path = find_best_result_path(tmpdirname)
+
+        # Check that the function returns the correct path
+        assert best_path == "best_result.parquet"
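
Below is a minimal usage sketch (not part of the diff) for inspecting the summary artifacts this change produces; the trial name '0' and the 'retrieve_node_line'/'retrieval' directory names are taken from the tests above and are assumptions about one concrete project layout:

    import os
    import pandas as pd

    project_dir = os.getcwd()  # assumed project root, as in test_evaluator.py
    trial_dir = os.path.join(project_dir, '0')  # the first trial is named '0' in the tests

    # trial-level summary: one row per node, with its node line name and best module file
    trial_summary = pd.read_csv(os.path.join(trial_dir, 'summary.csv'))
    print(trial_summary[['node_line_name', 'node_type', 'best_module_filename']])

    # node-level summary written by run_retrieval_node: per-module mean metric values
    retrieval_summary = pd.read_csv(os.path.join(trial_dir, 'retrieve_node_line', 'retrieval', 'summary.csv'))
    print(retrieval_summary)  # columns: filename, retrieval_f1, retrieval_recall

    # the winning module's result is saved as best_<module_filename>.parquet
    best_filename = trial_summary['best_module_filename'][0]
    best_df = pd.read_parquet(os.path.join(trial_dir, 'retrieve_node_line', 'retrieval', best_filename))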