add summary feature (#37)
* create and save a summary file for every node

* add module_filename param to the strategy functions to recognize which module params were selected

* test that the strategy module file name is applied properly

* fix best parquet name error

* add find_best_result_path to find the best module and module params in the node dir

* add node line summary feature

* change from summary.parquet to summary.csv

* add trial summary

* refactor node_line a little bit

---------

Co-authored-by: jeffrey <vkefhdl1@gmail.com>
vkehfdl1 and jeffrey authored Jan 22, 2024
1 parent 41968fe commit 0d7ed98
Showing 9 changed files with 114 additions and 18 deletions.
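
For orientation, this is the directory layout the commit implies, reconstructed from the tests below; the trial name '0', the node line name 'retrieve_node_line', and the bm25 filenames are the values used in the test fixtures, not requirements:

# project_dir/
#   0/                                  <- trial directory
#     summary.csv                       <- trial summary: node_line_name, node_type, best_module_filename
#     retrieve_node_line/
#       summary.csv                     <- node line summary: node_type, best_module_filename
#       retrieval/
#         summary.csv                   <- node summary: filename plus the mean of each metric
#         bm25=>top_k_50.parquet        <- one result file per module / parameter combination
#         best_bm25=>top_k_50.parquet   <- the selected best result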
8 changes: 7 additions & 1 deletion autorag/evaluator.py
@@ -52,6 +52,7 @@ def start_trial(self, yaml_path: str):
node_lines = self._load_node_lines(yaml_path)
self.__ingest(node_lines)

trial_summary_df = pd.DataFrame(columns=['node_line_name', 'node_type', 'best_module_filename'])
for i, (node_line_name, node_line) in enumerate(node_lines.items()):
logger.info(f'Running node line {node_line_name}...')
node_line_dir = os.path.join(self.project_dir, trial_name, node_line_name)
@@ -60,7 +61,12 @@ def start_trial(self, yaml_path: str):
previous_result = self.qa_data
previous_result = run_node_line(node_line, node_line_dir, previous_result)

# TODO: record summary of each node line to trial summary
summary_df = pd.read_csv(os.path.join(node_line_dir, 'summary.csv'))
summary_df = summary_df.assign(node_line_name=node_line_name)
summary_df = summary_df[list(trial_summary_df.columns)]
trial_summary_df = pd.concat([trial_summary_df, summary_df], ignore_index=True)

trial_summary_df.to_csv(os.path.join(self.project_dir, trial_name, 'summary.csv'), index=False)

def __ingest(self, node_lines: Dict[str, List[Node]]):
if any(list(map(lambda nodes: module_type_exists(nodes, 'bm25'), node_lines.values()))):
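
A minimal, standalone sketch of the trial-summary step added to start_trial above; the helper name build_trial_summary is illustrative, since in the commit this logic lives inline in the loop:

import os
import pandas as pd

def build_trial_summary(project_dir: str, trial_name: str, node_line_names: list) -> pd.DataFrame:
    # Collect each node line's summary.csv and stamp it with the node line name.
    trial_summary_df = pd.DataFrame(columns=['node_line_name', 'node_type', 'best_module_filename'])
    for node_line_name in node_line_names:
        node_line_dir = os.path.join(project_dir, trial_name, node_line_name)
        summary_df = pd.read_csv(os.path.join(node_line_dir, 'summary.csv'))
        summary_df = summary_df.assign(node_line_name=node_line_name)
        summary_df = summary_df[list(trial_summary_df.columns)]
        trial_summary_df = pd.concat([trial_summary_df, summary_df], ignore_index=True)
    return trial_summary_df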
7 changes: 5 additions & 2 deletions autorag/node_line.py
@@ -5,6 +5,7 @@
import pandas as pd

from autorag.schema import Node
from autorag.utils.util import find_best_result_path


def make_node_lines(node_line_dict: Dict) -> List[Node]:
@@ -39,8 +40,10 @@ def run_node_line(nodes: List[Node],
raise ValueError(f"qa.parquet does not exist in {qa_path}.")
previous_result = pd.read_parquet(qa_path)

summary_lst = []
for node in nodes:
previous_result = node.run(previous_result, node_line_dir)
# TODO: record summary of each node to node_line summary

best_module_filename = os.path.basename(find_best_result_path(os.path.join(node_line_dir, node.node_type)))
summary_lst.append({'node_type': node.node_type, 'best_module_filename': best_module_filename})
pd.DataFrame(summary_lst).to_csv(os.path.join(node_line_dir, 'summary.csv'), index=False)
return previous_result
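
For a node line containing a single retrieval node whose best module file is best_bm25=>top_k_50.parquet, the summary.csv written above would hold one row; a sketch consistent with the evaluator test below:

import pandas as pd

# Equivalent to what run_node_line writes as <node_line_dir>/summary.csv in this case.
node_line_summary = pd.DataFrame([
    {'node_type': 'retrieval', 'best_module_filename': 'best_bm25=>top_k_50.parquet'},
])
print(node_line_summary.to_csv(index=False))
# node_type,best_module_filename
# retrieval,best_bm25=>top_k_50.parquet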
15 changes: 11 additions & 4 deletions autorag/nodes/retrieval/run.py
@@ -49,16 +49,23 @@ def run_retrieval_node(modules: List[Callable],
zip(modules, module_params)))
list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))) # execute save to parquet

# TODO: make summary and save it to summary.parquet
summary_df = pd.DataFrame({
'filename': list(map(lambda x: os.path.basename(x), filepaths)),
**{metric: list(map(lambda result: result[metric].mean(), results)) for metric in strategies.get('metrics')},
})
summary_df.to_csv(os.path.join(save_dir, 'summary.csv'), index=False)

# filter by strategies
module_filenames = list(map(lambda x: os.path.splitext(os.path.basename(x))[0], filepaths))
if strategies.get('speed_threshold') is not None:
results = filter_by_threshold(results, average_times, strategies['speed_threshold'])
selected_result = select_best_average(results, strategies.get('metrics'))
results, module_filenames = filter_by_threshold(results, average_times, strategies['speed_threshold'],
module_filenames)
selected_result, selected_module_filename = select_best_average(results, strategies.get('metrics'),
module_filenames)
best_result = pd.concat([previous_result, selected_result], axis=1)

# save the best result to best.parquet
best_result.to_parquet(os.path.join(save_dir, 'best.parquet'), index=False)
best_result.to_parquet(os.path.join(save_dir, f'best_{selected_module_filename}.parquet'), index=False)
return best_result


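
The per-node summary.csv above holds one row per executed module, with the mean of every configured metric. A self-contained toy illustration; the second filename and all scores are made up:

import os
import pandas as pd

filepaths = ['retrieval/bm25=>top_k_4.parquet', 'retrieval/bm25=>top_k_10.parquet']
results = [
    pd.DataFrame({'retrieval_f1': [0.2, 0.4], 'retrieval_recall': [0.5, 0.7]}),
    pd.DataFrame({'retrieval_f1': [0.3, 0.5], 'retrieval_recall': [0.6, 0.8]}),
]
metrics = ['retrieval_f1', 'retrieval_recall']

summary_df = pd.DataFrame({
    'filename': [os.path.basename(path) for path in filepaths],
    **{metric: [result[metric].mean() for result in results] for metric in metrics},
})
print(summary_df)  # one row per module file, each metric column holding that file's mean score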
19 changes: 13 additions & 6 deletions autorag/strategy.py
@@ -1,6 +1,6 @@
import functools
import time
from typing import List, Iterable
from typing import List, Iterable, Tuple

import pandas as pd

@@ -36,35 +36,42 @@ def wrapper(*args, **kwargs) -> List:


@avoid_empty_result
def filter_by_threshold(results, value, threshold) -> List:
def filter_by_threshold(results, value, threshold, module_filename: Iterable[str]) -> Tuple[List, List[str]]:
"""
Filter results by value's threshold.
:param results: The result list to be filtered.
:param value: The value list to be filtered.
It must have the same length as results.
:param threshold: The threshold value.
:param module_filename: The module filename list.
It is used to recognize which modules passed the filter.
:return: Filtered list of results.
"""
assert len(results) == len(value), "results and value must have the same length."
filtered_results, _ = zip(*filter(lambda x: x[1] <= threshold, zip(results, value)))
return list(filtered_results)
filtered_results, _, filtered_module_filename = zip(*filter(lambda x: x[1] <= threshold,
zip(results, value, module_filename)))
return list(filtered_results), list(filtered_module_filename)


def select_best_average(results: List[pd.DataFrame], columns=Iterable[str]) -> pd.DataFrame:
def select_best_average(results: List[pd.DataFrame], columns: Iterable[str],
module_filename: List[str]) -> Tuple[pd.DataFrame, str]:
"""
Select the best result by average value among given columns.
:param results: The list of results.
Each result must be pd.DataFrame.
:param columns: Column names to be averaged.
Standard to select the best result.
:param module_filename: The module filename list.
It is used to recognize which module is selected.
:return: The best result.
"""
assert len(results) == len(module_filename), "results and module_filename must have the same length."
assert all([isinstance(result, pd.DataFrame) for result in results]), \
"results must be pd.DataFrame."
assert all([column in result.columns for result in results for column in columns]), \
"columns must be in the columns of results."
each_average = [df[columns].mean(axis=1).mean() for df in results]
best_index = each_average.index(max(each_average))
return results[best_index]
return results[best_index], module_filename[best_index]
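
A hedged usage sketch of the two updated helpers together, mirroring the tests below; the module filenames and timing values are made up:

import pandas as pd
from autorag.strategy import filter_by_threshold, select_best_average

results = [
    pd.DataFrame({'retrieval_f1': [0.1, 0.2], 'retrieval_recall': [0.1, 0.2]}),
    pd.DataFrame({'retrieval_f1': [0.3, 0.4], 'retrieval_recall': [0.3, 0.4]}),
]
average_times = [1.0, 7.0]
filenames = ['bm25=>top_k_4', 'bm25=>top_k_10']

# Drop modules slower than the threshold, keeping filenames aligned with results.
results, filenames = filter_by_threshold(results, average_times, 5, filenames)
# Pick the result with the highest mean over the metric columns, plus its filename.
best_df, best_filename = select_best_average(results, ['retrieval_f1', 'retrieval_recall'], filenames)
assert best_filename == 'bm25=>top_k_4'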
10 changes: 10 additions & 0 deletions autorag/utils/util.py
@@ -1,4 +1,5 @@
import functools
import os
from typing import List, Callable, Dict

import pandas as pd
@@ -44,3 +45,12 @@ def make_module_file_name(module_name: str, module_params: Dict) -> str:
if len(module_params_str) <= 0:
return f"{module_name}.parquet"
return f"{module_name}=>{module_params_str}.parquet"


def find_best_result_path(node_dir: str) -> str:
"""
Find the best result filepath from node directory.
:param node_dir: The directory of the node.
:return: The filepath of the best result.
"""
return list(filter(lambda x: x.endswith(".parquet") and x.startswith("best_"), os.listdir(node_dir)))[0]
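
A small sketch of the naming convention find_best_result_path relies on: run_retrieval_node saves the winning result as best_<module_filename>.parquet next to the per-module parquet files, so the lookup is a prefix/suffix match. The directory contents below are illustrative:

import os
import tempfile
from autorag.utils.util import find_best_result_path

with tempfile.TemporaryDirectory() as node_dir:
    for name in ['bm25=>top_k_4.parquet', 'best_bm25=>top_k_4.parquet', 'summary.csv']:
        open(os.path.join(node_dir, name), 'w').close()
    # Only files that start with 'best_' and end with '.parquet' qualify.
    assert find_best_result_path(node_dir) == 'best_bm25=>top_k_4.parquet'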
16 changes: 15 additions & 1 deletion tests/autorag/nodes/retrieval/test_run_retrieval_node.py
@@ -40,7 +40,21 @@ def test_run_retrieval_node(node_line_dir):
previous_result = pd.read_parquet(qa_path)
best_result = run_retrieval_node(modules, module_params, previous_result, node_line_dir, strategies)
assert os.path.exists(os.path.join(node_line_dir, "retrieval"))
assert os.path.exists(os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet"))
expect_columns = ['qid', 'query', 'retrieval_gt', 'generation_gt',
'retrieved_contents', 'retrieved_ids', 'retrieve_scores', 'retrieval_f1', 'retrieval_recall']
assert all([expect_column in best_result.columns for expect_column in expect_columns])
# test summary feature
summary_path = os.path.join(node_line_dir, "retrieval", "summary.csv")
bm25_top_k_path = os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet")
assert os.path.exists(os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet"))
bm25_top_k_df = pd.read_parquet(bm25_top_k_path)
assert os.path.exists(summary_path)
summary_df = pd.read_csv(summary_path)
assert ['filename', 'retrieval_f1', 'retrieval_recall'] == summary_df.columns.tolist()
assert len(summary_df) == 1
assert summary_df['filename'][0] == "bm25=>top_k_4.parquet"
assert summary_df['retrieval_f1'][0] == bm25_top_k_df['retrieval_f1'].mean()
assert summary_df['retrieval_recall'][0] == bm25_top_k_df['retrieval_recall'].mean()
# test the best file is saved properly
best_path = os.path.join(node_line_dir, "retrieval", "best_bm25=>top_k_4.parquet")
assert os.path.exists(best_path)
22 changes: 21 additions & 1 deletion tests/autorag/test_evaluator.py
@@ -73,5 +73,25 @@ def test_start_trial(evaluator):
assert all([expect_column in each_result.columns for expect_column in expect_each_result_columns])
expect_best_result_columns = ['qid', 'query', 'retrieval_gt', 'generation_gt',
'retrieved_contents', 'retrieved_ids', 'retrieve_scores', 'retrieval_f1', 'retrieval_recall']
best_result = pd.read_parquet(os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'retrieval', 'best.parquet'))
best_result = pd.read_parquet(os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'retrieval',
'best_bm25=>top_k_50.parquet'))
assert all([expect_column in best_result.columns for expect_column in expect_best_result_columns])

# test node line summary
node_line_summary_path = os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'summary.csv')
assert os.path.exists(node_line_summary_path)
node_line_summary_df = pd.read_csv(node_line_summary_path)
assert len(node_line_summary_df) == 1
assert set(node_line_summary_df.columns) == {'node_type', 'best_module_filename'}
assert node_line_summary_df['node_type'][0] == 'retrieval'
assert node_line_summary_df['best_module_filename'][0] == 'best_bm25=>top_k_50.parquet'

# test trial summary
trial_summary_path = os.path.join(os.getcwd(), '0', 'summary.csv')
assert os.path.exists(trial_summary_path)
trial_summary_df = pd.read_csv(trial_summary_path)
assert len(trial_summary_df) == 1
assert set(trial_summary_df.columns) == {'node_line_name', 'node_type', 'best_module_filename'}
assert trial_summary_df['node_line_name'][0] == 'retrieve_node_line'
assert trial_summary_df['node_type'][0] == 'retrieval'
assert trial_summary_df['best_module_filename'][0] == 'best_bm25=>top_k_50.parquet'
12 changes: 9 additions & 3 deletions tests/autorag/test_strategy.py
@@ -16,16 +16,20 @@ def test_filter_by_threshold():
results = [1, 2, 3, 4]
values = [1, 2, 3, 4]
threshold = 3
filtered_results = filter_by_threshold(results, values, threshold)
filename = ['a', 'b', 'c', 'd']
filtered_results, filtered_filenames = filter_by_threshold(results, values, threshold, filename)
assert filtered_results == [1, 2, 3]
assert filtered_filenames == ['a', 'b', 'c']


def test_avoid_empty_result():
results = [1, 2, 3, 4]
values = [1, 2, 3, 4]
threshold = 5
filtered_results = filter_by_threshold(results, values, threshold)
filenames = ['a', 'b', 'c', 'd']
filtered_results, filtered_filenames = filter_by_threshold(results, values, threshold, filenames)
assert filtered_results == [1, 2, 3, 4]
assert filtered_filenames == ['a', 'b', 'c', 'd']


def test_select_best_average():
@@ -34,7 +38,9 @@ def test_select_best_average():
pd.DataFrame({'content': ['d', 'e', 'f'], 'retrieval_f1': [0.2, 0.3, 0.4], 'retrieval_recall': [0.2, 0.3, 0.4]}),
pd.DataFrame({'content': ['g', 'h', 'i'], 'retrieval_f1': [0.3, 0.4, 0.5], 'retrieval_recall': [0.3, 0.4, 0.5]}),
]
best_df = select_best_average(sample_dfs, ['retrieval_f1', 'retrieval_recall'])
sample_filenames = ['a', 'b', 'c']
best_df, best_filename = select_best_average(sample_dfs, ['retrieval_f1', 'retrieval_recall'], sample_filenames)
assert best_df['content'].tolist() == ['g', 'h', 'i']
assert best_df['retrieval_f1'].tolist() == [0.3, 0.4, 0.5]
assert best_df['retrieval_recall'].tolist() == [0.3, 0.4, 0.5]
assert best_filename == 'c'
23 changes: 23 additions & 0 deletions tests/autorag/utils/test_util.py
@@ -1,10 +1,12 @@
import itertools
import os
import pathlib
import tempfile

import pandas as pd

from autorag.utils import fetch_contents
from autorag.utils.util import find_best_result_path

root_dir = pathlib.PurePath(os.path.dirname(os.path.realpath(__file__))).parent.parent

@@ -16,3 +18,24 @@ def test_fetch_contents():
find_contents = fetch_contents(corpus_data, list(map(lambda x: [x], search_rows['doc_id'].tolist())))
assert len(find_contents) == len(search_rows)
assert list(itertools.chain.from_iterable(find_contents)) == search_rows['contents'].tolist()


def test_find_best_result_path():
# Create a temporary directory
with tempfile.TemporaryDirectory() as tmpdirname:
# Set up the test files
paths = [
"best_result.parquet",
"average_result.parquet",
"worst_result.parquet",
"best_other.txt"
]
for file_name in paths:
with open(os.path.join(tmpdirname, file_name), 'w') as f:
f.write("test data")

# Run the function under test
best_path = find_best_result_path(tmpdirname)

# Check that the function returns the correct path
assert best_path == "best_result.parquet"
