add summary feature #37

Merged · 9 commits · Jan 22, 2024
8 changes: 7 additions & 1 deletion autorag/evaluator.py
@@ -52,6 +52,7 @@ def start_trial(self, yaml_path: str):
node_lines = self._load_node_lines(yaml_path)
self.__ingest(node_lines)

trial_summary_df = pd.DataFrame(columns=['node_line_name', 'node_type', 'best_module_filename'])
for i, (node_line_name, node_line) in enumerate(node_lines.items()):
logger.info(f'Running node line {node_line_name}...')
node_line_dir = os.path.join(self.project_dir, trial_name, node_line_name)
@@ -60,7 +61,12 @@ def start_trial(self, yaml_path: str):
previous_result = self.qa_data
previous_result = run_node_line(node_line, node_line_dir, previous_result)

# TODO: record summary of each node line to trial summary
summary_df = pd.read_csv(os.path.join(node_line_dir, 'summary.csv'))
summary_df = summary_df.assign(node_line_name=node_line_name)
summary_df = summary_df[list(trial_summary_df.columns)]
trial_summary_df = pd.concat([trial_summary_df, summary_df], ignore_index=True)

trial_summary_df.to_csv(os.path.join(self.project_dir, trial_name, 'summary.csv'), index=False)

def __ingest(self, node_lines: Dict[str, List[Node]]):
if any(list(map(lambda nodes: module_type_exists(nodes, 'bm25'), node_lines.values()))):
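For context, here is a minimal sketch (not part of this diff) of how the new trial-level summary could be inspected after `start_trial` finishes. The project directory, the trial name `'0'`, and the column set are assumptions taken from the code above.

```python
# Sketch only: read the trial-level summary that start_trial now writes.
# The path layout <project_dir>/<trial_name>/summary.csv is assumed from the diff above.
import os

import pandas as pd

project_dir = "./my_project"  # hypothetical project directory
trial_name = "0"              # hypothetical trial name

trial_summary = pd.read_csv(os.path.join(project_dir, trial_name, "summary.csv"))
# Columns written by the diff above: node_line_name, node_type, best_module_filename
print(trial_summary[["node_line_name", "node_type", "best_module_filename"]])
```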
7 changes: 5 additions & 2 deletions autorag/node_line.py
@@ -5,6 +5,7 @@
import pandas as pd

from autorag.schema import Node
from autorag.utils.util import find_best_result_path


def make_node_lines(node_line_dict: Dict) -> List[Node]:
@@ -39,8 +40,10 @@ def run_node_line(nodes: List[Node],
raise ValueError(f"qa.parquet does not exist in {qa_path}.")
previous_result = pd.read_parquet(qa_path)

summary_lst = []
for node in nodes:
previous_result = node.run(previous_result, node_line_dir)
# TODO: record summary of each node to node_line summary

best_module_filename = os.path.basename(find_best_result_path(os.path.join(node_line_dir, node.node_type)))
summary_lst.append({'node_type': node.node_type, 'best_module_filename': best_module_filename})
pd.DataFrame(summary_lst).to_csv(os.path.join(node_line_dir, 'summary.csv'), index=False)
return previous_result
15 changes: 11 additions & 4 deletions autorag/nodes/retrieval/run.py
@@ -49,16 +49,23 @@ def run_retrieval_node(modules: List[Callable],
zip(modules, module_params)))
list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))) # execute save to parquet

# TODO: make summary and save it to summary.parquet
summary_df = pd.DataFrame({
'filename': list(map(lambda x: os.path.basename(x), filepaths)),
**{metric: list(map(lambda result: result[metric].mean(), results)) for metric in strategies.get('metrics')},
})
summary_df.to_csv(os.path.join(save_dir, 'summary.csv'), index=False)

# filter by strategies
module_filenames = list(map(lambda x: os.path.splitext(os.path.basename(x))[0], filepaths))
if strategies.get('speed_threshold') is not None:
results = filter_by_threshold(results, average_times, strategies['speed_threshold'])
selected_result = select_best_average(results, strategies.get('metrics'))
results, module_filenames = filter_by_threshold(results, average_times, strategies['speed_threshold'],
module_filenames)
selected_result, selected_module_filename = select_best_average(results, strategies.get('metrics'),
module_filenames)
best_result = pd.concat([previous_result, selected_result], axis=1)

# save the best result to best.parquet
best_result.to_parquet(os.path.join(save_dir, 'best.parquet'), index=False)
best_result.to_parquet(os.path.join(save_dir, f'best_{selected_module_filename}.parquet'), index=False)
return best_result


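To illustrate the summary rows built above, here is a small standalone sketch with made-up numbers (not from the PR) showing how each module's result frame collapses into one row of metric means:

```python
# Illustration with hypothetical data: one summary row per module result,
# where each metric column is reduced to its mean.
import pandas as pd

metrics = ["retrieval_f1", "retrieval_recall"]        # assumed metric names
result = pd.DataFrame({"retrieval_f1": [0.25, 0.75],  # hypothetical per-query scores
                       "retrieval_recall": [0.5, 1.0]})

row = {"filename": "bm25=>top_k_4.parquet",           # hypothetical module filename
       **{metric: result[metric].mean() for metric in metrics}}
print(row)  # means: retrieval_f1 -> 0.5, retrieval_recall -> 0.75
```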
19 changes: 13 additions & 6 deletions autorag/strategy.py
@@ -1,6 +1,6 @@
import functools
import time
from typing import List, Iterable
from typing import List, Iterable, Tuple

import pandas as pd

@@ -36,35 +36,42 @@ def wrapper(*args, **kwargs) -> List:


@avoid_empty_result
def filter_by_threshold(results, value, threshold) -> List:
def filter_by_threshold(results, value, threshold, module_filename: Iterable[str]) -> Tuple[List, List[str]]:
"""
Filter results by value's threshold.

:param results: The result list to be filtered.
:param value: The value list to be filtered.
It must have the same length as results.
:param threshold: The threshold value.
:param module_filename: The module filename list.
It is used to recognize which modules were filtered out.
:return: The filtered results and their corresponding module filenames.
"""
assert len(results) == len(value), "results and value must have the same length."
filtered_results, _ = zip(*filter(lambda x: x[1] <= threshold, zip(results, value)))
return list(filtered_results)
filtered_results, _, filtered_module_filename = zip(*filter(lambda x: x[1] <= threshold,
zip(results, value, module_filename)))
return list(filtered_results), list(filtered_module_filename)


def select_best_average(results: List[pd.DataFrame], columns=Iterable[str]) -> pd.DataFrame:
def select_best_average(results: List[pd.DataFrame], columns: Iterable[str],
module_filename: List[str]) -> Tuple[pd.DataFrame, str]:
"""
Select the best result by average value among given columns.

:param results: The list of results.
Each result must be pd.DataFrame.
:param columns: Column names to be averaged.
The standard used to select the best result.
:param module_filename: The module filename list.
It is used to recognize which module is selected.
:return: The best result and its module filename.
"""
assert len(results) == len(module_filename), "results and module_filename must have the same length."
assert all([isinstance(result, pd.DataFrame) for result in results]), \
"results must be pd.DataFrame."
assert all([column in result.columns for result in results for column in columns]), \
"columns must be in the columns of results."
each_average = [df[columns].mean(axis=1).mean() for df in results]
best_index = each_average.index(max(each_average))
return results[best_index]
return results[best_index], module_filename[best_index]
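A usage sketch (assuming the signatures introduced above) of chaining the two helpers the way `run_retrieval_node` does; the result data, times, and filenames are hypothetical:

```python
# Sketch only: carry module filenames through filtering and best-selection.
import pandas as pd

from autorag.strategy import filter_by_threshold, select_best_average

results = [pd.DataFrame({"retrieval_f1": [0.25, 0.75]}),
           pd.DataFrame({"retrieval_f1": [0.5, 1.0]})]
times = [0.1, 5.0]                                  # hypothetical average execution times
filenames = ["bm25=>top_k_4", "vectordb=>top_k_4"]  # hypothetical module filenames

# Drop modules slower than the speed threshold, keeping filenames aligned with results.
results, filenames = filter_by_threshold(results, times, 1.0, filenames)
# Pick the result with the best mean metric, together with its filename.
best_df, best_filename = select_best_average(results, ["retrieval_f1"], filenames)
print(best_filename)  # 'bm25=>top_k_4' (the only module under the threshold)
```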
10 changes: 10 additions & 0 deletions autorag/utils/util.py
@@ -1,4 +1,5 @@
import functools
import os
from typing import List, Callable, Dict

import pandas as pd
@@ -44,3 +45,12 @@ def make_module_file_name(module_name: str, module_params: Dict) -> str:
if len(module_params_str) <= 0:
return f"{module_name}.parquet"
return f"{module_name}=>{module_params_str}.parquet"


def find_best_result_path(node_dir: str) -> str:
"""
Find the best result filepath in the node directory.
:param node_dir: The directory of the node.
:return: The filepath of the best result.
"""
return list(filter(lambda x: x.endswith(".parquet") and x.startswith("best_"), os.listdir(node_dir)))[0]
16 changes: 15 additions & 1 deletion tests/autorag/nodes/retrieval/test_run_retrieval_node.py
@@ -40,7 +40,21 @@ def test_run_retrieval_node(node_line_dir):
previous_result = pd.read_parquet(qa_path)
best_result = run_retrieval_node(modules, module_params, previous_result, node_line_dir, strategies)
assert os.path.exists(os.path.join(node_line_dir, "retrieval"))
assert os.path.exists(os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet"))
expect_columns = ['qid', 'query', 'retrieval_gt', 'generation_gt',
'retrieved_contents', 'retrieved_ids', 'retrieve_scores', 'retrieval_f1', 'retrieval_recall']
assert all([expect_column in best_result.columns for expect_column in expect_columns])
# test summary feature
summary_path = os.path.join(node_line_dir, "retrieval", "summary.csv")
bm25_top_k_path = os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet")
assert os.path.exists(os.path.join(node_line_dir, "retrieval", "bm25=>top_k_4.parquet"))
bm25_top_k_df = pd.read_parquet(bm25_top_k_path)
assert os.path.exists(summary_path)
summary_df = pd.read_csv(summary_path)
assert ['filename', 'retrieval_f1', 'retrieval_recall'] == summary_df.columns.tolist()
assert len(summary_df) == 1
assert summary_df['filename'][0] == "bm25=>top_k_4.parquet"
assert summary_df['retrieval_f1'][0] == bm25_top_k_df['retrieval_f1'].mean()
assert summary_df['retrieval_recall'][0] == bm25_top_k_df['retrieval_recall'].mean()
# test the best file is saved properly
best_path = os.path.join(node_line_dir, "retrieval", "best_bm25=>top_k_4.parquet")
assert os.path.exists(best_path)
22 changes: 21 additions & 1 deletion tests/autorag/test_evaluator.py
@@ -73,5 +73,25 @@ def test_start_trial(evaluator):
assert all([expect_column in each_result.columns for expect_column in expect_each_result_columns])
expect_best_result_columns = ['qid', 'query', 'retrieval_gt', 'generation_gt',
'retrieved_contents', 'retrieved_ids', 'retrieve_scores', 'retrieval_f1', 'retrieval_recall']
best_result = pd.read_parquet(os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'retrieval', 'best.parquet'))
best_result = pd.read_parquet(os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'retrieval',
'best_bm25=>top_k_50.parquet'))
assert all([expect_column in best_result.columns for expect_column in expect_best_result_columns])

# test node line summary
node_line_summary_path = os.path.join(os.getcwd(), '0', 'retrieve_node_line', 'summary.csv')
assert os.path.exists(node_line_summary_path)
node_line_summary_df = pd.read_csv(node_line_summary_path)
assert len(node_line_summary_df) == 1
assert set(node_line_summary_df.columns) == {'node_type', 'best_module_filename'}
assert node_line_summary_df['node_type'][0] == 'retrieval'
assert node_line_summary_df['best_module_filename'][0] == 'best_bm25=>top_k_50.parquet'

# test trial summary
trial_summary_path = os.path.join(os.getcwd(), '0', 'summary.csv')
assert os.path.exists(trial_summary_path)
trial_summary_df = pd.read_csv(trial_summary_path)
assert len(trial_summary_df) == 1
assert set(trial_summary_df.columns) == {'node_line_name', 'node_type', 'best_module_filename'}
assert trial_summary_df['node_line_name'][0] == 'retrieve_node_line'
assert trial_summary_df['node_type'][0] == 'retrieval'
assert trial_summary_df['best_module_filename'][0] == 'best_bm25=>top_k_50.parquet'
12 changes: 9 additions & 3 deletions tests/autorag/test_strategy.py
@@ -16,16 +16,20 @@ def test_filter_by_threshold():
results = [1, 2, 3, 4]
values = [1, 2, 3, 4]
threshold = 3
filtered_results = filter_by_threshold(results, values, threshold)
filename = ['a', 'b', 'c', 'd']
filtered_results, filtered_filenames = filter_by_threshold(results, values, threshold, filename)
assert filtered_results == [1, 2, 3]
assert filtered_filenames == ['a', 'b', 'c']


def test_avoid_empty_result():
results = [1, 2, 3, 4]
values = [1, 2, 3, 4]
threshold = 5
filtered_results = filter_by_threshold(results, values, threshold)
filenames = ['a', 'b', 'c', 'd']
filtered_results, filtered_filenames = filter_by_threshold(results, values, threshold, filenames)
assert filtered_results == [1, 2, 3, 4]
assert filtered_filenames == ['a', 'b', 'c', 'd']


def test_select_best_average():
@@ -34,7 +38,9 @@ def test_select_best_average():
pd.DataFrame({'content': ['d', 'e', 'f'], 'retrieval_f1': [0.2, 0.3, 0.4], 'retrieval_recall': [0.2, 0.3, 0.4]}),
pd.DataFrame({'content': ['g', 'h', 'i'], 'retrieval_f1': [0.3, 0.4, 0.5], 'retrieval_recall': [0.3, 0.4, 0.5]}),
]
best_df = select_best_average(sample_dfs, ['retrieval_f1', 'retrieval_recall'])
sample_filenames = ['a', 'b', 'c']
best_df, best_filename = select_best_average(sample_dfs, ['retrieval_f1', 'retrieval_recall'], sample_filenames)
assert best_df['content'].tolist() == ['g', 'h', 'i']
assert best_df['retrieval_f1'].tolist() == [0.3, 0.4, 0.5]
assert best_df['retrieval_recall'].tolist() == [0.3, 0.4, 0.5]
assert best_filename == 'c'
23 changes: 23 additions & 0 deletions tests/autorag/utils/test_util.py
@@ -1,10 +1,12 @@
import itertools
import os
import pathlib
import tempfile

import pandas as pd

from autorag.utils import fetch_contents
from autorag.utils.util import find_best_result_path

root_dir = pathlib.PurePath(os.path.dirname(os.path.realpath(__file__))).parent.parent

@@ -16,3 +18,24 @@ def test_fetch_contents():
find_contents = fetch_contents(corpus_data, list(map(lambda x: [x], search_rows['doc_id'].tolist())))
assert len(find_contents) == len(search_rows)
assert list(itertools.chain.from_iterable(find_contents)) == search_rows['contents'].tolist()


def test_find_best_result_path():
# Create a temporary directory
with tempfile.TemporaryDirectory() as tmpdirname:
# Set up the test files
paths = [
"best_result.parquet",
"average_result.parquet",
"worst_result.parquet",
"best_other.txt"
]
for file_name in paths:
with open(os.path.join(tmpdirname, file_name), 'w') as f:
f.write("test data")

# Run the function under test
best_path = find_best_result_path(tmpdirname)

# Check that the function returns the correct path
assert best_path == "best_result.parquet"