diff --git a/autorag/deploy.py b/autorag/deploy.py
index 863021622..29c5c8f16 100644
--- a/autorag/deploy.py
+++ b/autorag/deploy.py
@@ -9,7 +9,7 @@
 from fastapi import FastAPI
 from pydantic import BaseModel

-from autorag.schema.module import SUPPORT_MODULES
+from autorag.support import get_support_modules
 from autorag.utils.util import load_summary_file

 logger = logging.getLogger("AutoRAG")
@@ -131,7 +131,7 @@ def run(self, query: str, result_column: str = "answer"):
             module = node['modules'][0]
             module_type = module.pop('module_type')
             module_params = module
-            previous_result = SUPPORT_MODULES[module_type](
+            previous_result = get_support_modules(module_type)(
                 project_dir=self.project_dir,
                 previous_result=previous_result,
                 **module_params
diff --git a/autorag/nodes/generator/run.py b/autorag/nodes/generator/run.py
index 495c39ebd..1f291ffed 100644
--- a/autorag/nodes/generator/run.py
+++ b/autorag/nodes/generator/run.py
@@ -69,7 +69,7 @@ def run_generator_node(modules: List[Callable],
     selected_result, selected_filename = select_best_average(results, strategies.get('metrics'), filenames)
     best_result = pd.concat([previous_result, selected_result], axis=1)

-    # add summary.csv 'is_best' column
+    # add 'is_best' column to the summary file
     summary_df['is_best'] = summary_df['filename'] == selected_filename

     # save files
diff --git a/autorag/nodes/promptmaker/run.py b/autorag/nodes/promptmaker/run.py
new file mode 100644
index 000000000..31c277254
--- /dev/null
+++ b/autorag/nodes/promptmaker/run.py
@@ -0,0 +1,164 @@
+import os
+import pathlib
+from copy import deepcopy
+from typing import List, Callable, Dict, Optional
+
+import pandas as pd
+
+from autorag.evaluate import evaluate_generation
+from autorag.support import get_support_modules
+from autorag.strategy import measure_speed, filter_by_threshold, select_best_average
+from autorag.utils import validate_qa_dataset
+from autorag.utils.util import make_combinations, explode, make_module_file_name
+
+
+def run_prompt_maker_node(modules: List[Callable],
+                          module_params: List[Dict],
+                          previous_result: pd.DataFrame,
+                          node_line_dir: str,
+                          strategies: Dict,
+                          ) -> pd.DataFrame:
+    """
+    Run prompt maker node.
+    With this function, you can select the best prompt maker module.
+    By default, when you use only one module, the evaluation will be skipped.
+    If you want to select the best prompt among modules, you can use strategies.
+    When you use them, you must pass 'generator_modules' and their parameters in strategies,
+    because this node uses generator modules and generator metrics for its evaluation.
+    It is recommended to use a single generator module and parameter set for evaluation,
+    but you can use multiple modules and parameter sets.
+    When you don't set a generator module in strategies, the default generator module is used.
+    The default generator module is llama_index_llm with openai gpt-3.5-turbo model.
+
+    :param modules: Prompt maker modules to run.
+    :param module_params: Prompt maker module parameters.
+    :param previous_result: Previous result dataframe.
+        Could be query expansion's best result or qa data.
+    :param node_line_dir: This node line's directory.
+    :param strategies: Strategies for prompt maker node.
+    :return: The best result dataframe.
+        It contains the previous result columns and the prompt maker's result column, 'prompts'.
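+
+    For example, a strategies dict for this node might look like the following
+    (an illustrative sketch only; the metric names, threshold, and model name are examples):
+
+        strategies = {
+            'metrics': ['bleu', 'rouge'],
+            'speed_threshold': 5,
+            'generator_modules': [{
+                'module_type': 'llama_index_llm',
+                'llm': 'openai',
+                'model_name': 'gpt-3.5-turbo',
+            }],
+        }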
+ """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + node_dir = os.path.join(node_line_dir, "prompt_maker") + if not os.path.exists(node_dir): + os.makedirs(node_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + + # run modules + results, execution_times = zip(*map(lambda task: measure_speed( + task[0], project_dir=project_dir, previous_result=previous_result, **task[1]), zip(modules, module_params))) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # save results to folder + pseudo_module_params = deepcopy(module_params) + for i, module_param in enumerate(pseudo_module_params): + module_param['prompt'] = str(i) + filepaths = list(map(lambda x: os.path.join(node_dir, make_module_file_name(x[0].__name__, x[1])), + zip(modules, pseudo_module_params))) + list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + # make summary file + summary_df = pd.DataFrame({ + 'filename': filenames, + 'module_name': list(map(lambda module: module.__name__, modules)), + 'module_params': module_params, + 'execution_time': average_times, + }) + + # Run evaluation when there are more than one module. + if len(modules) > 1: + # pop general keys from strategies (e.g. metrics, speed_threshold) + general_key = ['metrics', 'speed_threshold'] + general_strategy = dict(filter(lambda x: x[0] in general_key, strategies.items())) + extra_strategy = dict(filter(lambda x: x[0] not in general_key, strategies.items())) + + # first, filter by threshold if it is enabled. + if general_strategy.get('speed_threshold') is not None: + results, filenames = filter_by_threshold(results, average_times, general_strategy['speed_threshold'], + filenames) + + # run metrics before filtering + if general_strategy.get('metrics') is None: + raise ValueError("You must at least one metrics for prompt maker evaluation.") + + # get generator modules from strategy + generator_callables, generator_params = make_generator_callable_params(extra_strategy) + + # get generation_gt + qa_data = pd.read_parquet(os.path.join(project_dir, "data", "qa.parquet")) + validate_qa_dataset(qa_data) + generation_gt = qa_data['generation_gt'].tolist() + generation_gt = list(map(lambda x: x.tolist(), generation_gt)) + + # run evaluations + evaluation_results = list(map(lambda result: evaluate_one_prompt_maker_node( + generator_callables, generator_params, result['prompts'].tolist(), + generation_gt, general_strategy['metrics'], project_dir), results)) + + for metric_name in general_strategy['metrics']: + summary_df[f'prompt_maker_{metric_name}'] = list(map(lambda x: x[metric_name].mean(), evaluation_results)) + + best_result, best_filename = select_best_average(evaluation_results, general_strategy['metrics'], filenames) + # change metric name columns to prompt_maker_metric_name + best_result = best_result.rename(columns={ + metric_name: f'prompt_maker_{metric_name}' for metric_name in strategies['metrics']}) + best_result = best_result.drop(columns=['generated_texts']) + else: + best_result, best_filename = results[0], filenames[0] + + # add 'is_best' column at summary file + summary_df['is_best'] = summary_df['filename'] == best_filename + + best_result = pd.concat([previous_result, best_result], axis=1) + + # save files + summary_df.to_parquet(os.path.join(node_dir, "summary.parquet"), index=False) + best_result.to_parquet(os.path.join(node_dir, 
f"best_{os.path.splitext(best_filename)[0]}.parquet"), index=False) + + return best_result + + +def make_generator_callable_params(strategy_dict: Dict): + node_dict = deepcopy(strategy_dict) + generator_module_list: Optional[List[Dict]] = node_dict.pop('generator_modules', None) + if generator_module_list is None: + generator_module_list = [{ + 'module_type': 'llama_index_llm', + 'llm': 'openai', + 'model_name': 'gpt-3.5-turbo', + }] + node_params = node_dict + modules = list(map(lambda module_dict: get_support_modules(module_dict.pop('module_type')), + generator_module_list)) + param_combinations = list(map(lambda module_dict: make_combinations({**module_dict, **node_params}), + generator_module_list)) + return explode(modules, param_combinations) + + +def evaluate_one_prompt_maker_node(generator_funcs: List[Callable], + generator_params: List[Dict], + prompts: List[str], + generation_gt: List[List[str]], + metrics: List[str], + project_dir) -> pd.DataFrame: + input_df = pd.DataFrame({'prompts': prompts}) + generator_results = list(map(lambda x: x[0](project_dir=project_dir, previous_result=input_df, **x[1]), + zip(generator_funcs, generator_params))) + evaluation_results = list(map(lambda x: evaluate_generator_result(x[0], generation_gt, metrics), + zip(generator_results, generator_funcs))) + best_result, _ = select_best_average(evaluation_results, metrics) + best_result = pd.concat([input_df, best_result], axis=1) + return best_result # it has 'generated_texts' column + + +def evaluate_generator_result(result_df: pd.DataFrame, + generation_gt: List[List[str]], + metrics: List[str]) -> pd.DataFrame: + @evaluate_generation(generation_gt=generation_gt, metrics=metrics) + def evaluate(df): + return df['generated_texts'].tolist() + + return evaluate(result_df) diff --git a/autorag/schema/module.py b/autorag/schema/module.py index 43e52388f..67ed453bd 100644 --- a/autorag/schema/module.py +++ b/autorag/schema/module.py @@ -2,16 +2,7 @@ from dataclasses import dataclass, field from typing import Callable, Dict -from autorag.nodes.promptmaker import fstring -from autorag.nodes.generator import llama_index_llm -from autorag.nodes.retrieval import bm25, vectordb - -SUPPORT_MODULES = { - 'bm25': bm25, - 'vectordb': vectordb, - 'fstring': fstring, - 'llama_index_llm': llama_index_llm, -} +from autorag.support import get_support_modules @dataclass @@ -21,7 +12,7 @@ class Module: module: Callable = field(init=False) def __post_init__(self): - self.module = SUPPORT_MODULES.get(self.module_type) + self.module = get_support_modules(self.module_type) if self.module is None: raise ValueError(f"Module type {self.module_type} is not supported.") diff --git a/autorag/schema/node.py b/autorag/schema/node.py index a4f74783d..304dbfd2a 100644 --- a/autorag/schema/node.py +++ b/autorag/schema/node.py @@ -6,15 +6,10 @@ import pandas as pd -from autorag.nodes.generator.run import run_generator_node -from autorag.nodes.retrieval.run import run_retrieval_node from autorag.schema.module import Module +from autorag.support import get_support_nodes from autorag.utils.util import make_combinations, explode -SUPPORT_NODES = { - 'retrieval': run_retrieval_node, - 'generator': run_generator_node, -} logger = logging.getLogger("AutoRAG") @@ -27,7 +22,7 @@ class Node: run_node: Callable = field(init=False) def __post_init__(self): - self.run_node = SUPPORT_NODES.get(self.node_type) + self.run_node = get_support_nodes(self.node_type) if self.run_node is None: raise ValueError(f"Node type {self.node_type} is not supported.") 
diff --git a/autorag/support.py b/autorag/support.py
new file mode 100644
index 000000000..353d8419c
--- /dev/null
+++ b/autorag/support.py
@@ -0,0 +1,31 @@
+from typing import Callable, Dict
+import importlib
+
+
+def dynamically_find_function(key: str, target_dict: Dict) -> Callable:
+    if key in target_dict:
+        module_path, func_name = target_dict[key]
+        module = importlib.import_module(module_path)
+        func = getattr(module, func_name)
+        return func
+    else:
+        raise KeyError(f"Key {key} is not supported.")
+
+
+def get_support_modules(module_name: str) -> Callable:
+    support_modules = {
+        'bm25': ('autorag.nodes.retrieval', 'bm25'),
+        'vectordb': ('autorag.nodes.retrieval', 'vectordb'),
+        'fstring': ('autorag.nodes.promptmaker', 'fstring'),
+        'llama_index_llm': ('autorag.nodes.generator', 'llama_index_llm'),
+    }
+    return dynamically_find_function(module_name, support_modules)
+
+
+def get_support_nodes(node_name: str) -> Callable:
+    support_nodes = {
+        'retrieval': ('autorag.nodes.retrieval.run', 'run_retrieval_node'),
+        'generator': ('autorag.nodes.generator.run', 'run_generator_node'),
+        'prompt_maker': ('autorag.nodes.promptmaker.run', 'run_prompt_maker_node'),
+    }
+    return dynamically_find_function(node_name, support_nodes)
diff --git a/tests/autorag/nodes/promptmaker/test_prompt_maker_run.py b/tests/autorag/nodes/promptmaker/test_prompt_maker_run.py
new file mode 100644
index 000000000..e1e328fba
--- /dev/null
+++ b/tests/autorag/nodes/promptmaker/test_prompt_maker_run.py
@@ -0,0 +1,141 @@
+import os
+import tempfile
+
+import pandas as pd
+import pytest
+
+from autorag.nodes.generator import llama_index_llm
+from autorag.nodes.promptmaker import fstring
+from autorag.nodes.promptmaker.run import evaluate_generator_result, evaluate_one_prompt_maker_node, \
+    run_prompt_maker_node
+
+prompts = ['Hello, Do you know the world without war?',
+           'Hi, I am dreaming about the world without any war.']
+sample_generated_texts = ['hello. This is the world speaking',
+                          'Hello, Do you know the world without war?']
+sample_generation_gt = [
+    ['Hello from space. Hi! This is the world speaking.'],
+    ['Hello, Do you know the world without war?', 'Hi, I am dreaming about the world without any war.']
+]
+metrics = ['bleu', 'rouge']
+previous_result = pd.DataFrame({
+    'query': ['What is the war?', 'Who is John Lennon?'],
+    'retrieved_contents': [['War is horrible.', 'War is bad.'],
+                           ['John Lennon is a musician.', 'John Lennon is a singer.']],
+    'test_column': ['test_value_1', 'test_value_2'],
+})
+
+
+def test_evaluate_generator_result():
+    sample_df = pd.DataFrame({'generated_texts': sample_generated_texts})
+    result_df = evaluate_generator_result(sample_df, sample_generation_gt, metrics)
+    assert all(metric_name in result_df.columns for metric_name in metrics)
+    assert len(result_df) == len(sample_generated_texts)
+
+
+def test_evaluate_one_prompt_maker_node():
+    generator_funcs = [llama_index_llm, llama_index_llm]
+    generator_params = [{'llm': 'openai', 'model_name': 'gpt-3.5-turbo'},
+                        {'llm': 'openai', 'model_name': 'gpt-4-1106-preview'}]
+    project_dir = '_'
+    best_result = evaluate_one_prompt_maker_node(generator_funcs, generator_params, prompts, sample_generation_gt,
+                                                 metrics, project_dir)
+    assert isinstance(best_result, pd.DataFrame)
+    assert all(metric_name in best_result.columns for metric_name in metrics)
+    assert len(best_result) == len(prompts)
+
+
+@pytest.fixture
+def node_line_dir():
+    with tempfile.TemporaryDirectory() as project_dir:
+        data_dir = os.path.join(project_dir, "data")
+        os.makedirs(data_dir)
+        qa_data = pd.DataFrame({
+            'qid': ['id-1', 'id-2'],
+            'query': ['What is the war?', 'Who is John Lennon?'],
+            'retrieval_gt': [[['doc-1']], [['doc-2']]],
+            'generation_gt': sample_generation_gt,
+        })
+        qa_data.to_parquet(os.path.join(data_dir, "qa.parquet"), index=False)
+        trial_dir = os.path.join(project_dir, "trial")
+        os.makedirs(trial_dir)
+        node_line_path = os.path.join(trial_dir, "node_line_1")
+        os.makedirs(node_line_path)
+        yield node_line_path
+
+
+def check_best_result(best_df: pd.DataFrame):
+    assert isinstance(best_df, pd.DataFrame)
+    assert len(best_df) == len(previous_result)
+    assert set(best_df.columns) == {
+        'query', 'retrieved_contents', 'test_column', 'prompts', 'prompt_maker_bleu', 'prompt_maker_rouge'
+    }
+
+
+def check_summary_df(node_line_dir):
+    # check the files saved properly
+    summary_path = os.path.join(node_line_dir, "prompt_maker", "summary.parquet")
+    assert os.path.exists(summary_path)
+    summary_df = pd.read_parquet(summary_path)
+    assert len(summary_df) == len(previous_result)
+    assert set(summary_df.columns) == {'filename', 'module_name', 'module_params', 'execution_time',
+                                       'prompt_maker_bleu', 'prompt_maker_rouge', 'is_best'}
+    best_filename = summary_df[summary_df['is_best']]['filename'].values[0]
+    return best_filename
+
+
+def test_run_prompt_maker_node(node_line_dir):
+    modules = [fstring, fstring]
+    params = [{'prompt': 'Tell me something about the question: {query} \n\n {retrieved_contents}'},
+              {'prompt': 'Question: {query} \n Something to read: {retrieved_contents} \n What\'s your answer?'}]
+    strategies = {
+        'metrics': metrics,
+        'speed_threshold': 5,
+        'generator_modules': [{
+            'module_type': 'llama_index_llm',
+            'llm': 'openai',
+            'model_name': ['gpt-3.5-turbo', 'gpt-4-1106-preview'],
+        }]
+    }
+    best_result = run_prompt_maker_node(modules, params, previous_result, node_line_dir, strategies)
+    check_best_result(best_result)
+    best_filename = check_summary_df(node_line_dir)
+    best_result_path = os.path.join(node_line_dir, "prompt_maker", f"best_{best_filename}")
+    assert os.path.exists(best_result_path)
+
+    assert os.path.exists(os.path.join(node_line_dir, "prompt_maker", "fstring=>prompt_0.parquet"))
+    assert os.path.exists(os.path.join(node_line_dir, "prompt_maker", "fstring=>prompt_1.parquet"))
+
+
+def test_run_prompt_maker_node_default(node_line_dir):
+    modules = [fstring, fstring]
+    params = [{'prompt': 'Tell me something about the question: {query} \n\n {retrieved_contents}'},
+              {'prompt': 'Question: {query} \n Something to read: {retrieved_contents} \n What\'s your answer?'}]
+    strategies = {
+        'metrics': metrics
+    }
+    best_result = run_prompt_maker_node(modules, params, previous_result, node_line_dir, strategies)
+    check_best_result(best_result)
+    best_filename = check_summary_df(node_line_dir)
+    best_result_path = os.path.join(node_line_dir, "prompt_maker", f"best_{best_filename}")
+    assert os.path.exists(best_result_path)
+
+
+def test_run_prompt_maker_one_module(node_line_dir):
+    modules = [fstring]
+    params = [{'prompt': 'Tell me something about the question: {query} \n\n {retrieved_contents}'}]
+    strategies = {
+        'metrics': metrics
+    }
+    best_result = run_prompt_maker_node(modules, params, previous_result, node_line_dir, strategies)
+    assert set(best_result.columns) == {
+        'query', 'retrieved_contents', 'test_column', 'prompts'  # automatically skip evaluation
+    }
+    summary_filepath = os.path.join(node_line_dir, "prompt_maker", "summary.parquet")
+    assert os.path.exists(summary_filepath)
+    summary_df = pd.read_parquet(summary_filepath)
+    assert set(summary_df) == {
+        'filename', 'module_name', 'module_params', 'execution_time', 'is_best'
+    }
+    best_filepath = os.path.join(node_line_dir, "prompt_maker", f"best_{summary_df['filename'].values[0]}")
+    assert os.path.exists(best_filepath)
diff --git a/tests/autorag/schema/test_module_schema.py b/tests/autorag/schema/test_module_schema.py
index 190f15ac2..ab70e8c27 100644
--- a/tests/autorag/schema/test_module_schema.py
+++ b/tests/autorag/schema/test_module_schema.py
@@ -1,11 +1,12 @@
 import pytest

-from autorag.schema.module import Module, SUPPORT_MODULES
+from autorag.schema.module import Module
+from autorag.support import get_support_modules

 # Test cases for supported module types
 @pytest.mark.parametrize("module_type, expected_module", [
-    ('bm25', SUPPORT_MODULES['bm25']),
-    # Add more supported module types and their expected output here
+    ('bm25', get_support_modules('bm25')),
+    ('vectordb', get_support_modules('vectordb')),
 ])
 def test_module_from_dict_supported(module_type, expected_module):
     module_dict = {
@@ -26,6 +27,5 @@ def test_module_from_dict_supported(module_type, expected_module):
 ])
 def test_module_from_dict_unsupported(module_type):
     module_dict = {'module_type': module_type}
-    with pytest.raises(ValueError) as exc_info:
+    with pytest.raises(KeyError):
         Module.from_dict(module_dict)
-    assert str(exc_info.value) == f"Module type {module_type} is not supported."
diff --git a/tests/autorag/test_support.py b/tests/autorag/test_support.py
new file mode 100644
index 000000000..2cc659d30
--- /dev/null
+++ b/tests/autorag/test_support.py
@@ -0,0 +1,11 @@
+from autorag.support import get_support_modules, get_support_nodes
+
+
+def test_get_support_modules():
+    result = get_support_modules('bm25')
+    assert result.__name__ == 'bm25'
+
+
+def test_get_support_nodes():
+    result = get_support_nodes('retrieval')
+    assert result.__name__ == 'run_retrieval_node'