diff --git a/.gitignore b/.gitignore
index a64fefc..57e715c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,9 @@ keys/
 
 # Ignore Flask Sessions
 flask_session/
+
+# Ignore local QnA json files
+QnA
+
+# Ignore output of api-test
+api-test/output
diff --git a/api-test/DESIGN.md b/api-test/DESIGN.md
new file mode 100644
index 0000000..9dca800
--- /dev/null
+++ b/api-test/DESIGN.md
@@ -0,0 +1,146 @@
+# Design of the Finesse Benchmark Tool
+
+## Tools available
+
+There are tools that can integrate with Python or a script to accurately
+calculate API statistics. Currently, the need is to test the tool with JSON
+files containing questions and their page of origin in order to establish an
+accuracy score. We also want to measure request times and generate a
+statistical summary of all this data. That being said, we plan to test the APIs
+under different conditions in the near future, for example with multiple
+simultaneous users or under special conditions. That is why it is worth
+researching whether candidate tools are scalable and integrate well with
+Python.
+
+### Decision
+
+We've opted for Locust as our tool of choice. It integrates seamlessly with
+Python, making it a natural fit. Locust is an open-source load testing
+framework written in Python, designed to simulate numerous machines sending
+requests to a given system. It provides detailed insights into the system's
+performance and scalability. With its built-in UI and straightforward
+integration with Python scripts, Locust is user-friendly and accessible. It is
+popular and open source, with support from major tech companies such as
+Microsoft and Google.
+
+However, Locust's primary purpose is to conduct ongoing tests involving
+multiple machines and endpoints simultaneously. Our specific requirement
+involves running the accuracy test just once. Nevertheless, there's potential
+for future integration, especially for stress and load testing scenarios that
+involve repeated searches.
+
+### Alternatives Considered
+
+#### Apache Bench (ab)
+
+Apache Bench (ab) is a command-line tool for benchmarking HTTP servers. It is
+included with the Apache HTTP Server package and is designed for simplicity and
+ease of use.
+
+Pros
+
+- Simple to use.
+- Good for basic testing.
+- Easy integration with test scripts.
+
+Cons
+
+- May not be flexible enough for complex testing scenarios.
+- Less performant for heavy loads or advanced testing.
+
+#### Siege
+
+Siege is a load testing and benchmarking tool that simulates multiple users
+accessing a web server, enabling stress testing and performance evaluation.
+
+Pros
+
+- Supports multiple concurrent users, making it suitable for load testing.
+- Allows for stress testing of web servers and applications.
+
+Cons
+
+- Lacks documentation; some arguments are not documented in its wiki.
+- May have a steeper learning curve compared to simpler tools like Apache
+  Bench.
+
+## Overview
+
+This tool simplifies the process of comparing different search engines and
+assessing their accuracy. It's designed to be straightforward, making it easy
+to understand and use.
+
+## How it Works
+
+- **Single command:**
+  - Users enter a single command that chooses a search engine, specifies a
+    directory of JSON files and specifies the backend URL.
+  - Mandatory arguments:
+    - `--engine [search engine]`: Pick a search engine.
+      - `ai-lab`: AI-Lab search engine
+      - `azure`: Azure search engine
+      - `static`: Static search engine
+      - `llamaindex`: LlamaIndex search engine
+    - `--path [directory path]`: Point to the directory containing JSON files
+      structured with the following properties:
+      - `score`: The score of the page.
+      - `crawl_id`: The unique identifier associated with the crawl table.
+      - `chunk_id`: The unique identifier of the chunk.
+      - `title`: The title of the page.
+      - `url`: The URL of the page.
+      - `text_content`: The main textual content of the item.
+      - `question`: The question to ask.
+      - `answer`: The response to the asked question.
+    - `--host [API URL]`: Point to the finesse-backend URL.
+  - Optional arguments:
+    - `--format [file type]`:
+      - `csv`: Generate a CSV document
+      - `md`: Generate a Markdown document, selected by default
+    - `--once`: Go through all the JSON files once and do not repeat
+    - `--top`: Limit the number of results returned by the search engine
+- **Many tests**
+  - Test all the JSON files in the path directory
+- **Accuracy score**
+  - The tool compares the expected page with the actual Finesse response pages.
+  - It calculates an accuracy score for each response based on its position in
+    the list of pages relative to the total number of pages in the list. 100%
+    corresponds to being at the top of the list, and 0% means the page is not
+    in the list.
+- **Round trip time**
+  - Measure the round trip time of each request
+- **Statistical summary**
+  - Measure the mean, median, standard deviation, minimum and maximum of the
+    accuracy scores and round trip times
+
+## Diagram
+
+![Diagram of the Finesse benchmark tool](diagram.png)
+
+## Example Command
+
+```cmd
+$ locust --engine azure --path api-test/QnA/good_question --host https://finesse-guidance.ninebasetwo.xyz/api --once
+Searching with Azure Search...
+
+File: qna_2023-12-08_36.json
+Question: Quelle est la zone réglementée dans la ville de Vancouver à partir du 19 mars 2022?
+Expected URL: https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-96-15/fra/1323854808025/1323854941807
+Accuracy Score: 50.0%
+Time: 277.836ms
+
+File: qna_2023-12-08_19.json
+Question: What are the requirements for inspections of fishing vessels?
+Expected URL: https://inspection.canada.ca/importing-food-plants-or-animals/food-imports/foreign-systems/audits/report-of-a-virtual-assessment-of-spain/eng/1661449231959/1661449232916
+Accuracy Score: 0.0%
+Time: 677.906ms
+
+...
+
+---
+Tested on 21 files.
+Time statistical summary:
+  Mean:429, Median:400, Standard Deviation:150, Maximum:889, Minimum:208
+Accuracy statistical summary:
+  Mean:0.35, Median:0.0, Standard Deviation:0.25, Maximum:1.0, Minimum:0.0
+---
+```
+
+This example shows the CLI output of the tool, analyzing search results from
+Azure Search and providing an accuracy score for Finesse.
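The scoring rule described in the design above can be illustrated with a small, self-contained sketch. The URLs below are made up, and matching is simplified to exact string comparison; the real implementation in `accuracy_functions.py` further down compares the numeric second-to-last URL segment.

```python
# Illustrative sketch of the accuracy score: 1 - (position / total_pages),
# where position is the index of the expected page in the returned list.
responses = [
    "https://example.com/page/111/222",  # position 0 -> score 1.00
    "https://example.com/page/333/444",  # position 1 -> score 0.75
    "https://example.com/page/555/666",  # position 2 -> score 0.50
    "https://example.com/page/777/888",  # position 3 -> score 0.25
]
expected = "https://example.com/page/333/444"

position = responses.index(expected)        # 1
score = 1 - position / len(responses)       # 0.75
print(f"Accuracy score: {score:.0%}")       # Accuracy score: 75%
```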
diff --git a/api-test/__init__.py b/api-test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api-test/accuracy_functions.py b/api-test/accuracy_functions.py
new file mode 100644
index 0000000..8403aa5
--- /dev/null
+++ b/api-test/accuracy_functions.py
@@ -0,0 +1,115 @@
+import statistics
+import datetime
+import csv
+import os
+from collections import namedtuple
+
+OUTPUT_FOLDER = "./api-test/output"
+AccuracyResult = namedtuple("AccuracyResult", ["position", "total_pages", "score"])
+
+def calculate_accuracy(responses_url: list[str], expected_url: str) -> AccuracyResult:
+    position: int = 0
+    total_pages: int = len(responses_url)
+    score: float = 0.0
+    expected_number = int(expected_url.split('/')[-2])
+
+    for idx, response_url in enumerate(responses_url):
+        response_number = int(response_url.split('/')[-2])
+        if response_number == expected_number:
+            position = idx
+            score = 1 - (position / total_pages)
+            score = round(score, 2)
+            break
+
+    return AccuracyResult(position, total_pages, score)
+
+def save_to_markdown(test_data: dict, engine: str):
+    if not os.path.exists(OUTPUT_FOLDER):
+        os.makedirs(OUTPUT_FOLDER)
+    date_string = datetime.datetime.now().strftime("%Y-%m-%d")
+    file_name = f"test_{engine}_{date_string}.md"
+    output_file = os.path.join(OUTPUT_FOLDER, file_name)
+    with open(output_file, "w") as md_file:
+        md_file.write(f"# Test on the {engine} search engine: {date_string}\n\n")
+        md_file.write("## Test data table\n\n")
+        md_file.write("| 📄 File | 💬 Question | 📏 Accuracy Score | ⌛ Time |\n")
+        md_file.write("|--------------------|-------------------------------------------------------------------------------------------------------------------------|----------------|----------|\n")
+        for key, value in test_data.items():
+            md_file.write(f"| {key} | [{value.get('question')}]({value.get('expected_page').get('url')}) | {value.get('accuracy')*100:.1f}% | {value.get('time')}ms |\n")
+        md_file.write("\n")
+        md_file.write(f"Tested on {len(test_data)} files.\n\n")
+
+        time_stats, accuracy_stats = calculate_statistical_summary(test_data)
+        md_file.write("## Statistical summary\n\n")
+        md_file.write("| Statistic | Time | Accuracy score |\n")
+        md_file.write("|-----------------------|------------|---------|\n")
+        md_file.write(f"|Mean| {time_stats.get('Mean')}ms | {accuracy_stats.get('Mean')*100}% |\n")
+        md_file.write(f"|Median| {time_stats.get('Median')}ms | {accuracy_stats.get('Median')*100}% |\n")
+        md_file.write(f"|Standard Deviation| {time_stats.get('Standard Deviation')}ms | {accuracy_stats.get('Standard Deviation')*100}% |\n")
+        md_file.write(f"|Maximum| {time_stats.get('Maximum')}ms | {accuracy_stats.get('Maximum')*100}% |\n")
+        md_file.write(f"|Minimum| {time_stats.get('Minimum')}ms | {accuracy_stats.get('Minimum')*100}% |\n")
+
+def save_to_csv(test_data: dict, engine: str):
+    if not os.path.exists(OUTPUT_FOLDER):
+        os.makedirs(OUTPUT_FOLDER)
+    date_string = datetime.datetime.now().strftime("%Y-%m-%d")
+    file_name = f"test_{engine}_{date_string}.csv"
+    output_file = os.path.join(OUTPUT_FOLDER, file_name)
+    with open(output_file, "w", newline="") as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(["File", "Question", "Accuracy Score", "Time"])
+        for key, value in test_data.items():
+            writer.writerow([
+                key,
+                value.get("question"),
+                f"{value.get('accuracy')}",
+                f"{value.get('time')}"
+            ])
+        writer.writerow([])
+
+        time_stats, accuracy_stats = calculate_statistical_summary(test_data)
+        writer.writerow(["Statistic", "Time", "Accuracy Score"])
+        writer.writerow(["Mean", f"{time_stats.get('Mean')}", f"{accuracy_stats.get('Mean')}"])
+        writer.writerow(["Median", f"{time_stats.get('Median')}", f"{accuracy_stats.get('Median')}"])
+        writer.writerow(["Standard Deviation", f"{time_stats.get('Standard Deviation')}", f"{accuracy_stats.get('Standard Deviation')}"])
+        writer.writerow(["Maximum", f"{time_stats.get('Maximum')}", f"{accuracy_stats.get('Maximum')}"])
+        writer.writerow(["Minimum", f"{time_stats.get('Minimum')}", f"{accuracy_stats.get('Minimum')}"])
+
+def log_data(test_data: dict):
+    for key, value in test_data.items():
+        print("File:", key)
+        print("Question:", value.get("question"))
+        print("Expected URL:", value.get("expected_page").get("url"))
+        print(f'Accuracy Score: {value.get("accuracy")*100}%')
+        print(f'Time: {value.get("time")}ms')
+        print()
+    time_stats, accuracy_stats = calculate_statistical_summary(test_data)
+    print("---")
+    print(f"Tested on {len(test_data)} files.")
+    print("Time statistical summary:", end="\n  ")
+    for key, value in time_stats.items():
+        print(f"{key}:{value},", end=' ')
+    print("\nAccuracy statistical summary:", end="\n  ")
+    for key, value in accuracy_stats.items():
+        print(f"{key}:{value*100}%,", end=' ')
+    print("\n---")
+
+
+def calculate_statistical_summary(test_data: dict) -> tuple[dict, dict]:
+    times = [result.get("time") for result in test_data.values()]
+    accuracies = [result.get("accuracy") for result in test_data.values()]
+    time_stats = {
+        "Mean": round(statistics.mean(times), 3),
+        "Median": round(statistics.median(times), 3),
+        "Standard Deviation": round(statistics.stdev(times), 3),
+        "Maximum": round(max(times), 3),
+        "Minimum": round(min(times), 3),
+    }
+    accuracy_stats = {
+        "Mean": round(statistics.mean(accuracies), 2),
+        "Median": round(statistics.median(accuracies), 2),
+        "Standard Deviation": round(statistics.stdev(accuracies), 2),
+        "Maximum": round(max(accuracies), 2),
+        "Minimum": round(min(accuracies), 2),
+    }
+    return time_stats, accuracy_stats
diff --git a/api-test/diagram.png b/api-test/diagram.png
new file mode 100644
index 0000000..a0a8aa3
Binary files /dev/null and b/api-test/diagram.png differ
diff --git a/api-test/host.py b/api-test/host.py
new file mode 100644
index 0000000..28c9d6d
--- /dev/null
+++ b/api-test/host.py
@@ -0,0 +1,9 @@
+import requests
+
+def is_host_up(host_url: str) -> bool:
+    health_check_endpoint = f"{host_url}/health"
+    try:
+        response = requests.get(health_check_endpoint)
+        return response.status_code == 200
+    except requests.RequestException:
+        return False
diff --git a/api-test/jsonreader.py b/api-test/jsonreader.py
new file mode 100644
index 0000000..2c4c23f
--- /dev/null
+++ b/api-test/jsonreader.py
@@ -0,0 +1,29 @@
+import json
+from typing import Iterator
+import os
+
+class JSONReader(Iterator):
+    "Read test data from JSON files using an iterator"
+
+    def __init__(self, directory):
+        self.directory = directory
+        self.file_list = [f for f in os.listdir(directory) if f.endswith('.json')]
+        if not self.file_list:
+            raise FileNotFoundError(f"No JSON files found in the directory '{directory}'")
+        self.current_file_index = 0
+        self.file_name = None  # Initialize file_name attribute
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.current_file_index >= len(self.file_list):
+            raise StopIteration
+
+        file_path = os.path.join(self.directory, self.file_list[self.current_file_index])
+        self.file_name = self.file_list[self.current_file_index]  # Update file_name attribute
+
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        self.current_file_index += 1
+        return data
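As a rough sketch of how the helpers above could be exercised outside of Locust: the backend URL and QnA directory are assumptions taken from the design document, and the response list is stubbed in place of a real finesse-backend call.

```python
# Hypothetical standalone driver combining is_host_up, JSONReader and
# calculate_accuracy; run from the api-test directory so the imports resolve.
from accuracy_functions import calculate_accuracy
from host import is_host_up
from jsonreader import JSONReader

if is_host_up("https://finesse-guidance.ninebasetwo.xyz/api"):  # assumed URL
    print("Backend is reachable")

reader = JSONReader("QnA/good_question")  # assumed directory of QnA files
for qna in reader:
    expected_url = qna.get("url")
    # A real run would collect these URLs from the search API response;
    # here the expected page is stubbed in as the only result.
    response_urls = [expected_url]
    result = calculate_accuracy(response_urls, expected_url)
    print(reader.file_name, result.score)
```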
diff --git a/api-test/locustfile.py b/api-test/locustfile.py
new file mode 100644
index 0000000..46f200d
--- /dev/null
+++ b/api-test/locustfile.py
@@ -0,0 +1,92 @@
+from locust import HttpUser, task, events
+from jsonreader import JSONReader
+import os
+import json
+from accuracy_functions import save_to_markdown, save_to_csv, log_data, calculate_accuracy
+from host import is_host_up
+
+class NoTestDataError(Exception):
+    """Raised when all requests have failed and there is no test data"""
+
+@events.init_command_line_parser.add_listener
+def _(parser):
+    parser.add_argument("--engine", type=str, choices=["ai-lab", "azure", "static", "llamaindex"], required=True, help="Pick a search engine.")
+    parser.add_argument("--path", type=str, required=True, help="Point to the directory containing the JSON question files.")
+    parser.add_argument("--format", type=str, choices=["csv", "md"], default="md", help="Generate a CSV or Markdown document")
+    parser.add_argument("--once", action="store_true", default=False, help="Set this flag to make the accuracy test non-repeatable.")
+    parser.add_argument("--top", type=str, default=100, help="Set this number to limit the number of results returned by the search engine.")
+    args = parser.parse_args()
+
+    if not os.path.isdir(args.path):
+        parser.error(f"The directory '{args.path}' does not exist.")
+
+    if not is_host_up(args.host):
+        parser.error(f"The backend URL '{args.host}' is either wrong or down.")
+
+class FinesseUser(HttpUser):
+
+    @task()
+    def search_accuracy(self):
+        try:
+            json_data = next(self.qna_reader)
+        except StopIteration:
+            if not self.once:
+                # Reset variables
+                self.on_start()
+                json_data = next(self.qna_reader)
+                print("Restarting the running test")
+            else:
+                print("Stopping the running test")
+                self.environment.runner.quit()
+                return  # no json_data to process once the runner is quitting
+
+        if self.engine in ["ai-lab", "azure", "static"]:
+            question = json_data.get("question")
+            expected_url = json_data.get("url")
+            file_name = self.qna_reader.file_name
+            response_url: list[str] = []
+            search_url = f"{self.host}/search/{self.engine}?top={self.top}"
+            data = json.dumps({'query': f'{question}'})
+            headers = { "Content-Type": "application/json" }
+            response = self.client.post(search_url, data=data, headers=headers)
+
+            if response.status_code == 200:
+                response_pages = response.json()
+                for page in response_pages:
+                    response_url.append(page.get("url"))
+                accuracy_result = calculate_accuracy(response_url, expected_url)
+                time_taken = round(response.elapsed.total_seconds() * 1000, 3)  # milliseconds
+
+                expected_page = json_data.copy()
+                del expected_page['question']
+                del expected_page['answer']
+                self.qna_results[file_name] = {
+                    "question": question,
+                    "expected_page": expected_page,
+                    "response_pages": response_pages,
+                    "position": accuracy_result.position,
+                    "total_pages": accuracy_result.total_pages,
+                    "accuracy": accuracy_result.score,
+                    "time": time_taken,
+                }
+
+    def on_start(self):
+        self.qna_reader = JSONReader(self.path)
+        self.qna_results = dict()
+
+    def on_stop(self):
+        if not self.qna_results:
+            raise NoTestDataError
+
+        log_data(self.qna_results)
+        if self.format == "md":
+            save_to_markdown(self.qna_results, self.engine)
+        elif self.format == "csv":
+            save_to_csv(self.qna_results, self.engine)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.path = self.environment.parsed_options.path
+        self.engine = self.environment.parsed_options.engine
+        self.format = self.environment.parsed_options.format
+        self.once = self.environment.parsed_options.once
+        self.top = self.environment.parsed_options.top
diff --git a/api-test/requirements-test.txt b/api-test/requirements-test.txt
new file mode 100644
index 0000000..5d94512
--- /dev/null
+++ b/api-test/requirements-test.txt
@@ -0,0 +1,2 @@
+locust
+
diff --git a/api-test/test_accuracy_functions.py b/api-test/test_accuracy_functions.py
new file mode 100644
index 0000000..3176b4c
--- /dev/null
+++ b/api-test/test_accuracy_functions.py
@@ -0,0 +1,20 @@
+import unittest
+from accuracy_functions import calculate_accuracy
+
+class TestFunctions(unittest.TestCase):
+
+    def test_calculate_accuracy(self):
+        responses_url = [
+            "https://inspection.canada.ca/exporting-food-plants-or-animals/food-exports/food-specific-export-requirements/meat/crfpcp/eng/1434119937443/1434120400252",
+            "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-08-04/fra/1323752901318/1323753612811",
+            "https://inspection.canada.ca/varietes-vegetales/vegetaux-a-caracteres-nouveaux/demandeurs/directive-94-08/documents-sur-la-biologie/lens-culinaris-medikus-lentille-/fra/1330978380871/1330978449837",
+            "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-96-15/fra/1323854808025/1323854941807"
+        ]
+        expected_url = "https://inspection.canada.ca/protection-des-vegetaux/especes-envahissantes/directives/date/d-08-04/fra/1323752901318/1323753612811"
+        result = calculate_accuracy(responses_url, expected_url)
+        self.assertEqual(result.position, 1)
+        self.assertEqual(result.total_pages, 4)
+        self.assertEqual(result.score, 0.75)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/locust.conf b/locust.conf
new file mode 100644
index 0000000..ec5ce4d
--- /dev/null
+++ b/locust.conf
@@ -0,0 +1,3 @@
+# Default settings of locust
+locustfile = api-test/locustfile.py
+headless = true
diff --git a/requirements.txt b/requirements.txt
index 06a4db3..16fcea7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ git+https://github.com/ai-cfia/azure-db.git@main#subdirectory=azure-ai-search
 fuzzywuzzy
 python-Levenshtein
 git+https://github.com/ai-cfia/ailab-db@main
+
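A possible companion test for `calculate_statistical_summary`, written in the same style as `test_accuracy_functions.py` above; the file names and values are illustrative only.

```python
import unittest
from accuracy_functions import calculate_statistical_summary

class TestStatisticalSummary(unittest.TestCase):

    def test_calculate_statistical_summary(self):
        # Two fabricated results; only the "time" and "accuracy" keys are read.
        test_data = {
            "qna_example_1.json": {"time": 100.0, "accuracy": 0.5},
            "qna_example_2.json": {"time": 200.0, "accuracy": 1.0},
        }
        time_stats, accuracy_stats = calculate_statistical_summary(test_data)
        self.assertEqual(time_stats["Mean"], 150.0)
        self.assertEqual(time_stats["Median"], 150.0)
        self.assertEqual(accuracy_stats["Mean"], 0.75)
        self.assertEqual(accuracy_stats["Maximum"], 1.0)
        self.assertEqual(accuracy_stats["Minimum"], 0.5)

if __name__ == "__main__":
    unittest.main()
```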