diff --git a/.env.template b/.env.template index 192f1b8..129ed76 100644 --- a/.env.template +++ b/.env.template @@ -28,3 +28,7 @@ FINESSE_BACKEND_GITHUB_STATIC_FILE_URL=https://api.github.com/repos/ai-cfia/fine # Message for unexpected errors. Optional. # FINESSE_BACKEND_ERROR_UNEXPECTED="Unexpected error." + +# Threshold for fuzzy matching queries to finesse-data files. Represents the minimum +# score (out of 100) for a match to be considered close enough. Optional. +# FINESSE_BACKEND_FUZZY_MATCH_THRESHOLD=90 diff --git a/app/blueprints/search.py b/app/blueprints/search.py index b7394c1..8464360 100644 --- a/app/blueprints/search.py +++ b/app/blueprints/search.py @@ -37,8 +37,9 @@ def search_azure(): def search_static(): finesse_data_url = current_app.config["FINESSE_DATA_URL"] query = request.json["query"] + match_threshold = current_app.config["FUZZY_MATCH_THRESHOLD"] try: - data = fetch_data(finesse_data_url, query) + data = fetch_data(finesse_data_url, query, match_threshold) return jsonify(data) except FinesseDataFetchException: return jsonify({"error": current_app.config["ERROR_FINESSE_DATA_FAILED"]}), 500 diff --git a/app/config.py b/app/config.py index d0ceb4b..fdc4d71 100644 --- a/app/config.py +++ b/app/config.py @@ -13,6 +13,7 @@ DEFAULT_ERROR_AZURE_FAILED = "Azure index search failed." DEFAULT_ERROR_FINESSE_DATA_FAILED = "finesse-data static search failed" DEFAULT_ERROR_UNEXPECTED = "Unexpected error." +DEFAULT_FUZZY_MATCH_THRESHOLD = "90" @dataclass @@ -44,3 +45,8 @@ class Config: ERROR_UNEXPECTED = os.getenv( "FINESSE_BACKEND_ERROR_UNEXPECTED", DEFAULT_ERROR_UNEXPECTED ) + FUZZY_MATCH_THRESHOLD = int( + os.getenv( + "FINESSE_BACKEND_FUZZY_MATCH_THRESHOLD", DEFAULT_FUZZY_MATCH_THRESHOLD + ) + ) diff --git a/app/finesse_data/__init__.py b/app/finesse_data/__init__.py index b34906f..377d701 100644 --- a/app/finesse_data/__init__.py +++ b/app/finesse_data/__init__.py @@ -1,32 +1,30 @@ import logging - import requests - +from fuzzywuzzy import process class FinesseDataFetchException(Exception): """Custom exception for errors in fetching data from finesse-data.""" - class EmptyQueryError(Exception): """Raised when the search query is empty.""" - -def fetch_data(finesse_data_url, query): +def fetch_data(finesse_data_url, query, match_threshold): if not query: logging.error("Empty search query received") raise EmptyQueryError("Search query cannot be empty") + try: response = requests.get(finesse_data_url) response.raise_for_status() files = response.json() + file_map = {file["name"]: file for file in files} query = query.replace("\r\n", "").replace("\n", "") - normalized_term = query.lower() - matching_file = next( - (file for file in files if normalized_term in file["name"].lower()), None - ) - if not matching_file: - logging.info("No matching file found for query: %s", query) + best_match_result = process.extractOne(query, file_map.keys()) + if not best_match_result or best_match_result[1] < match_threshold: + logging.info(f"No close match found for query: {query}") return None + best_match, _ = best_match_result + matching_file = file_map[best_match] results_response = requests.get(matching_file["download_url"]) results_response.raise_for_status() return results_response.json() diff --git a/requirements-production.txt b/requirements-production.txt index 7533989..e0e9c3a 100644 --- a/requirements-production.txt +++ b/requirements-production.txt @@ -4,3 +4,5 @@ flask-cors==4.0.0 # Released: 2023-06-26 gunicorn==21.2.0 # Released: 2023-07-19 python-dotenv==1.0.0 # Released: 2023-02-24 git+https://github.com/ai-cfia/azure-db.git@main#subdirectory=azure-ai-search +fuzzywuzzy==0.18.0 +python-Levenshtein== 0.23.0 diff --git a/requirements.txt b/requirements.txt index cdfb355..e291fa8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ flask-cors gunicorn python-dotenv git+https://github.com/ai-cfia/azure-db.git@main#subdirectory=azure-ai-search +fuzzywuzzy +python-Levenshtein