Skip to content

Commit

Permalink
issue #2: using fuzzywuzzy for finesse-data files matching
Browse files Browse the repository at this point in the history
  • Loading branch information
k-allagbe committed Nov 24, 2023
1 parent 9315b6e commit 29d0c4f
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 12 deletions.
4 changes: 4 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ FINESSE_BACKEND_GITHUB_STATIC_FILE_URL=https://api.github.com/repos/ai-cfia/fine

# Message for unexpected errors. Optional.
# FINESSE_BACKEND_ERROR_UNEXPECTED="Unexpected error."

# Minimum fuzzy-match score (out of 100) that a finesse-data file name must reach
# against the query to count as a match; below it, no result is returned. Optional; defaults to 90.
# FINESSE_BACKEND_FUZZY_MATCH_THRESHOLD=90
3 changes: 2 additions & 1 deletion app/blueprints/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@ def search_azure():
def search_static():
finesse_data_url = current_app.config["FINESSE_DATA_URL"]
query = request.json["query"]
match_threshold = current_app.config["FUZZY_MATCH_THRESHOLD"]
try:
data = fetch_data(finesse_data_url, query)
data = fetch_data(finesse_data_url, query, match_threshold)
return jsonify(data)
except FinesseDataFetchException:
return jsonify({"error": current_app.config["ERROR_FINESSE_DATA_FAILED"]}), 500
Expand Down
6 changes: 6 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DEFAULT_ERROR_AZURE_FAILED = "Azure index search failed."
DEFAULT_ERROR_FINESSE_DATA_FAILED = "finesse-data static search failed"
DEFAULT_ERROR_UNEXPECTED = "Unexpected error."
DEFAULT_FUZZY_MATCH_THRESHOLD = "90"


@dataclass
Expand Down Expand Up @@ -44,3 +45,8 @@ class Config:
ERROR_UNEXPECTED = os.getenv(
"FINESSE_BACKEND_ERROR_UNEXPECTED", DEFAULT_ERROR_UNEXPECTED
)
FUZZY_MATCH_THRESHOLD = int(
os.getenv(
"FINESSE_BACKEND_FUZZY_MATCH_THRESHOLD", DEFAULT_FUZZY_MATCH_THRESHOLD
)
)
20 changes: 9 additions & 11 deletions app/finesse_data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,30 @@
import logging

import requests

from fuzzywuzzy import process

class FinesseDataFetchException(Exception):
"""Custom exception for errors in fetching data from finesse-data."""


class EmptyQueryError(Exception):
"""Raised when the search query is empty."""


def fetch_data(finesse_data_url, query):
def fetch_data(finesse_data_url, query, match_threshold):
if not query:
logging.error("Empty search query received")
raise EmptyQueryError("Search query cannot be empty")

try:
response = requests.get(finesse_data_url)
response.raise_for_status()
files = response.json()
file_map = {file["name"]: file for file in files}
query = query.replace("\r\n", "").replace("\n", "")
normalized_term = query.lower()
matching_file = next(
(file for file in files if normalized_term in file["name"].lower()), None
)
if not matching_file:
logging.info("No matching file found for query: %s", query)
best_match_result = process.extractOne(query, file_map.keys())
if not best_match_result or best_match_result[1] < match_threshold:
logging.info(f"No close match found for query: {query}")
return None
best_match, _ = best_match_result
matching_file = file_map[best_match]
results_response = requests.get(matching_file["download_url"])
results_response.raise_for_status()
return results_response.json()
Expand Down
2 changes: 2 additions & 0 deletions requirements-production.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ flask-cors==4.0.0 # Released: 2023-06-26
gunicorn==21.2.0 # Released: 2023-07-19
python-dotenv==1.0.0 # Released: 2023-02-24
git+https://github.com/ai-cfia/azure-db.git@main#subdirectory=azure-ai-search
fuzzywuzzy==0.18.0
python-Levenshtein==0.23.0
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ flask-cors
gunicorn
python-dotenv
git+https://github.com/ai-cfia/azure-db.git@main#subdirectory=azure-ai-search
fuzzywuzzy
python-Levenshtein

0 comments on commit 29d0c4f

Please sign in to comment.