-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from ai-cfia/k-allagbe/issue2-search-from-static-finesse-data
K-allagbe/issue2-search-from-static-finesse-data
- Loading branch information
Showing
14 changed files
with
329 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,37 @@ | ||
# FINESSE_BACKEND_AZURE_SEARCH_ENDPOINT: | ||
# Endpoint URL of Azure Cognitive Search service. Format: | ||
# https://[service-name].search.windows.net | ||
FINESSE_BACKEND_AZURE_SEARCH_ENDPOINT=<Azure-Search-Service-Endpoint> | ||
|
||
# FINESSE_BACKEND_AZURE_SEARCH_API_KEY: | ||
# API key for Azure Cognitive Search. Used for operations such as | ||
# querying the search index. | ||
FINESSE_BACKEND_AZURE_SEARCH_API_KEY=<Azure-Search-API-Key> | ||
|
||
# FINESSE_BACKEND_AZURE_SEARCH_INDEX_NAME: | ||
# Name of the search index in Azure Cognitive Search. Contains documents | ||
# for search operations. | ||
FINESSE_BACKEND_AZURE_SEARCH_INDEX_NAME=<Search-Index-Name> | ||
|
||
# FINESSE_BACKEND_DEBUG_MODE: | ||
# Boolean flag to enable or disable debug mode for the application. | ||
# Defaults to False when not set. | ||
# Defaults to False when not set. Optional. | ||
# FINESSE_BACKEND_DEBUG_MODE=<True/False> | ||
|
||
# URL of the GitHub contents API endpoint that lists the static files of the finesse-data repository. | ||
FINESSE_BACKEND_GITHUB_STATIC_FILE_URL=https://api.github.com/repos/ai-cfia/finesse-data/contents | ||
|
||
# Message for empty search query errors. Optional. | ||
# FINESSE_BACKEND_ERROR_EMPTY_QUERY="Search query cannot be empty" | ||
|
||
# Message for Azure search failures. Optional. | ||
# FINESSE_BACKEND_ERROR_AZURE_FAILED="Azure index search failed." | ||
|
||
# Message for Finesse data search failures. Optional. | ||
# FINESSE_BACKEND_ERROR_FINESSE_DATA_FAILED="finesse-data static search failed" | ||
|
||
# Message for unexpected errors. Optional. | ||
# FINESSE_BACKEND_ERROR_UNEXPECTED="Unexpected error." | ||
|
||
# Threshold for fuzzy matching queries to finesse-data files. Represents the minimum | ||
# score (out of 100) for a match to be considered close enough. Optional. | ||
# FINESSE_BACKEND_FUZZY_MATCH_THRESHOLD=90 | ||
|
||
# Regular expression pattern used for sanitizing input to prevent log injection. Optional. | ||
# FINESSE_BACKEND_SANITIZE_PATTERN="[^\w \d\"#\$%&'\(\)\*\+,-\.\/:;?@\^_`{\|}~]+|\%\w+|;|/|\(|\)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
from flask import Blueprint | ||
|
||
monitor = Blueprint("monitor", __name__) | ||
monitor_blueprint = Blueprint("monitor", __name__) | ||
|
||
|
||
@monitor.route("", methods=["GET"]) | ||
@monitor_blueprint.route("", methods=["GET"])
def health():
    """Health-check endpoint: always responds with the body ``ok`` and HTTP 200."""
    return "ok", 200
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,51 @@ | ||
import re | ||
from functools import wraps | ||
|
||
from flask import Blueprint, current_app, jsonify, request | ||
from index_search import search as azure_index_search | ||
from index_search import AzureIndexSearchQueryError, search | ||
|
||
from app.finesse_data import FinesseDataFetchException, fetch_data | ||
from app.utils import sanitize | ||
|
||
search_blueprint = Blueprint("finesse", __name__) | ||
|
||
|
||
search = Blueprint("finesse", __name__) | ||
def require_non_empty_query(f):
    """Decorate a view so it only runs when the request carries a usable query.

    The wrapped view is called only if the request body is a JSON object whose
    ``"query"`` entry is a non-empty string. In every other case (missing or
    malformed JSON body, non-object JSON, missing/empty/non-string query) the
    view is skipped and a 400 response is returned whose ``"message"`` is the
    configured ``ERROR_EMPTY_QUERY`` text.

    Args:
        f: The Flask view function to protect.

    Returns:
        The wrapping function, carrying ``f``'s metadata via ``functools.wraps``.
    """

    @wraps(f)
    def decorated_function(*args, **kwargs):
        # silent=True makes a missing, mistyped or unparsable JSON body come
        # back as None instead of raising, so it is reported as a bad request
        # here rather than surfacing as an unrelated error.
        payload = request.get_json(silent=True)
        query = payload.get("query") if isinstance(payload, dict) else None
        # Non-string values are rejected too: downstream sanitizing applies a
        # regular expression to the query and would fail on them.
        if not isinstance(query, str) or not query:
            return jsonify({"message": current_app.config["ERROR_EMPTY_QUERY"]}), 400
        return f(*args, **kwargs)

    return decorated_function
|
||
|
||
@search_blueprint.route("/azure", methods=["POST"])
@require_non_empty_query
def search_azure():
    """Run the request's query against the Azure search index.

    The ``"query"`` value of the JSON body is sanitized with the configured
    ``SANITIZE_PATTERN`` and passed to ``search`` together with the configured
    ``AZURE_CONFIG``.

    Returns:
        The search results as JSON on success. On failure, a JSON object with
        an ``"error"`` message and HTTP 500: ``ERROR_AZURE_FAILED`` for
        ``AzureIndexSearchQueryError``, ``ERROR_UNEXPECTED`` for anything else.
    """
    query = request.json["query"]
    query = sanitize(query, current_app.config["SANITIZE_PATTERN"])
    try:
        results = search(query, current_app.config["AZURE_CONFIG"])
        return jsonify(results)
    except AzureIndexSearchQueryError:
        # Record the traceback: the client only ever sees a generic message.
        current_app.logger.exception("Azure index search failed")
        return jsonify({"error": current_app.config["ERROR_AZURE_FAILED"]}), 500
    except Exception:
        # Top-level boundary for this endpoint; log so the cause is not lost.
        current_app.logger.exception("Unexpected error during Azure index search")
        return jsonify({"error": current_app.config["ERROR_UNEXPECTED"]}), 500
|
||
|
||
@search.route("", methods=["POST"]) | ||
def search_documents(): | ||
@search_blueprint.route("/static", methods=["POST"]) | ||
@require_non_empty_query | ||
def search_static(): | ||
finesse_data_url = current_app.config["FINESSE_DATA_URL"] | ||
query = request.json["query"] | ||
results = azure_index_search(query, current_app.config["AZURE_CONFIG"]) | ||
return jsonify(results) | ||
query = sanitize(query, current_app.config["SANITIZE_PATTERN"]) | ||
match_threshold = current_app.config["FUZZY_MATCH_THRESHOLD"] | ||
try: | ||
data = fetch_data(finesse_data_url, query, match_threshold) | ||
return jsonify(data) | ||
except FinesseDataFetchException: | ||
return jsonify({"error": current_app.config["ERROR_FINESSE_DATA_FAILED"]}), 500 | ||
except Exception: | ||
return jsonify({"error": current_app.config["ERROR_UNEXPECTED"]}), 500 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import logging | ||
|
||
import requests | ||
from fuzzywuzzy import process | ||
|
||
|
||
class FinesseDataFetchException(Exception): | ||
"""Custom exception for errors in fetching data from finesse-data.""" | ||
|
||
|
||
class EmptyQueryError(Exception): | ||
"""Raised when the search query is empty.""" | ||
|
||
|
||
def find_best_match(search_string, candidates, match_threshold):
    """Return the candidate that best matches ``search_string``, if close enough.

    Args:
        search_string: Text to look for.
        candidates: Choices handed to ``process.extractOne``.
        match_threshold: Minimum score the best candidate must reach.

    Returns:
        The best-scoring candidate, or ``None`` (after logging at INFO level)
        when nothing was found or the best score is below ``match_threshold``.
    """
    top_hit = process.extractOne(search_string, candidates)
    # Index access on purpose: only the first two items (choice, score) are used.
    if top_hit and top_hit[1] >= match_threshold:
        return top_hit[0]
    logging.info(f"No close match found for search string: {search_string}")
    return None
|
||
|
||
def fetch_data(finesse_data_url, query, match_threshold, timeout=10):
    """Fetch the finesse-data file whose name best matches ``query``.

    The listing at ``finesse_data_url`` is downloaded and indexed by each
    entry's ``"name"``; the name closest to ``query`` is chosen with
    ``find_best_match`` and the JSON behind that entry's ``"download_url"``
    is returned.

    Args:
        finesse_data_url: URL returning a JSON list of file entries.
        query: Search text; must be non-empty.
        match_threshold: Minimum match score passed to ``find_best_match``.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled server would block the caller
            indefinitely.

    Returns:
        The decoded JSON content of the matching file, or ``None`` when no
        file name matches closely enough.

    Raises:
        EmptyQueryError: If ``query`` is empty.
        FinesseDataFetchException: If a request fails, returns an error
            status, times out, or a response body is not valid JSON.
    """
    if not query:
        logging.error("Empty search query received")
        raise EmptyQueryError("Search query cannot be empty")

    try:
        response = requests.get(finesse_data_url, timeout=timeout)
        response.raise_for_status()
        files = response.json()
        file_map = {file["name"]: file for file in files}
        best_match = find_best_match(query, file_map.keys(), match_threshold)
        if not best_match:
            return None
        matching_file = file_map[best_match]
        results_response = requests.get(matching_file["download_url"], timeout=timeout)
        results_response.raise_for_status()
        return results_response.json()
    # ValueError covers JSON decoding failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.RequestException, ValueError) as e:
        logging.error(f"finesse-data fetch failed: {e}", exc_info=True)
        raise FinesseDataFetchException(f"finesse-data fetch failed: {e}") from e
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
import re | ||
|
||
|
||
def sanitize(input, pattern):
    """Strip every match of ``pattern`` from ``input`` to mitigate log injection.

    Args:
        input: Text to clean.
        pattern: Regular expression describing what to remove.

    Returns:
        ``input`` with all matches of ``pattern`` deleted.
    """
    matcher = re.compile(pattern)
    return matcher.sub("", input)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,20 @@ | ||
from dataclasses import dataclass | ||
from unittest.mock import Mock | ||
|
||
from app.config import Config | ||
|
||
|
||
@dataclass
class TestAzureSearchConfig:
    """Stand-in Azure search settings for tests, with a mocked client."""

    # The "Test" prefix makes pytest try to collect this class wherever it is
    # imported; opt out explicitly since it is a fixture, not a test case.
    __test__ = False

    endpoint = "endpoint"
    api_key = "api_key"
    index_name = "index"
    client = Mock()
|
||
|
||
@dataclass
class TestConfig(Config):
    """Application configuration overrides used by the test suite."""

    # The "Test" prefix makes pytest try to collect this class wherever it is
    # imported; opt out explicitly since it is a fixture, not a test case.
    __test__ = False

    AZURE_CONFIG = TestAzureSearchConfig()
    FINESSE_DATA_URL = ""
    DEBUG = ""
    TESTING = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import unittest | ||
from unittest.mock import Mock, patch | ||
|
||
import requests | ||
|
||
from app.finesse_data import ( | ||
EmptyQueryError, | ||
FinesseDataFetchException, | ||
fetch_data, | ||
find_best_match, | ||
) | ||
|
||
|
||
class TestFetchData(unittest.TestCase):
    """Tests for ``fetch_data`` and ``find_best_match``.

    HTTP access is replaced by patching ``app.finesse_data.requests.get``; the
    ``fetch_data`` tests also check which requests were issued so that a wrong
    or unexpected network call is caught.
    """

    def setUp(self):
        self.finesse_data_url = "https://example.com/data"
        self.match_threshold = 90
        # Shape of the listing consumed by fetch_data: name + download_url.
        self.files = [
            {"name": "file1.json", "download_url": "https://example.com/file1.json"},
            {"name": "file2.json", "download_url": "https://example.com/file2.json"},
        ]
        self.candidates = [
            "Annual Financial Report",
            "Project Proposal March",
            "Client Contact Information",
            "Product Catalog",
        ]

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_empty_query(self, mock_get):
        with self.assertRaises(EmptyQueryError):
            fetch_data(self.finesse_data_url, "", self.match_threshold)
        # An empty query must be rejected before any network access.
        mock_get.assert_not_called()

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_no_match_found(self, mock_get):
        mock_get.return_value = Mock(status_code=200, json=lambda: self.files)
        result = fetch_data(self.finesse_data_url, "bad query", self.match_threshold)
        self.assertIsNone(result)
        # Only the listing is requested; nothing is downloaded without a match.
        mock_get.assert_called_once()
        self.assertEqual(mock_get.call_args.args[0], self.finesse_data_url)

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_success(self, mock_get):
        mock_get.side_effect = [
            Mock(status_code=200, json=lambda: self.files),
            Mock(status_code=200, json=lambda: {"data": "content"}),
        ]
        result = fetch_data(self.finesse_data_url, "file1", self.match_threshold)
        self.assertEqual(result, {"data": "content"})
        # Positional URL only, so the check is independent of keyword options.
        requested_urls = [call.args[0] for call in mock_get.call_args_list]
        self.assertEqual(
            requested_urls,
            [self.finesse_data_url, "https://example.com/file1.json"],
        )

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_request_exception(self, mock_get):
        mock_get.side_effect = requests.RequestException()
        with self.assertRaises(FinesseDataFetchException):
            fetch_data(self.finesse_data_url, "a query", self.match_threshold)
        # The failure of the first request must stop further requests.
        mock_get.assert_called_once()

    def test_exact_match(self):
        result = find_best_match(
            "Annual Financial Report", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Annual Financial Report")

    def test_misspelled_match(self):
        result = find_best_match(
            "Project Propsal March", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Project Proposal March")

    def test_transposed_letters_match(self):
        result = find_best_match(
            "Cleint Contact Information", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Client Contact Information")

    def test_extra_letter_match(self):
        result = find_best_match(
            "Product Catatalog", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Product Catalog")

    def test_no_match_found(self):
        result = find_best_match(
            "Completely Unrelated String", self.candidates, self.match_threshold
        )
        self.assertIsNone(result)
|
||
|
||
if __name__ == "__main__": | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.