Skip to content

Commit

Permalink
Merge pull request #7 from ai-cfia/k-allagbe/issue2-search-from-stati…
Browse files Browse the repository at this point in the history
…c-finesse-data

K-allagbe/issue2-search-from-static-finesse-data
  • Loading branch information
k-allagbe authored Nov 27, 2023
2 parents ceaf226 + 1fa9741 commit fc4c1b6
Show file tree
Hide file tree
Showing 14 changed files with 329 additions and 26 deletions.
28 changes: 23 additions & 5 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,19 +1,37 @@
# FINESSE_BACKEND_AZURE_SEARCH_ENDPOINT:
# Endpoint URL of Azure Cognitive Search service. Format:
# https://[service-name].search.windows.net
FINESSE_BACKEND_AZURE_SEARCH_ENDPOINT=<Azure-Search-Service-Endpoint>

# FINESSE_BACKEND_AZURE_SEARCH_API_KEY:
# API key for Azure Cognitive Search. Used for operations such as
# querying the search index.
FINESSE_BACKEND_AZURE_SEARCH_API_KEY=<Azure-Search-API-Key>

# FINESSE_BACKEND_AZURE_SEARCH_INDEX_NAME:
# Name of the search index in Azure Cognitive Search. Contains documents
# for search operations.
FINESSE_BACKEND_AZURE_SEARCH_INDEX_NAME=<Search-Index-Name>

# FINESSE_BACKEND_DEBUG_MODE:
# Boolean flag to enable or disable debug mode for the application.
# Defaults to False when not set.
# Defaults to False when not set. Optional.
# FINESSE_BACKEND_DEBUG_MODE=<True/False>

# URL of the GitHub contents API listing for the finesse-data static files.
# Read by app/config.py as FINESSE_BACKEND_STATIC_FILE_URL.
FINESSE_BACKEND_STATIC_FILE_URL=https://api.github.com/repos/ai-cfia/finesse-data/contents

# Message for empty search query errors. Optional.
# FINESSE_BACKEND_ERROR_EMPTY_QUERY="Search query cannot be empty"

# Message for Azure search failures. Optional.
# FINESSE_BACKEND_ERROR_AZURE_FAILED="Azure index search failed."

# Message for Finesse data search failures. Optional.
# FINESSE_BACKEND_ERROR_FINESSE_DATA_FAILED="finesse-data static search failed"

# Message for unexpected errors. Optional.
# FINESSE_BACKEND_ERROR_UNEXPECTED="Unexpected error."

# Threshold for fuzzy matching queries to finesse-data files. Represents the minimum
# score (out of 100) for a match to be considered close enough. Optional.
# FINESSE_BACKEND_FUZZY_MATCH_THRESHOLD=90

# Regular expression pattern used for sanitizing input to prevent log injection. Optional.
# FINESSE_BACKEND_SANITIZE_PATTERN="[^\w \d\"#\$%&'\(\)\*\+,-\.\/:;?@\^_`{\|}~]+|\%\w+|;|/|\(|\)"
10 changes: 6 additions & 4 deletions app/app_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ def create_app(config: Config):
CORS(app)
app.config.from_object(config)

from .blueprints.monitor import monitor
from .blueprints.search import search
from .blueprints.monitor import monitor_blueprint
from .blueprints.search import search_blueprint

app.register_blueprint(monitor, url_prefix="/health", strict_slashes=False)
app.register_blueprint(search, url_prefix="/search", strict_slashes=False)
app.register_blueprint(
monitor_blueprint, url_prefix="/health", strict_slashes=False
)
app.register_blueprint(search_blueprint, url_prefix="/search", strict_slashes=False)

return app
4 changes: 2 additions & 2 deletions app/blueprints/monitor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from flask import Blueprint

monitor = Blueprint("monitor", __name__)
monitor_blueprint = Blueprint("monitor", __name__)


@monitor.route("", methods=["GET"])
@monitor_blueprint.route("", methods=["GET"])
def health():
    """Answer the health probe with the body ``"ok"`` and HTTP status 200."""
    body, status_code = "ok", 200
    return body, status_code
52 changes: 46 additions & 6 deletions app/blueprints/search.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,51 @@
import re
from functools import wraps

from flask import Blueprint, current_app, jsonify, request
from index_search import search as azure_index_search
from index_search import AzureIndexSearchQueryError, search

from app.finesse_data import FinesseDataFetchException, fetch_data
from app.utils import sanitize

search_blueprint = Blueprint("finesse", __name__)


search = Blueprint("finesse", __name__)
def require_non_empty_query(f):
    """Reject requests without a usable ``query`` before calling ``f``.

    The wrapped view runs only when the request's JSON body is an object
    whose ``"query"`` value is a non-blank string. In every other case a
    400 response carrying the configured ``ERROR_EMPTY_QUERY`` message is
    returned and ``f`` is not called.
    """

    @wraps(f)
    def decorated_function(*args, **kwargs):
        payload = request.json
        # A JSON body may be a list, string, number or null; only an object
        # can carry a "query" key, so anything else is rejected here rather
        # than raising AttributeError on ``.get``.
        query = payload.get("query") if isinstance(payload, dict) else None
        # The views hand the value straight to a regex substitution, so a
        # non-string (or whitespace-only) value is treated like a missing one.
        if not isinstance(query, str) or not query.strip():
            return jsonify({"message": current_app.config["ERROR_EMPTY_QUERY"]}), 400
        return f(*args, **kwargs)

    return decorated_function


@search_blueprint.route("/azure", methods=["POST"])
@require_non_empty_query
def search_azure():
    """Run the posted ``query`` against the Azure index and return the results as JSON.

    The query is cleaned with the configured ``SANITIZE_PATTERN`` before it
    is passed to ``search``. Failures are answered with HTTP 500 and the
    configured error message: ``ERROR_AZURE_FAILED`` for
    ``AzureIndexSearchQueryError``, ``ERROR_UNEXPECTED`` for anything else.
    """
    query = request.json["query"]
    try:
        query = sanitize(query, current_app.config["SANITIZE_PATTERN"])
        results = search(query, current_app.config["AZURE_CONFIG"])
        return jsonify(results)
    except AzureIndexSearchQueryError:
        # Record the traceback; the client only ever sees the generic message.
        current_app.logger.exception("Azure index search failed")
        return jsonify({"error": current_app.config["ERROR_AZURE_FAILED"]}), 500
    except Exception:
        # Top-level boundary of the view: log before masking the failure.
        current_app.logger.exception("Unexpected error during Azure index search")
        return jsonify({"error": current_app.config["ERROR_UNEXPECTED"]}), 500


@search.route("", methods=["POST"])
def search_documents():
@search_blueprint.route("/static", methods=["POST"])
@require_non_empty_query
def search_static():
    """Look the posted ``query`` up in the static finesse-data files and return JSON.

    The query is cleaned with the configured ``SANITIZE_PATTERN`` and then
    handed to ``fetch_data`` together with the configured
    ``FINESSE_DATA_URL`` and ``FUZZY_MATCH_THRESHOLD``. Whatever
    ``fetch_data`` returns is serialised with ``jsonify``. Failures are
    answered with HTTP 500 and the configured error message:
    ``ERROR_FINESSE_DATA_FAILED`` for ``FinesseDataFetchException``,
    ``ERROR_UNEXPECTED`` for anything else.
    """
    finesse_data_url = current_app.config["FINESSE_DATA_URL"]
    match_threshold = current_app.config["FUZZY_MATCH_THRESHOLD"]
    query = request.json["query"]
    try:
        query = sanitize(query, current_app.config["SANITIZE_PATTERN"])
        data = fetch_data(finesse_data_url, query, match_threshold)
        return jsonify(data)
    except FinesseDataFetchException:
        # Record the traceback; the client only ever sees the generic message.
        current_app.logger.exception("finesse-data static search failed")
        return jsonify({"error": current_app.config["ERROR_FINESSE_DATA_FAILED"]}), 500
    except Exception:
        # Top-level boundary of the view: log before masking the failure.
        current_app.logger.exception("Unexpected error during finesse-data static search")
        return jsonify({"error": current_app.config["ERROR_UNEXPECTED"]}), 500
30 changes: 30 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@
load_dotenv()

# Fallback values used when the matching FINESSE_BACKEND_* environment
# variable is not set.
DEFAULT_DEBUG_MODE = "False"
DEFAULT_ERROR_EMPTY_QUERY = "Search query cannot be empty"
DEFAULT_ERROR_AZURE_FAILED = "Azure index search failed."
DEFAULT_ERROR_FINESSE_DATA_FAILED = "finesse-data static search failed"
DEFAULT_ERROR_UNEXPECTED = "Unexpected error."
DEFAULT_FUZZY_MATCH_THRESHOLD = 90
# Raw string so the regex escapes (\w, \d, \$, ...) reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.
DEFAULT_SANITIZE_PATTERN = (
    r"[^\w \d\"#\$%&'\(\)\*\+,-\.\/:;?@\^_`{\|}~]+|\%\w+|;|/|\(|\)"
)


@dataclass
Expand All @@ -23,6 +31,28 @@ class Config:
AzureKeyCredential(os.getenv("FINESSE_BACKEND_AZURE_SEARCH_API_KEY")),
),
)
FINESSE_DATA_URL = os.getenv("FINESSE_BACKEND_STATIC_FILE_URL")
DEBUG = (
os.getenv("FINESSE_BACKEND_DEBUG_MODE", DEFAULT_DEBUG_MODE).lower() == "true"
)
ERROR_EMPTY_QUERY = os.getenv(
"FINESSE_BACKEND_ERROR_EMPTY_QUERY", DEFAULT_ERROR_EMPTY_QUERY
)
ERROR_AZURE_FAILED = os.getenv(
"FINESSE_BACKEND_ERROR_AZURE_FAILED", DEFAULT_ERROR_AZURE_FAILED
)
ERROR_FINESSE_DATA_FAILED = os.getenv(
"FINESSE_BACKEND_ERROR_FINESSE_DATA_FAILED",
DEFAULT_ERROR_FINESSE_DATA_FAILED,
)
ERROR_UNEXPECTED = os.getenv(
"FINESSE_BACKEND_ERROR_UNEXPECTED", DEFAULT_ERROR_UNEXPECTED
)
FUZZY_MATCH_THRESHOLD = int(
os.getenv(
"FINESSE_BACKEND_FUZZY_MATCH_THRESHOLD", DEFAULT_FUZZY_MATCH_THRESHOLD
)
)
SANITIZE_PATTERN = os.getenv(
"FINESSE_BACKEND_SANITIZE_PATTERN", DEFAULT_SANITIZE_PATTERN
)
40 changes: 40 additions & 0 deletions app/finesse_data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import logging

import requests
from fuzzywuzzy import process


class FinesseDataFetchException(Exception):
    """Signals that retrieving data from finesse-data failed."""


class EmptyQueryError(Exception):
    """Signals that a search was attempted with an empty query."""


def find_best_match(search_string, candidates, match_threshold):
    """Return the candidate closest to ``search_string``, or ``None``.

    ``process.extractOne`` picks the top candidate; its first element is the
    candidate and its second the score. The candidate is returned only when
    that score is at least ``match_threshold``. When nothing is returned by
    ``extractOne`` or the score is too low, the miss is logged at INFO level
    and ``None`` is returned.
    """
    top_hit = process.extractOne(search_string, candidates)
    if top_hit and top_hit[1] >= match_threshold:
        return top_hit[0]
    logging.info(f"No close match found for search string: {search_string}")
    return None


def fetch_data(finesse_data_url, query, match_threshold, timeout=10):
    """Fetch the finesse-data file whose name best matches ``query``.

    The listing at ``finesse_data_url`` is downloaded and indexed by each
    entry's ``"name"``. The name closest to ``query`` is chosen with
    ``find_best_match``, and that entry's ``"download_url"`` is fetched and
    decoded as JSON.

    Args:
        finesse_data_url: URL returning a JSON list of file entries.
        query: Search string to match against the file names.
        match_threshold: Minimum score for a name to count as a match.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded JSON content of the matched file, or ``None`` when no
        file name matches closely enough.

    Raises:
        EmptyQueryError: If ``query`` is empty.
        FinesseDataFetchException: If any HTTP request fails.
    """
    if not query:
        logging.error("Empty search query received")
        raise EmptyQueryError("Search query cannot be empty")

    try:
        response = requests.get(finesse_data_url, timeout=timeout)
        response.raise_for_status()
        files = response.json()
        file_map = {file["name"]: file for file in files}
        best_match = find_best_match(query, file_map.keys(), match_threshold)
        if not best_match:
            # No sufficiently close file name: nothing to download.
            return None
        matching_file = file_map[best_match]
        results_response = requests.get(
            matching_file["download_url"], timeout=timeout
        )
        results_response.raise_for_status()
        return results_response.json()
    except requests.RequestException as e:
        logging.error(f"finesse-data fetch failed: {e}", exc_info=True)
        raise FinesseDataFetchException(f"finesse-data fetch failed: {e}") from e
6 changes: 6 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import re


def sanitize(input, pattern):
    """Return ``input`` with every match of ``pattern`` removed.

    Used to mitigate log injection risks.
    """
    scrubber = re.compile(pattern)
    return scrubber.sub("", input)
2 changes: 2 additions & 0 deletions requirements-production.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ flask-cors==4.0.0 # Released: 2023-06-26
gunicorn==21.2.0 # Released: 2023-07-19
python-dotenv==1.0.0 # Released: 2023-02-24
git+https://github.com/ai-cfia/azure-db.git@main#subdirectory=azure-ai-search
fuzzywuzzy==0.18.0
python-Levenshtein== 0.23.0
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ flask-cors
gunicorn
python-dotenv
git+https://github.com/ai-cfia/azure-db.git@main#subdirectory=azure-ai-search
fuzzywuzzy
python-Levenshtein
10 changes: 10 additions & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
from dataclasses import dataclass
from unittest.mock import Mock

from app.config import Config


@dataclass
class TestAzureSearchConfig:
    """Placeholder Azure search configuration used by the test suite."""

    # Dummy connection settings; none of them point at a real service.
    endpoint = "endpoint"
    api_key = "api_key"
    index_name = "index"
    # Mock stands in for the search client.
    client = Mock()


@dataclass
class TestConfig(Config):
    """Application configuration overrides used when running the tests."""

    # Swap the Azure configuration for the placeholder one defined above.
    AZURE_CONFIG = TestAzureSearchConfig()
    FINESSE_DATA_URL = ""
    DEBUG = ""
    TESTING = True
87 changes: 87 additions & 0 deletions tests/test_finesse_data_fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import unittest
from unittest.mock import Mock, patch

import requests

from app.finesse_data import (
EmptyQueryError,
FinesseDataFetchException,
fetch_data,
find_best_match,
)


class TestFetchData(unittest.TestCase):
    """Tests for ``fetch_data`` and ``find_best_match`` with HTTP calls mocked."""

    def setUp(self):
        self.finesse_data_url = "https://example.com/data"
        self.match_threshold = 90
        # Shape of the listing consumed by fetch_data: name + download_url.
        self.files = [
            {"name": "file1.json", "download_url": "https://example.com/file1.json"},
            {"name": "file2.json", "download_url": "https://example.com/file2.json"},
        ]
        self.candidates = [
            "Annual Financial Report",
            "Project Proposal March",
            "Client Contact Information",
            "Product Catalog",
        ]

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_empty_query(self, mock_get):
        with self.assertRaises(EmptyQueryError):
            fetch_data(self.finesse_data_url, "", self.match_threshold)
        # An empty query must be rejected before any request is made.
        mock_get.assert_not_called()

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_no_match_found(self, mock_get):
        mock_get.return_value = Mock(status_code=200, json=lambda: self.files)
        result = fetch_data(self.finesse_data_url, "bad query", self.match_threshold)
        self.assertIsNone(result)
        # Only the listing is fetched; no file download is attempted.
        self.assertEqual(mock_get.call_count, 1)

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_success(self, mock_get):
        mock_get.side_effect = [
            Mock(status_code=200, json=lambda: self.files),
            Mock(status_code=200, json=lambda: {"data": "content"}),
        ]
        result = fetch_data(self.finesse_data_url, "file1", self.match_threshold)
        self.assertEqual(result, {"data": "content"})
        # First the listing, then the matched file's download URL.
        self.assertEqual(mock_get.call_count, 2)
        self.assertEqual(mock_get.call_args_list[0][0][0], self.finesse_data_url)
        self.assertEqual(
            mock_get.call_args_list[1][0][0], "https://example.com/file1.json"
        )

    @patch("app.finesse_data.requests.get")
    def test_fetch_data_request_exception(self, mock_get):
        mock_get.side_effect = requests.RequestException()
        with self.assertRaises(FinesseDataFetchException):
            fetch_data(self.finesse_data_url, "a query", self.match_threshold)

    def test_exact_match(self):
        result = find_best_match(
            "Annual Financial Report", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Annual Financial Report")

    def test_misspelled_match(self):
        result = find_best_match(
            "Project Propsal March", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Project Proposal March")

    def test_transposed_letters_match(self):
        result = find_best_match(
            "Cleint Contact Information", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Client Contact Information")

    def test_extra_letter_match(self):
        result = find_best_match(
            "Product Catatalog", self.candidates, self.match_threshold
        )
        self.assertEqual(result, "Product Catalog")

    def test_no_match_found(self):
        result = find_best_match(
            "Completely Unrelated String", self.candidates, self.match_threshold
        )
        self.assertIsNone(result)


if __name__ == "__main__":
unittest.main()
11 changes: 2 additions & 9 deletions tests/test_monitor.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
import unittest
from dataclasses import dataclass

from app.app_creator import create_app
from app.config import Config
from tests.common import TestAzureSearchConfig
from tests.common import TestConfig


@dataclass
class TestConfig(Config):
app_config = TestAzureSearchConfig()


class TestMonitorBlueprint(unittest.TestCase):
class TestMonitor(unittest.TestCase):
def setUp(self):
self.config = TestConfig()
self.app = create_app(self.config)
Expand Down
Loading

0 comments on commit fc4c1b6

Please sign in to comment.