From 69e3dbc8cccea93463f3cefcc6cc4119e612d0b8 Mon Sep 17 00:00:00 2001 From: "Benjamin D. Brodie" <113984758+bdb-dd@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:46:21 +0100 Subject: [PATCH] Store sha1 hexdigest of markdown content (#135) * Store sha1 hexdigest of markdown content - don't generate new phrases if content not changed - Remove existing phrases before uploading new --- docs_qa/generate_search_phrases.py | 31 ++++++++++++++++++++++++++++-- docs_qa/typesense_search.py | 4 +++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/docs_qa/generate_search_phrases.py b/docs_qa/generate_search_phrases.py index c20bebc..d701016 100644 --- a/docs_qa/generate_search_phrases.py +++ b/docs_qa/generate_search_phrases.py @@ -5,6 +5,7 @@ import pprint import typesense import datetime +import hashlib from langchain.pydantic_v1 import BaseModel, Field from langchain.prompts import ChatPromptTemplate @@ -138,8 +139,22 @@ async def run(collection_name_tmp): existing_phrases = await lookup_search_phrases(url, collection_name_tmp) # pp.pprint(existing_phrases) - if existing_phrases.get('found', 0) > 0: - print(f'Found existing phrases, skipping for url: {url}') + content_md = search_hit.get('content_markdown','') + checksum_md = hashlib.sha1(content_md.encode()).hexdigest() if content_md else None + + existing_phrase_count = existing_phrases.get('found', 0) + + if existing_phrase_count > 0: + stored_checksum = existing_phrases.get('hits', [])[0].get('document', {}).get('checksum', '') + checksum_matches = stored_checksum == checksum_md + + if checksum_matches: + print(f'Found existing phrases and checksum matches, skipping for url: {url}') + doc_index += 1 + continue + + if existing_phrases.get('checksum', '') == checksum_md: + print(f'Found existing phrases and checksum matches, skipping for url: {url}') doc_index += 1 continue @@ -165,6 +180,16 @@ async def run(collection_name_tmp): print(f'Generated search phrases for: {url}\n') + # delete existing search phrases before uploading new + for document in existing_phrases.get('hits', []): + doc_id = document.get('document', {}).get('id', '') + if doc_id: + try: + client.collections[collection_name_tmp].documents[doc_id].delete() + print(f'Search phrase ID {doc_id} deleted for url {url}') + except typesense.exceptions.ObjectNotFound: + print(f'Search phrase ID {doc_id} not found in collection "{collection_name_tmp}"') + upload_batch = [] @@ -176,6 +201,8 @@ async def run(collection_name_tmp): 'search_phrase': phrase.get('search_phrase', ''), 'sort_order': index, 'item_priority': 1, + 'updated_at': int(datetime.datetime.utcnow().timestamp()), + 'checksum': checksum_md, } upload_batch.append(batch) diff --git a/docs_qa/typesense_search.py b/docs_qa/typesense_search.py index 691f705..23bd142 100644 --- a/docs_qa/typesense_search.py +++ b/docs_qa/typesense_search.py @@ -194,6 +194,8 @@ def setup_search_phrase_schema(collection_name_tmp): }}}, {'name': 'language', 'type': 'string', 'facet': True, 'optional': True}, {'name': 'item_priority', 'type': 'int64'}, + {'name': 'updated_at', 'type': 'int64'}, + {'name': 'checksum', 'type': 'string'} ], 'default_sorting_field': 'sort_order', 'token_separators': ['_', '-', '/'] @@ -218,7 +220,7 @@ async def lookup_search_phrases(url, collection_name: str): "collection": collection_name, "q": "*", "query_by":"url", - "include_fields": "id,url,search_phrase,sort_order", + "include_fields": "id,url,search_phrase,sort_order,updated_at,checksum", "filter_by": f"url:={url}", "sort_by": "sort_order:asc", "per_page": 30,