From 69e3dbc8cccea93463f3cefcc6cc4119e612d0b8 Mon Sep 17 00:00:00 2001
From: "Benjamin D. Brodie" <113984758+bdb-dd@users.noreply.github.com>
Date: Wed, 28 Feb 2024 16:46:21 +0100
Subject: [PATCH] Store sha1 hexdigest of markdown content (#135)

* Store sha1 hexdigest of markdown content
- Don't generate new phrases if the content has not changed
- Remove existing phrases before uploading new ones
---
 docs_qa/generate_search_phrases.py | 31 ++++++++++++++++++++++++++++--
 docs_qa/typesense_search.py        |  4 +++-
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/docs_qa/generate_search_phrases.py b/docs_qa/generate_search_phrases.py
index c20bebc..d701016 100644
--- a/docs_qa/generate_search_phrases.py
+++ b/docs_qa/generate_search_phrases.py
@@ -5,6 +5,7 @@
 import pprint
 import typesense
 import datetime
+import hashlib
 
 from langchain.pydantic_v1 import BaseModel, Field
 from langchain.prompts import ChatPromptTemplate
@@ -138,8 +139,22 @@ async def run(collection_name_tmp):
             existing_phrases = await lookup_search_phrases(url, collection_name_tmp)
             # pp.pprint(existing_phrases)
 
-            if existing_phrases.get('found', 0) > 0:
-                print(f'Found existing phrases, skipping for url: {url}')            
+            content_md = search_hit.get('content_markdown','')
+            checksum_md = hashlib.sha1(content_md.encode()).hexdigest() if content_md else None
+
+            existing_phrase_count = existing_phrases.get('found', 0)
+
+            if existing_phrase_count > 0:
+                stored_checksum = existing_phrases.get('hits', [])[0].get('document', {}).get('checksum', '')
+                checksum_matches = stored_checksum == checksum_md
+
+                if checksum_matches:
+                    print(f'Found existing phrases and checksum matches, skipping for url: {url}')            
+                    doc_index += 1      
+                    continue
+
+            if existing_phrases.get('checksum', '') == checksum_md:
+                print(f'Found existing phrases and checksum matches, skipping for url: {url}')            
                 doc_index += 1    
                 continue
 
@@ -165,6 +180,16 @@ async def run(collection_name_tmp):
 
             print(f'Generated search phrases for: {url}\n')
 
+            # delete existing search phrases before uploading new
+            for document in existing_phrases.get('hits', []):
+                doc_id = document.get('document', {}).get('id', '')
+                if doc_id:
+                    try:
+                        client.collections[collection_name_tmp].documents[doc_id].delete()
+                        print(f'Search phrase ID {doc_id} deleted for url {url}')
+                    except typesense.exceptions.ObjectNotFound:
+                        print(f'Search phrase ID {doc_id} not found in collection "{collection_name_tmp}"')
+
             upload_batch = []
             
             
@@ -176,6 +201,8 @@ async def run(collection_name_tmp):
                     'search_phrase': phrase.get('search_phrase', ''),
                     'sort_order': index,
                     'item_priority': 1,
+                    'updated_at': int(datetime.datetime.utcnow().timestamp()),
+                    'checksum': checksum_md,
                 }
                 upload_batch.append(batch)
 
diff --git a/docs_qa/typesense_search.py b/docs_qa/typesense_search.py
index 691f705..23bd142 100644
--- a/docs_qa/typesense_search.py
+++ b/docs_qa/typesense_search.py
@@ -194,6 +194,8 @@ def setup_search_phrase_schema(collection_name_tmp):
                             }}},
                 {'name': 'language', 'type': 'string', 'facet': True, 'optional': True},
                 {'name': 'item_priority', 'type': 'int64'},
+                {'name': 'updated_at', 'type': 'int64'},
+                {'name': 'checksum', 'type': 'string'}
             ],
             'default_sorting_field': 'sort_order',
             'token_separators': ['_', '-', '/']
@@ -218,7 +220,7 @@ async def lookup_search_phrases(url, collection_name: str):
                     "collection": collection_name,
                     "q": "*",
                     "query_by":"url",
-                    "include_fields": "id,url,search_phrase,sort_order",
+                    "include_fields": "id,url,search_phrase,sort_order,updated_at,checksum",
                     "filter_by": f"url:={url}",
                     "sort_by": "sort_order:asc",                
                     "per_page": 30,