Skip to content

Commit

Permalink
Store sha1 hexdigest of markdown content (#135)
Browse files Browse the repository at this point in the history
* Store sha1 hexdigest of markdown content
- don't generate new phrases if the content has not changed
- Remove existing phrases before uploading new ones
  • Loading branch information
bdb-dd authored Feb 28, 2024
1 parent 8ca8e6f commit 69e3dbc
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
31 changes: 29 additions & 2 deletions docs_qa/generate_search_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pprint
import typesense
import datetime
import hashlib

from langchain.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate
Expand Down Expand Up @@ -138,8 +139,22 @@ async def run(collection_name_tmp):
existing_phrases = await lookup_search_phrases(url, collection_name_tmp)
# pp.pprint(existing_phrases)

if existing_phrases.get('found', 0) > 0:
print(f'Found existing phrases, skipping for url: {url}')
content_md = search_hit.get('content_markdown','')
checksum_md = hashlib.sha1(content_md.encode()).hexdigest() if content_md else None

existing_phrase_count = existing_phrases.get('found', 0)

if existing_phrase_count > 0:
stored_checksum = existing_phrases.get('hits', [])[0].get('document', {}).get('checksum', '')
checksum_matches = stored_checksum == checksum_md

if checksum_matches:
print(f'Found existing phrases and checksum matches, skipping for url: {url}')
doc_index += 1
continue

if existing_phrases.get('checksum', '') == checksum_md:
print(f'Found existing phrases and checksum matches, skipping for url: {url}')
doc_index += 1
continue

Expand All @@ -165,6 +180,16 @@ async def run(collection_name_tmp):

print(f'Generated search phrases for: {url}\n')

# delete existing search phrases before uploading new
for document in existing_phrases.get('hits', []):
doc_id = document.get('document', {}).get('id', '')
if doc_id:
try:
client.collections[collection_name_tmp].documents[doc_id].delete()
print(f'Search phrase ID {doc_id} deleted for url {url}')
except typesense.exceptions.ObjectNotFound:
print(f'Search phrase ID {doc_id} not found in collection "{collection_name_tmp}"')

upload_batch = []


Expand All @@ -176,6 +201,8 @@ async def run(collection_name_tmp):
'search_phrase': phrase.get('search_phrase', ''),
'sort_order': index,
'item_priority': 1,
'updated_at': int(datetime.datetime.utcnow().timestamp()),
'checksum': checksum_md,
}
upload_batch.append(batch)

Expand Down
4 changes: 3 additions & 1 deletion docs_qa/typesense_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ def setup_search_phrase_schema(collection_name_tmp):
}}},
{'name': 'language', 'type': 'string', 'facet': True, 'optional': True},
{'name': 'item_priority', 'type': 'int64'},
{'name': 'updated_at', 'type': 'int64'},
{'name': 'checksum', 'type': 'string'}
],
'default_sorting_field': 'sort_order',
'token_separators': ['_', '-', '/']
Expand All @@ -218,7 +220,7 @@ async def lookup_search_phrases(url, collection_name: str):
"collection": collection_name,
"q": "*",
"query_by":"url",
"include_fields": "id,url,search_phrase,sort_order",
"include_fields": "id,url,search_phrase,sort_order,updated_at,checksum",
"filter_by": f"url:={url}",
"sort_by": "sort_order:asc",
"per_page": 30,
Expand Down

0 comments on commit 69e3dbc

Please sign in to comment.