From a836e4a84c82b141948fcc61e6b9f3184c10ec6d Mon Sep 17 00:00:00 2001
From: Anupam Kumar <kyteinsky@gmail.com>
Date: Fri, 20 Dec 2024 00:41:27 +0530
Subject: [PATCH 1/2] fix: decode title and userIds from headers as utf-8

Signed-off-by: Anupam Kumar <kyteinsky@gmail.com>
---
 context_chat_backend/chain/ingest/doc_loader.py |  4 ++--
 context_chat_backend/chain/ingest/injest.py     | 16 ++++++++++++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py
index a5e8d33..4c607a8 100644
--- a/context_chat_backend/chain/ingest/doc_loader.py
+++ b/context_chat_backend/chain/ingest/doc_loader.py
@@ -122,10 +122,10 @@ def decode_source(source: UploadFile) -> str | None:
 	try:
 		# .pot files are powerpoint templates but also plain text files,
 		# so we skip them to prevent decoding errors
-		if source.headers.get('title', '').endswith('.pot'):
+		if source.headers['title'].endswith('.pot'):
 			return None
 
-		mimetype = source.headers.get('type')
+		mimetype = source.headers['type']
 		if mimetype is None:
 			return None
 
diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py
index cd59eda..4a28d1d 100644
--- a/context_chat_backend/chain/ingest/injest.py
+++ b/context_chat_backend/chain/ingest/injest.py
@@ -76,7 +76,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
 
 		metadata = {
 			'source': source.filename,
-			'title': source.headers['title'],
+			'title': _decode_latin_1(source.headers['title']),
 			'type': source.headers['type'],
 		}
 		doc = Document(page_content=content, metadata=metadata)
@@ -86,7 +86,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[
 
 		indocuments.append(InDocument(
 			documents=split_docs,
-			userIds=source.headers['userIds'].split(','),
+			userIds=list(map(_decode_latin_1, source.headers['userIds'].split(','))),
 			source_id=source.filename,  # pyright: ignore[reportArgumentType]
 			provider=source.headers['provider'],
 			modified=to_int(source.headers['modified']),
@@ -114,7 +114,7 @@ def _process_sources(
 			try:
 				vectordb.update_access(
 					UpdateAccessOp.allow,
-					source.headers['userIds'].split(','),
+					list(map(_decode_latin_1, source.headers['userIds'].split(','))),
 					source.filename,  # pyright: ignore[reportArgumentType]
 				)
 			except SafeDbException as e:
@@ -141,6 +141,14 @@ def _process_sources(
 	return added_sources
 
 
+def _decode_latin_1(s: str) -> str:  # re-read the latin-1 bytes of s as utf-8; returns s unchanged on failure
+	try:
+		return s.encode('latin-1').decode('utf-8')
+	except UnicodeError:  # parent of UnicodeEncodeError (s has non latin-1 chars) and UnicodeDecodeError (bytes are not utf-8)
+		print('Failed to decode latin-1:', s, flush=True)
+		return s  # best effort: keep the value as received instead of raising
+
+
 def embed_sources(
 	vectordb_loader: VectorDBLoader,
 	config: TConfig,
@@ -155,7 +163,7 @@ def embed_sources(
 
 	print(
 		'Embedding sources:\n' +
-		'\n'.join([f'{source.filename} ({source.headers["title"]})' for source in sources_filtered]),
+		'\n'.join([f'{source.filename} ({_decode_latin_1(source.headers["title"])})' for source in sources_filtered]),
 		flush=True,
 	)
 

From f88c798b353ae7dab1f5ac9b25925e48412580f7 Mon Sep 17 00:00:00 2001
From: Anupam Kumar <kyteinsky@gmail.com>
Date: Fri, 20 Dec 2024 00:52:24 +0530
Subject: [PATCH 2/2] fix: ignore invalid utf-8 bytes when decoding

Signed-off-by: Anupam Kumar <kyteinsky@gmail.com>
---
 context_chat_backend/chain/ingest/doc_loader.py | 8 ++++----
 context_chat_backend/ocs_utils.py               | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py
index 4c607a8..b4f03a0 100644
--- a/context_chat_backend/chain/ingest/doc_loader.py
+++ b/context_chat_backend/chain/ingest/doc_loader.py
@@ -31,7 +31,7 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str
 			os.remove(tmp.name)
 
 	if isinstance(docs, str) or isinstance(docs, bytes):
-		return docs.decode('utf-8') if isinstance(docs, bytes) else docs  # pyright: ignore[reportReturnType]
+		return docs.decode('utf-8', 'ignore') if isinstance(docs, bytes) else docs  # pyright: ignore[reportReturnType]
 
 	return sep.join(d.page_content for d in docs)
 
@@ -64,11 +64,11 @@ def _load_ppt_x(file: BinaryIO) -> str:
 
 
 def _load_rtf(file: BinaryIO) -> str:
-	return striprtf.rtf_to_text(file.read().decode('utf-8')).strip()
+	return striprtf.rtf_to_text(file.read().decode('utf-8', 'ignore')).strip()
 
 
 def _load_xml(file: BinaryIO) -> str:
-	data = file.read().decode('utf-8')
+	data = file.read().decode('utf-8', 'ignore')
 	data = re.sub(r'</.+>', '', data)
 	return data.strip()
 
@@ -134,7 +134,7 @@ def decode_source(source: UploadFile) -> str | None:
 			source.file.close()
 			return result
 
-		result = source.file.read().decode('utf-8')
+		result = source.file.read().decode('utf-8', 'ignore')
 		source.file.close()
 		return result
 	except Exception:
diff --git a/context_chat_backend/ocs_utils.py b/context_chat_backend/ocs_utils.py
index 8ef3698..461e09a 100644
--- a/context_chat_backend/ocs_utils.py
+++ b/context_chat_backend/ocs_utils.py
@@ -44,7 +44,7 @@ def _verify_signature(headers: Headers) -> str | None:
 		)
 		return None
 
-	auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8')
+	auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8', 'ignore')
 	username, app_secret = auth_aa.split(':', maxsplit=1)
 
 	if app_secret != getenv('APP_SECRET'):