From a836e4a84c82b141948fcc61e6b9f3184c10ec6d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 20 Dec 2024 00:41:27 +0530 Subject: [PATCH 1/2] fix: decode title from headers as utf-8 Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/doc_loader.py | 4 ++-- context_chat_backend/chain/ingest/injest.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index a5e8d33..4c607a8 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -122,10 +122,10 @@ def decode_source(source: UploadFile) -> str | None: try: # .pot files are powerpoint templates but also plain text files, # so we skip them to prevent decoding errors - if source.headers.get('title', '').endswith('.pot'): + if source.headers['title'].endswith('.pot'): return None - mimetype = source.headers.get('type') + mimetype = source.headers['type'] if mimetype is None: return None diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index cd59eda..4a28d1d 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -76,7 +76,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[ metadata = { 'source': source.filename, - 'title': source.headers['title'], + 'title': _decode_latin_1(source.headers['title']), 'type': source.headers['type'], } doc = Document(page_content=content, metadata=metadata) @@ -86,7 +86,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[ indocuments.append(InDocument( documents=split_docs, - userIds=source.headers['userIds'].split(','), + userIds=list(map(_decode_latin_1, source.headers['userIds'].split(','))), source_id=source.filename, # pyright: ignore[reportArgumentType] provider=source.headers['provider'], modified=to_int(source.headers['modified']), @@ -114,7 +114,7 @@ def _process_sources( try: vectordb.update_access( UpdateAccessOp.allow, - source.headers['userIds'].split(','), + list(map(_decode_latin_1, source.headers['userIds'].split(','))), source.filename, # pyright: ignore[reportArgumentType] ) except SafeDbException as e: @@ -141,6 +141,14 @@ def _process_sources( return added_sources +def _decode_latin_1(s: str) -> str: + try: + return s.encode('latin-1').decode('utf-8') + except UnicodeDecodeError: + print('Failed to decode latin-1:', s, flush=True) + return s + + def embed_sources( vectordb_loader: VectorDBLoader, config: TConfig, @@ -155,7 +163,7 @@ def embed_sources( print( 'Embedding sources:\n' + - '\n'.join([f'{source.filename} ({source.headers["title"]})' for source in sources_filtered]), + '\n'.join([f'{source.filename} ({_decode_latin_1(source.headers["title"])})' for source in sources_filtered]), flush=True, ) From f88c798b353ae7dab1f5ac9b25925e48412580f7 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 20 Dec 2024 00:52:24 +0530 Subject: [PATCH 2/2] fix: ignore invalid bytes according to utf-8 Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/doc_loader.py | 8 ++++---- context_chat_backend/ocs_utils.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index 4c607a8..b4f03a0 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -31,7 +31,7 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str os.remove(tmp.name) if isinstance(docs, str) or isinstance(docs, bytes): - return docs.decode('utf-8') if isinstance(docs, bytes) else docs # pyright: ignore[reportReturnType] + return docs.decode('utf-8', 'ignore') if isinstance(docs, bytes) else docs # pyright: ignore[reportReturnType] return sep.join(d.page_content for d in docs) @@ -64,11 +64,11 @@ def _load_ppt_x(file: BinaryIO) -> str: def _load_rtf(file: BinaryIO) -> str: - return striprtf.rtf_to_text(file.read().decode('utf-8')).strip() + return striprtf.rtf_to_text(file.read().decode('utf-8', 'ignore')).strip() def _load_xml(file: BinaryIO) -> str: - data = file.read().decode('utf-8') + data = file.read().decode('utf-8', 'ignore') data = re.sub(r'', '', data) return data.strip() @@ -134,7 +134,7 @@ def decode_source(source: UploadFile) -> str | None: source.file.close() return result - result = source.file.read().decode('utf-8') + result = source.file.read().decode('utf-8', 'ignore') source.file.close() return result except Exception: diff --git a/context_chat_backend/ocs_utils.py b/context_chat_backend/ocs_utils.py index 8ef3698..461e09a 100644 --- a/context_chat_backend/ocs_utils.py +++ b/context_chat_backend/ocs_utils.py @@ -44,7 +44,7 @@ def _verify_signature(headers: Headers) -> str | None: ) return None - auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8') + auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8', 'ignore') username, app_secret = auth_aa.split(':', maxsplit=1) if app_secret != getenv('APP_SECRET'):