Skip to content

Commit

Permalink
fix: utf-8 encoding fixes (#118)
Browse files Browse the repository at this point in the history
fixes #71
  • Loading branch information
kyteinsky authored Dec 20, 2024
2 parents c3cc44b + f88c798 commit be82b42
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 11 deletions.
12 changes: 6 additions & 6 deletions context_chat_backend/chain/ingest/doc_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str
os.remove(tmp.name)

if isinstance(docs, str) or isinstance(docs, bytes):
return docs.decode('utf-8') if isinstance(docs, bytes) else docs # pyright: ignore[reportReturnType]
return docs.decode('utf-8', 'ignore') if isinstance(docs, bytes) else docs # pyright: ignore[reportReturnType]

return sep.join(d.page_content for d in docs)

Expand Down Expand Up @@ -64,11 +64,11 @@ def _load_ppt_x(file: BinaryIO) -> str:


def _load_rtf(file: BinaryIO) -> str:
return striprtf.rtf_to_text(file.read().decode('utf-8')).strip()
return striprtf.rtf_to_text(file.read().decode('utf-8', 'ignore')).strip()


def _load_xml(file: BinaryIO) -> str:
data = file.read().decode('utf-8')
data = file.read().decode('utf-8', 'ignore')
data = re.sub(r'</.+>', '', data)
return data.strip()

Expand Down Expand Up @@ -122,10 +122,10 @@ def decode_source(source: UploadFile) -> str | None:
try:
# .pot files are powerpoint templates but also plain text files,
# so we skip them to prevent decoding errors
if source.headers.get('title', '').endswith('.pot'):
if source.headers['title'].endswith('.pot'):
return None

mimetype = source.headers.get('type')
mimetype = source.headers['type']
if mimetype is None:
return None

Expand All @@ -134,7 +134,7 @@ def decode_source(source: UploadFile) -> str | None:
source.file.close()
return result

result = source.file.read().decode('utf-8')
result = source.file.read().decode('utf-8', 'ignore')
source.file.close()
return result
except Exception:
Expand Down
16 changes: 12 additions & 4 deletions context_chat_backend/chain/ingest/injest.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[

metadata = {
'source': source.filename,
'title': source.headers['title'],
'title': _decode_latin_1(source.headers['title']),
'type': source.headers['type'],
}
doc = Document(page_content=content, metadata=metadata)
Expand All @@ -86,7 +86,7 @@ def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[

indocuments.append(InDocument(
documents=split_docs,
userIds=source.headers['userIds'].split(','),
userIds=list(map(_decode_latin_1, source.headers['userIds'].split(','))),
source_id=source.filename, # pyright: ignore[reportArgumentType]
provider=source.headers['provider'],
modified=to_int(source.headers['modified']),
Expand Down Expand Up @@ -114,7 +114,7 @@ def _process_sources(
try:
vectordb.update_access(
UpdateAccessOp.allow,
source.headers['userIds'].split(','),
list(map(_decode_latin_1, source.headers['userIds'].split(','))),
source.filename, # pyright: ignore[reportArgumentType]
)
except SafeDbException as e:
Expand All @@ -141,6 +141,14 @@ def _process_sources(
return added_sources


def _decode_latin_1(s: str) -> str:
try:
return s.encode('latin-1').decode('utf-8')
except UnicodeDecodeError:
print('Failed to decode latin-1:', s, flush=True)
return s


def embed_sources(
vectordb_loader: VectorDBLoader,
config: TConfig,
Expand All @@ -155,7 +163,7 @@ def embed_sources(

print(
'Embedding sources:\n' +
'\n'.join([f'{source.filename} ({source.headers["title"]})' for source in sources_filtered]),
'\n'.join([f'{source.filename} ({_decode_latin_1(source.headers["title"])})' for source in sources_filtered]),
flush=True,
)

Expand Down
2 changes: 1 addition & 1 deletion context_chat_backend/ocs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _verify_signature(headers: Headers) -> str | None:
)
return None

auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8')
auth_aa = b64decode(headers.get('AUTHORIZATION-APP-API', '')).decode('UTF-8', 'ignore')
username, app_secret = auth_aa.split(':', maxsplit=1)

if app_secret != getenv('APP_SECRET'):
Expand Down

0 comments on commit be82b42

Please sign in to comment.