Skip to content

Commit

Permalink
Merge pull request #19 from brc-dd/fix/18
Browse files (browse the repository at this point in the history)
Fix character decoding issues with text-like files
  • Loading branch information
gagb authored Dec 16, 2024
2 parents c9c7d98 + aeff2cb commit ed91e8b
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 3 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ dependencies = [
"youtube-transcript-api",
"SpeechRecognition",
"pathvalidate",
"charset-normalizer",
]

[project.urls]
Expand Down
5 changes: 2 additions & 3 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import puremagic
import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path

# Optional Transcription support
try:
Expand Down Expand Up @@ -161,9 +162,7 @@ def convert(
elif "text/" not in content_type.lower():
return None

text_content = ""
with open(local_path, "rt", encoding="utf-8") as fh:
text_content = fh.read()
text_content = str(from_path(local_path).best())
return DocumentConverterResult(
title=None,
text_content=text_content,
Expand Down
4 changes: 4 additions & 0 deletions tests/test_files/test_mskanji.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
名前,年齢,住所
佐藤太郎,30,東京
三木英子,25,大阪
髙橋淳,35,名古屋
13 changes: 13 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@
"data:image/svg+xml,%3Csvg%20width%3D",
]

# Rows expected in the converted text of tests/test_files/test_mskanji.csv,
# the non-UTF-8 fixture. test_markitdown_local asserts that each of these
# strings appears in the conversion result.
# NOTE(review): the constant and fixture names suggest the file is stored as
# CP932 (a.k.a. MS-Kanji / Windows Shift_JIS) — confirm against the fixture.
CSV_CP932_TEST_STRINGS = [
"名前,年齢,住所",
"佐藤太郎,30,東京",
"三木英子,25,大阪",
"髙橋淳,35,名古屋",
]


@pytest.mark.skipif(
skip_remote,
Expand Down Expand Up @@ -164,6 +171,12 @@ def test_markitdown_local() -> None:
for test_string in SERP_TEST_STRINGS:
assert test_string in text_content

## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
text_content = result.text_content.replace("\\", "")
for test_string in CSV_CP932_TEST_STRINGS:
assert test_string in text_content


@pytest.mark.skipif(
skip_exiftool,
Expand Down

0 comments on commit ed91e8b

Please sign in to comment.