From 52b723724c33b76cf3a2ee1e4d636ee81312e388 Mon Sep 17 00:00:00 2001 From: Divyansh Singh <40380293+brc-dd@users.noreply.github.com> Date: Sun, 15 Dec 2024 10:37:15 +0530 Subject: [PATCH] Fix character decoding issues with text-like files --- pyproject.toml | 1 + src/markitdown/_markitdown.py | 5 ++--- tests/test_files/test_mskanji.csv | 4 ++++ tests/test_markitdown.py | 13 +++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tests/test_files/test_mskanji.csv diff --git a/pyproject.toml b/pyproject.toml index 74df032..756380a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "youtube-transcript-api", "SpeechRecognition", "pathvalidate", + "charset-normalizer", ] [project.urls] diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..25786f6 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -26,6 +26,7 @@ import puremagic import requests from bs4 import BeautifulSoup +from charset_normalizer import from_path # Optional Transcription support try: @@ -161,9 +162,7 @@ def convert( elif "text/" not in content_type.lower(): return None - text_content = "" - with open(local_path, "rt", encoding="utf-8") as fh: - text_content = fh.read() + text_content = str(from_path(local_path).best()) return DocumentConverterResult( title=None, text_content=text_content, diff --git a/tests/test_files/test_mskanji.csv b/tests/test_files/test_mskanji.csv new file mode 100644 index 0000000..d67f5a3 --- /dev/null +++ b/tests/test_files/test_mskanji.csv @@ -0,0 +1,4 @@ +名前,年齢,住所 +佐藤太郎,30,東京 +三木英子,25,大阪 +髙橋淳,35,名古屋 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 94fd886..ac08820 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -87,6 +87,13 @@ "data:image/svg+xml,%3Csvg%20width%3D", ] +CSV_CP932_TEST_STRINGS = [ + "名前,年齢,住所", + "佐藤太郎,30,東京", + "三木英子,25,大阪", + "髙橋淳,35,名古屋", +] + @pytest.mark.skipif(
skip_remote, @@ -164,6 +171,12 @@ def test_markitdown_local() -> None: for test_string in SERP_TEST_STRINGS: assert test_string in text_content + ## Test non-UTF-8 encoding + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) + text_content = result.text_content.replace("\\", "") + for test_string in CSV_CP932_TEST_STRINGS: + assert test_string in text_content + @pytest.mark.skipif( skip_exiftool,