Add option to specify how many first bytes to consider when searching for content charsets in header
openzim · Jun 18, 2024 · f8fe4bc · f8fe4bc
1 parent 8f79546
commit f8fe4bc
Show file tree

Hide file tree

Showing 8 changed files with 77 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Add `--ignore-content-header-charsets` option to disable automatic retrieval of content charsets from content first bytes (#318)
+- Add `--content-header-bytes-length` option to specify how many first bytes to consider when searching for content charsets in header (#320)
 - Add `--ignore-http-header-charsets` option to disable automatic retrieval of content charsets from content HTTP `Content-Type` headers (#318)
 
 ### Changed

diff --git a/src/warc2zim/content_rewriting/generic.py b/src/warc2zim/content_rewriting/generic.py
@@ -64,6 +64,7 @@ def __init__(
         missing_zim_paths: set[ZimPath] | None,
         js_modules: set[ZimPath],
         charsets_to_try: list[str],
+        content_header_bytes_length: int,
         *,
         ignore_content_header_charsets: bool,
         ignore_http_header_charsets: bool,
@@ -83,6 +84,7 @@ def __init__(
         self.rewrite_mode = self.get_rewrite_mode(record, mimetype)
         self.js_modules = js_modules
         self.charsets_to_try = charsets_to_try
+        self.content_header_bytes_length = content_header_bytes_length
         self.ignore_content_header_charsets = ignore_content_header_charsets
         self.ignore_http_header_charsets = ignore_http_header_charsets
 
@@ -92,6 +94,7 @@ def content_str(self) -> str:
             self.content,
             self.encoding,
             self.charsets_to_try,
+            self.content_header_bytes_length,
             ignore_content_header_charsets=self.ignore_content_header_charsets,
             ignore_http_header_charsets=self.ignore_http_header_charsets,
         )

diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
@@ -195,6 +195,7 @@ def __init__(self, args):
         self.charsets_to_try: list[str] = [
             charset_to_try.strip() for charset_to_try in args.charsets_to_try.split(",")
         ]
+        self.content_header_bytes_length: int = int(args.content_header_bytes_length)
 
         # progress file handling
         self.stats_filename = (
@@ -753,6 +754,7 @@ def add_items_for_warc_record(self, record):
                 self.missing_zim_paths,
                 self.js_modules,
                 self.charsets_to_try,
+                self.content_header_bytes_length,
                 ignore_content_header_charsets=self.ignore_content_header_charsets,
                 ignore_http_header_charsets=self.ignore_http_header_charsets,
             )

diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py
@@ -34,6 +34,7 @@ def __init__(
         missing_zim_paths: set[ZimPath] | None,
         js_modules: set[ZimPath],
         charsets_to_try: list[str],
+        content_header_bytes_length: int,
         *,
         ignore_content_header_charsets: bool,
         ignore_http_header_charsets: bool,
@@ -49,6 +50,7 @@ def __init__(
             missing_zim_paths,
             js_modules,
             charsets_to_try,
+            content_header_bytes_length,
             ignore_content_header_charsets=ignore_content_header_charsets,
             ignore_http_header_charsets=ignore_http_header_charsets,
         ).rewrite(pre_head_template, post_head_template)

diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
@@ -126,6 +126,13 @@ def main(raw_args=None):
         default=False,
     )
 
+    parser.add_argument(
+        "--content-header-bytes-length",
+        help="How many bytes to consider when searching for content charsets in header",
+        type=int,
+        default=1024,
+    )
+
     parser.add_argument(
         "--ignore-http-header-charsets",
         help="Ignore the charsets specified in HTTP `Content-Type` headers, typically "

diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
@@ -128,6 +128,7 @@ def to_string(
     input_: str | bytes,
     http_encoding: str | None,
     charsets_to_try: list[str],
+    content_header_bytes_length: int,
     *,
     ignore_content_header_charsets: bool,
     ignore_http_header_charsets: bool,
@@ -166,7 +167,9 @@ def to_string(
     # Search for encoding from content first bytes based on regexp
     if not ignore_content_header_charsets:
         for encoding in ["ascii", "utf-16", "utf-32"]:
-            content_start = input_[:1024].decode(encoding, errors="replace")
+            content_start = input_[:content_header_bytes_length].decode(
+                encoding, errors="replace"
+            )
             if m := ENCODING_RE.search(content_start):
                 head_encoding = m.group("encoding")
                 return input_.decode(head_encoding, errors="replace")

diff --git a/tests/test_rewriting.py b/tests/test_rewriting.py
@@ -39,6 +39,7 @@ def generate_and_call(
             set(),
             set(),
             ["UTF-8", "ISO-8859-1"],
+            1024,
             ignore_http_header_charsets=False,
             ignore_content_header_charsets=False,
         ).rewrite(Template(""), Template(""))

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -68,6 +68,7 @@ def test_decode_http_header(simple_encoded_content):
             simple_encoded_content.encoded,
             simple_encoded_content.encoding,
             [],
+            1024,
             ignore_http_header_charsets=False,
             ignore_content_header_charsets=False,
         )
@@ -86,6 +87,7 @@ def test_decode_bad_http_header(simple_encoded_content):
             "latin1",
             # but we luckily have the proper "try-charset"
             [simple_encoded_content.encoding],
+            1024,
             # and we've disabled the use of HTTP header
             ignore_http_header_charsets=True,
             ignore_content_header_charsets=False,
@@ -115,6 +117,7 @@ def test_decode_html_header(declared_html_encoded_content):
             test_case.encoded,
             None,
             [],
+            1024,
             ignore_http_header_charsets=False,
             ignore_content_header_charsets=False,
         )
@@ -146,6 +149,7 @@ def test_decode_bad_html_header(badly_declared_html_encoded_content):
             None,
             # Indicate proper charset to use in try-charsets
             ["ISO-8859-1"],
+            1024,
             ignore_http_header_charsets=False,
             # Disable charset defined in content first bytes
             ignore_content_header_charsets=True,
@@ -159,6 +163,7 @@ def test_decode_str(content, encoding):
         content,
         encoding,
         [],
+        1024,
         ignore_http_header_charsets=False,
         ignore_content_header_charsets=False,
     )
@@ -176,6 +181,7 @@ def test_binary_content():
         content,
         "UTF-8",
         [],
+        1024,
         ignore_http_header_charsets=False,
         ignore_content_header_charsets=False,
     )
@@ -189,6 +195,7 @@ def test_single_bad_character():
         content,
         "utf-8-sig",
         [],
+        1024,
         ignore_http_header_charsets=False,
         ignore_content_header_charsets=False,
     )
@@ -204,6 +211,7 @@ def test_decode_charset_to_try(simple_encoded_content):
             simple_encoded_content.encoded,
             None,
             [simple_encoded_content.encoding],
+            1024,
             ignore_http_header_charsets=False,
             ignore_content_header_charsets=False,
         )
@@ -217,6 +225,7 @@ def test_decode_weird_encoding_not_declared_not_in_try_list():
             "Latin1 contént".encode("latin1"),
             None,
             ["UTF-8"],
+            1024,
             ignore_http_header_charsets=False,
             ignore_content_header_charsets=False,
         )
@@ -229,6 +238,7 @@ def test_decode_weird_encoding_not_declared_in_try_list():
             content.encode("latin1"),
             None,
             ["UTF-8", "latin1"],
+            1024,
             ignore_http_header_charsets=False,
             ignore_content_header_charsets=False,
         )
@@ -272,8 +282,55 @@ def test_decode_files(testdata: CharsetsTestData):
         (Path(__file__).parent / "encodings" / testdata.filename).read_bytes(),
         testdata.http_charset,
         ["UTF-8", "latin1"],
+        1024,
         ignore_http_header_charsets=False,
         ignore_content_header_charsets=False,
     )
     for expected_string in testdata.expected_strings:
         assert expected_string in result
+
+
+def test_decode_charset_too_far_away_without_fallback():
+    """Raise when the charset declaration is cut off and nothing else applies."""
+    # Only 24 bytes are inspected, which ends inside the `latin1` charset value.
+    html = '<html><meta charset="latin1"><body>content</body></html>'
+    flags = {
+        "ignore_http_header_charsets": False,
+        "ignore_content_header_charsets": False,
+    }
+    payload = html.encode("latin1")
+    with pytest.raises(ValueError, match="No suitable charset"):
+        to_string(payload, None, [], 24, **flags)
+
+
+def test_decode_charset_too_far_away_with_fallback():
+    """Decode via the charsets-to-try list when the declaration is cut off."""
+    # Only 24 bytes are inspected, which ends inside the `latin1` charset value.
+    html = '<html><meta charset="latin1"><body>content</body></html>'
+    decoded = to_string(
+        html.encode("latin1"),
+        None,
+        ["latin1"],
+        24,
+        ignore_http_header_charsets=False,
+        ignore_content_header_charsets=False,
+    )
+    assert decoded == html
+
+
+def test_decode_charset_far_away():
+    """Decode with a charset declared after 1024 filler bytes (1200 inspected)."""
+    filler = "-" * 1024
+    html = (
+        f'<html>{filler}<meta charset="latin1">'
+        "<body>content</body></html>"
+    )
+    decoded = to_string(
+        html.encode("latin1"),
+        None,
+        [],
+        1200,
+        ignore_http_header_charsets=False,
+        ignore_content_header_charsets=False,
+    )
+    assert decoded == html