Skip to content

Commit

Permalink
Add option to specify how many first bytes to consider when searching…
Browse files Browse the repository at this point in the history
… for content charsets in header
  • Loading branch information
benoit74 committed Jun 18, 2024
1 parent 8f79546 commit f8fe4bc
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add `--ignore-content-header-charsets` option to disable automatic retrieval of content charsets from content first bytes (#318)
- Add `--content-header-bytes-length` option to specify how many leading bytes of the content are considered when searching for a charset declaration in the content header (#320)
- Add `--ignore-http-header-charsets` option to disable automatic retrieval of content charsets from content HTTP `Content-Type` headers (#318)

### Changed
Expand Down
3 changes: 3 additions & 0 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
missing_zim_paths: set[ZimPath] | None,
js_modules: set[ZimPath],
charsets_to_try: list[str],
content_header_bytes_length: int,
*,
ignore_content_header_charsets: bool,
ignore_http_header_charsets: bool,
Expand All @@ -83,6 +84,7 @@ def __init__(
self.rewrite_mode = self.get_rewrite_mode(record, mimetype)
self.js_modules = js_modules
self.charsets_to_try = charsets_to_try
self.content_header_bytes_length = content_header_bytes_length
self.ignore_content_header_charsets = ignore_content_header_charsets
self.ignore_http_header_charsets = ignore_http_header_charsets

Expand All @@ -92,6 +94,7 @@ def content_str(self) -> str:
self.content,
self.encoding,
self.charsets_to_try,
self.content_header_bytes_length,
ignore_content_header_charsets=self.ignore_content_header_charsets,
ignore_http_header_charsets=self.ignore_http_header_charsets,
)
Expand Down
2 changes: 2 additions & 0 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def __init__(self, args):
self.charsets_to_try: list[str] = [
charset_to_try.strip() for charset_to_try in args.charsets_to_try.split(",")
]
self.content_header_bytes_length: int = int(args.content_header_bytes_length)

# progress file handling
self.stats_filename = (
Expand Down Expand Up @@ -753,6 +754,7 @@ def add_items_for_warc_record(self, record):
self.missing_zim_paths,
self.js_modules,
self.charsets_to_try,
self.content_header_bytes_length,
ignore_content_header_charsets=self.ignore_content_header_charsets,
ignore_http_header_charsets=self.ignore_http_header_charsets,
)
Expand Down
2 changes: 2 additions & 0 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def __init__(
missing_zim_paths: set[ZimPath] | None,
js_modules: set[ZimPath],
charsets_to_try: list[str],
content_header_bytes_length: int,
*,
ignore_content_header_charsets: bool,
ignore_http_header_charsets: bool,
Expand All @@ -49,6 +50,7 @@ def __init__(
missing_zim_paths,
js_modules,
charsets_to_try,
content_header_bytes_length,
ignore_content_header_charsets=ignore_content_header_charsets,
ignore_http_header_charsets=ignore_http_header_charsets,
).rewrite(pre_head_template, post_head_template)
Expand Down
7 changes: 7 additions & 0 deletions src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ def main(raw_args=None):
default=False,
)

parser.add_argument(
"--content-header-bytes-length",
help="How many bytes to consider when searching for content charsets in header",
type=int,
default=1024,
)

parser.add_argument(
"--ignore-http-header-charsets",
help="Ignore the charsets specified in HTTP `Content-Type` headers, typically "
Expand Down
5 changes: 4 additions & 1 deletion src/warc2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def to_string(
input_: str | bytes,
http_encoding: str | None,
charsets_to_try: list[str],
content_header_bytes_length: int,
*,
ignore_content_header_charsets: bool,
ignore_http_header_charsets: bool,
Expand Down Expand Up @@ -166,7 +167,9 @@ def to_string(
# Search for encoding from content first bytes based on regexp
if not ignore_content_header_charsets:
for encoding in ["ascii", "utf-16", "utf-32"]:
content_start = input_[:1024].decode(encoding, errors="replace")
content_start = input_[:content_header_bytes_length].decode(
encoding, errors="replace"
)
if m := ENCODING_RE.search(content_start):
head_encoding = m.group("encoding")
return input_.decode(head_encoding, errors="replace")
Expand Down
1 change: 1 addition & 0 deletions tests/test_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def generate_and_call(
set(),
set(),
["UTF-8", "ISO-8859-1"],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
).rewrite(Template(""), Template(""))
Expand Down
57 changes: 57 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def test_decode_http_header(simple_encoded_content):
simple_encoded_content.encoded,
simple_encoded_content.encoding,
[],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand All @@ -86,6 +87,7 @@ def test_decode_bad_http_header(simple_encoded_content):
"latin1",
# but we luckily have the proper "try-charset"
[simple_encoded_content.encoding],
1024,
# and we've disabled the use of HTTP header
ignore_http_header_charsets=True,
ignore_content_header_charsets=False,
Expand Down Expand Up @@ -115,6 +117,7 @@ def test_decode_html_header(declared_html_encoded_content):
test_case.encoded,
None,
[],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand Down Expand Up @@ -146,6 +149,7 @@ def test_decode_bad_html_header(badly_declared_html_encoded_content):
None,
# Indicate proper charset to use in try-charsets
["ISO-8859-1"],
1024,
ignore_http_header_charsets=False,
# Disable charset defined in content first bytes
ignore_content_header_charsets=True,
Expand All @@ -159,6 +163,7 @@ def test_decode_str(content, encoding):
content,
encoding,
[],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand All @@ -176,6 +181,7 @@ def test_binary_content():
content,
"UTF-8",
[],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand All @@ -189,6 +195,7 @@ def test_single_bad_character():
content,
"utf-8-sig",
[],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand All @@ -204,6 +211,7 @@ def test_decode_charset_to_try(simple_encoded_content):
simple_encoded_content.encoded,
None,
[simple_encoded_content.encoding],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand All @@ -217,6 +225,7 @@ def test_decode_weird_encoding_not_declared_not_in_try_list():
"Latin1 contént".encode("latin1"),
None,
["UTF-8"],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand All @@ -229,6 +238,7 @@ def test_decode_weird_encoding_not_declared_in_try_list():
content.encode("latin1"),
None,
["UTF-8", "latin1"],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
Expand Down Expand Up @@ -272,8 +282,55 @@ def test_decode_files(testdata: CharsetsTestData):
(Path(__file__).parent / "encodings" / testdata.filename).read_bytes(),
testdata.http_charset,
["UTF-8", "latin1"],
1024,
ignore_http_header_charsets=False,
ignore_content_header_charsets=False,
)
for expected_string in testdata.expected_strings:
assert expected_string in result


def test_decode_charset_too_far_away_without_fallback():
    """Raise when the charset declaration is out of reach and no fallback is set.

    Only the first 24 bytes of the document are searched, which stops in the
    middle of the ``<meta charset="latin1">`` declaration, and the list of
    charsets to try is empty.
    """
    html = '<html><meta charset="latin1"><body>content</body></html>'
    raw = html.encode("latin1")
    with pytest.raises(ValueError, match="No suitable charset"):
        to_string(
            raw,
            None,  # no charset coming from the HTTP headers
            [],  # no fallback charset either
            24,  # search window ends before the declaration is complete
            ignore_http_header_charsets=False,
            ignore_content_header_charsets=False,
        )


def test_decode_charset_too_far_away_with_fallback():
    """Decode with a charset to try when the declaration is out of reach.

    The 24-byte search window stops in the middle of the
    ``<meta charset="latin1">`` declaration, but ``latin1`` is listed in the
    charsets to try, so the content is expected to round-trip unchanged.
    """
    html = '<html><meta charset="latin1"><body>content</body></html>'
    decoded = to_string(
        html.encode("latin1"),
        None,  # no charset coming from the HTTP headers
        ["latin1"],  # fallback charset
        24,  # search window ends before the declaration is complete
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert decoded == html


def test_decode_charset_far_away():
    """Find a charset declared past the first 1024 bytes when the window allows it.

    1024 filler characters push the ``<meta charset="latin1">`` declaration
    beyond the first 1024 bytes; a 1200-byte search window still covers it,
    so no HTTP charset and no charset to try are needed.
    """
    padding = "-" * 1024
    html = f'<html>{padding}<meta charset="latin1"><body>content</body></html>'
    decoded = to_string(
        html.encode("latin1"),
        None,  # no charset coming from the HTTP headers
        [],  # no fallback charset
        1200,  # search window wide enough to include the declaration
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert decoded == html

0 comments on commit f8fe4bc

Please sign in to comment.