Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🔧 improve the detector general reliability #532

Merged
merged 1 commit into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-03-??)
## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??)

### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.

### Changed
- Optional mypyc compilation upgraded to version 1.9.0 for Python >= 3.8
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407)

## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)

Expand Down
16 changes: 10 additions & 6 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,16 +221,20 @@ def from_bytes(
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)],
(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)]
),
encoding=encoding_iana,
)
else:
decoded_payload = str(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :],
(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :]
),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
Expand Down
2 changes: 2 additions & 0 deletions charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,8 @@
"|",
'"',
"-",
"(",
")",
}


Expand Down
19 changes: 16 additions & 3 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
if self._character_count <= 24:
if self._character_count <= 13:
return 0.0

ratio_of_suspicious_range_usage: float = (
Expand All @@ -260,6 +260,7 @@ def __init__(self) -> None:

self._buffer: str = ""
self._buffer_accent_count: int = 0
self._buffer_glyph_count: int = 0

def eligible(self, character: str) -> bool:
return True
Expand All @@ -279,6 +280,14 @@ def feed(self, character: str) -> None:
and is_thai(character) is False
):
self._foreign_long_watch = True
if (
is_cjk(character)
or is_hangul(character)
or is_katakana(character)
or is_hiragana(character)
or is_thai(character)
):
self._buffer_glyph_count += 1
return
if not self._buffer:
return
Expand All @@ -291,17 +300,20 @@ def feed(self, character: str) -> None:
self._character_count += buffer_length

if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
if (
elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
elif self._buffer_glyph_count == 1:
self._is_current_word_bad = True
self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
Expand All @@ -325,6 +337,7 @@ def feed(self, character: str) -> None:
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "3.3.2"
__version__ = "3.3.3"
VERSION = __version__.split(".")
Loading