From b6c460e09f53e463920ca23fd924cf5054b0143b Mon Sep 17 00:00:00 2001
From: "TAHRI Ahmed R."
Date: Wed, 2 Oct 2024 09:30:34 +0200
Subject: [PATCH] :wrench: improve detector based on case 537 (#538)

---
 CHANGELOG.md                 |  2 +-
 charset_normalizer/api.py    | 44 ++++++++++++++++++++++++++----------
 charset_normalizer/models.py |  2 +-
 3 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a8a53576..eba4d0dd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
-- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407)
+- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
 - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
 
 ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index a51ee35e..70a90182 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -159,6 +159,8 @@ def from_bytes(
 
     results: CharsetMatches = CharsetMatches()
 
+    early_stop_results: CharsetMatches = CharsetMatches()
+
     sig_encoding, sig_payload = identify_sig_or_bom(sequences)
 
     if sig_encoding is not None:
@@ -431,29 +433,47 @@ def from_bytes(
                 ),
             )
 
-            results.append(
-                CharsetMatch(
-                    sequences,
-                    encoding_iana,
-                    mean_mess_ratio,
-                    bom_or_sig_available,
-                    cd_ratios_merged,
-                    decoded_payload,
-                    preemptive_declaration=specified_encoding,
-                )
+            current_match = CharsetMatch(
+                sequences,
+                encoding_iana,
+                mean_mess_ratio,
+                bom_or_sig_available,
+                cd_ratios_merged,
+                (
+                    decoded_payload
+                    if (
+                        is_too_large_sequence is False
+                        or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                    )
+                    else None
+                ),
+                preemptive_declaration=specified_encoding,
             )
 
+            results.append(current_match)
+
             if (
                 encoding_iana in [specified_encoding, "ascii", "utf_8"]
                 and mean_mess_ratio < 0.1
             ):
+                early_stop_results.append(current_match)
+
+            if (
+                len(early_stop_results)
+                and (specified_encoding is None or specified_encoding in tested)
+                and "ascii" in tested
+                and "utf_8" in tested
+            ):
+                probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
                 logger.debug(
-                    "Encoding detection: %s is most likely the one.", encoding_iana
+                    "Encoding detection: %s is most likely the one.",
+                    probable_result.encoding,
                 )
                 if explain:
                     logger.removeHandler(explain_handler)
                     logger.setLevel(previous_logger_level)
-                return CharsetMatches([results[encoding_iana]])
+
+                return CharsetMatches([probable_result])
 
             if encoding_iana == sig_encoding:
                 logger.debug(
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index ee5681ca..6f6b86b3 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -285,7 +285,7 @@ def append(self, item: CharsetMatch) -> None:
                 )
             )
         # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
-        if len(item.raw) <= TOO_BIG_SEQUENCE:
+        if len(item.raw) < TOO_BIG_SEQUENCE:
             for match in self._results:
                 if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                     match.add_submatch(item)
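
The api.py change above collects plausible ascii/utf_8/declared-encoding candidates into early_stop_results and returns the best of them once all three have been tested. A minimal sketch of how that path is exercised through the public API; the sample payload is an illustrative stand-in, while from_bytes, CharsetMatches.best() and CharsetMatch.encoding are the names the patch itself uses.

    # Exercising the early-stop path added in api.py.
    # The payload below is a made-up example, not from the patch.
    from charset_normalizer import from_bytes

    payload = "Bonjour, ceci est un test. Déjà vu !".encode("utf_8")

    matches = from_bytes(payload)  # returns a CharsetMatches container
    best = matches.best()          # may be None if nothing matched

    if best is not None:
        # With a low mess ratio, the detector can now return early once
        # ascii, utf_8 and any declared encoding have all been tried.
        print(best.encoding)  # e.g. "utf_8"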
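
The patched CharsetMatch construction also changes what gets cached: for oversized inputs, the decoded string is only kept when the candidate is the declared encoding, ascii or utf_8. A sketch of that retention rule as a standalone function; the parameter names mirror the patch, the helper name is hypothetical.

    # Retention rule for the decoded payload, mirrored from the patched
    # CharsetMatch(...) call in api.py. retained_payload() is a hypothetical
    # helper written only to isolate the condition.
    from typing import Optional

    def retained_payload(
        decoded_payload: Optional[str],
        is_too_large_sequence: bool,
        encoding_iana: str,
        specified_encoding: Optional[str],
    ) -> Optional[str]:
        # Keep the decoded string only when the input is small enough or the
        # candidate is likely the final answer; drop it otherwise so huge
        # inputs do not pin a decoded copy per candidate in memory.
        if is_too_large_sequence is False or encoding_iana in [
            specified_encoding,
            "ascii",
            "utf_8",
        ]:
            return decoded_payload
        return None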
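
Finally, the models.py change tightens a boundary: a payload of exactly TOO_BIG_SEQUENCE bytes no longer goes through submatch factoring. A small sketch of the patched condition, assuming TOO_BIG_SEQUENCE is importable from charset_normalizer.constant; should_factor_submatches() is a hypothetical helper, not part of the library.

    # The boundary tightened in CharsetMatches.append(); previously `<=`.
    from charset_normalizer.constant import TOO_BIG_SEQUENCE

    def should_factor_submatches(raw: bytes) -> bool:
        # Submatch factoring is now skipped for payloads of exactly
        # TOO_BIG_SEQUENCE bytes as well, conserving RAM.
        return len(raw) < TOO_BIG_SEQUENCE

    print(should_factor_submatches(b"x" * TOO_BIG_SEQUENCE))        # False
    print(should_factor_submatches(b"x" * (TOO_BIG_SEQUENCE - 1)))  # True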