🔧 improve detector based on case 537 (#538)
Ousret authored Oct 2, 2024
1 parent 1b51a2c commit b6c460e
Showing 3 changed files with 34 additions and 14 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 ### Fixed
 - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
-- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407)
+- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
 - Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)

 ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
44 changes: 32 additions & 12 deletions charset_normalizer/api.py
@@ -159,6 +159,8 @@ def from_bytes(

     results: CharsetMatches = CharsetMatches()

+    early_stop_results: CharsetMatches = CharsetMatches()
+
     sig_encoding, sig_payload = identify_sig_or_bom(sequences)

     if sig_encoding is not None:
@@ -431,29 +433,47 @@ def from_bytes(
                 ),
             )

-        results.append(
-            CharsetMatch(
-                sequences,
-                encoding_iana,
-                mean_mess_ratio,
-                bom_or_sig_available,
-                cd_ratios_merged,
-                decoded_payload,
-                preemptive_declaration=specified_encoding,
-            )
+        current_match = CharsetMatch(
+            sequences,
+            encoding_iana,
+            mean_mess_ratio,
+            bom_or_sig_available,
+            cd_ratios_merged,
+            (
+                decoded_payload
+                if (
+                    is_too_large_sequence is False
+                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                )
+                else None
+            ),
+            preemptive_declaration=specified_encoding,
         )

+        results.append(current_match)
+
         if (
             encoding_iana in [specified_encoding, "ascii", "utf_8"]
             and mean_mess_ratio < 0.1
         ):
+            early_stop_results.append(current_match)
+
+        if (
+            len(early_stop_results)
+            and (specified_encoding is None or specified_encoding in tested)
+            and "ascii" in tested
+            and "utf_8" in tested
+        ):
+            probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
             logger.debug(
-                "Encoding detection: %s is most likely the one.", encoding_iana
+                "Encoding detection: %s is most likely the one.",
+                probable_result.encoding,
             )
             if explain:
                 logger.removeHandler(explain_handler)
                 logger.setLevel(previous_logger_level)
-            return CharsetMatches([results[encoding_iana]])
+
+            return CharsetMatches([probable_result])

         if encoding_iana == sig_encoding:
             logger.debug(
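For context, the new early-stop only fires once both ascii and utf_8 have been tested (plus any declared encoding) and the winning candidate's mess ratio stays below 0.1. A minimal sketch of how a caller would exercise this fast path through the library's public API; the payload itself is illustrative:

```python
from charset_normalizer import from_bytes

# An ordinary UTF-8 payload should take the early-stop branch introduced
# above: utf_8 decodes cleanly and its mess ratio stays well under 0.1,
# so from_bytes() returns a CharsetMatches built from that single candidate.
payload = "Déjà vu: plain UTF-8 text with a few accents.".encode("utf_8")

best_guess = from_bytes(payload).best()

if best_guess is not None:
    print(best_guess.encoding)  # expected: "utf_8"
    print(best_guess.chaos)     # mess ratio, expected to be below 0.1
```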
2 changes: 1 addition & 1 deletion charset_normalizer/models.py
@@ -285,7 +285,7 @@ def append(self, item: CharsetMatch) -> None:
                 )
             )
         # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
-        if len(item.raw) <= TOO_BIG_SEQUENCE:
+        if len(item.raw) < TOO_BIG_SEQUENCE:
             for match in self._results:
                 if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                     match.add_submatch(item)
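The models.py change is a one-character boundary fix: a payload of exactly TOO_BIG_SEQUENCE bytes is now also excluded from submatch factoring. A small sketch of the boundary, using the library's TOO_BIG_SEQUENCE constant (the payload length here is illustrative):

```python
from charset_normalizer.constant import TOO_BIG_SEQUENCE

# A raw payload whose length sits exactly on the boundary.
raw_length = TOO_BIG_SEQUENCE

# Old condition: submatch factoring still ran at the exact boundary.
print(raw_length <= TOO_BIG_SEQUENCE)  # True

# New condition: the boundary case now skips factoring, conserving RAM.
print(raw_length < TOO_BIG_SEQUENCE)   # False
```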
