From 159678404e6de586d9088c25da13f38efbd8fde9 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
Date: Thu, 19 Oct 2023 07:52:13 +0200
Subject: [PATCH] :sparkle: Improve the detection around some cases

Closes #365, closes #357, closes #356
---
 CHANGELOG.md                  |  8 +++++++-
 bin/coverage.py               |  4 +++-
 charset_normalizer/md.py      | 13 ++++++++-----
 charset_normalizer/utils.py   |  4 ++--
 charset_normalizer/version.py |  2 +-
 5 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2898af0..eec7d1cb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-??)
+
+### Changed
+- Optional mypyc compilation upgraded to version 1.6.0 for Python >= 3.8
+- Improved the general detection reliability based on reports from the community
+
 ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
 
 ### Added
@@ -14,7 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
-- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.7
+- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
 
 ### Fixed
 - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
diff --git a/bin/coverage.py b/bin/coverage.py
index 94e058cf..e5f07bd5 100644
--- a/bin/coverage.py
+++ b/bin/coverage.py
@@ -5,7 +5,7 @@
 from typing import List
 import argparse
 
-from charset_normalizer import from_path
+from charset_normalizer import from_path, __version__
 from charset_normalizer.utils import iana_name
 
 from os import sep
@@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
         print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
         exit(1)
 
+    print(f"> using charset-normalizer {__version__}")
+
     success_count = 0
     total_count = 0
 
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
index a6d9350c..103dfdd6 100644
--- a/charset_normalizer/md.py
+++ b/charset_normalizer/md.py
@@ -233,16 +233,13 @@ def reset(self) -> None:  # pragma: no cover
 
     @property
     def ratio(self) -> float:
-        if self._character_count == 0:
+        if self._character_count <= 24:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
             self._suspicious_successive_range_count * 2
         ) / self._character_count
 
-        if ratio_of_suspicious_range_usage < 0.1:
-            return 0.0
-
         return ratio_of_suspicious_range_usage
 
 
@@ -295,7 +292,11 @@ def feed(self, character: str) -> None:
                     self._is_current_word_bad = True
                 # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
+                if (
+                    is_accentuated(self._buffer[-1])
+                    and self._buffer[-1].isupper()
+                    and all(_.isupper() for _ in self._buffer) is False
+                ):
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
             if buffer_length >= 24 and self._foreign_long_watch:
@@ -521,6 +522,8 @@ def is_suspiciously_successive_range(
             return False
         if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
             return False
+        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+            return False
 
     return True
 
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 45a402e4..b5ee8459 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
     if character_range is None:
         return False
 
-    return "Forms" in character_range
+    return "Forms" in character_range and character_category != "Lo"
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -106,7 +106,7 @@ def is_emoticon(character: str) -> bool:
     if character_range is None:
         return False
 
-    return "Emoticons" in character_range
+    return "Emoticons" in character_range or "Pictographs" in character_range
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index db1ff57a..83683f4c 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.3.0"
+__version__ = "3.3.1"
 VERSION = __version__.split(".")