Fix issue 289, add function is_binary, add explicit support py 3.12 (#306)

Release 3.2.0
jawah · Jul 7, 2023 · 782885e · 782885e
1 parent 1b0fb5c
commit 782885e
Show file tree

Hide file tree

Showing 13 changed files with 155 additions and 14 deletions.
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -12,6 +12,7 @@ permissions:
 
 jobs:
   pre_flight_check:
+    name: Preflight Checks
     uses: ./.github/workflows/ci.yml
 
   universal-wheel:
@@ -127,7 +128,7 @@ jobs:
       id-token: write
       contents: write
     with:
-      subject-base64: ${{ needs.checksum.outputs.hashes }}
+      base64-subjects: ${{ needs.checksum.outputs.hashes }}
       upload-assets: true
 
   deploy:

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -174,7 +174,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
+        python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]  # , "3.12-dev"
         os: [ ubuntu-latest, macos-latest, windows-latest ]
     env:
       PYTHONIOENCODING: utf8  # only needed for Windows (console IO output encoding)

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,10 +2,19 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
-## [3.1.1.dev0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-05-??)
+## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
 
 ### Changed
 - Typehint for function `from_path` no longer enforce `PathLike` as its first argument
+- Minor improvement over the global detection reliability
+
+### Added
+- Introduce function `is_binary` that relies on the main detection capabilities and is optimized to detect binaries
+- Propagate the `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` to allow deeper control over the detection (default True)
+- Explicit support for Python 3.12
+
+### Fixed
+- Edge case detection failure where a file would contain a 'very-long' camel-cased word (Issue #289)
 
 ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
 

diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py
@@ -21,7 +21,7 @@
 """
 import logging
 
-from .api import from_bytes, from_fp, from_path
+from .api import from_bytes, from_fp, from_path, is_binary
 from .legacy import detect
 from .models import CharsetMatch, CharsetMatches
 from .utils import set_logging_handler
@@ -31,6 +31,7 @@
     "from_fp",
     "from_path",
     "from_bytes",
+    "is_binary",
     "detect",
     "CharsetMatch",
     "CharsetMatches",

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
@@ -31,7 +31,7 @@
 
 
 def from_bytes(
-    sequences: bytes,
+    sequences: Union[bytes, bytearray],
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
@@ -40,6 +40,7 @@ def from_bytes(
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
+    enable_fallback: bool = True,
 ) -> CharsetMatches:
     """
     Given a raw bytes sequence, return the best possibles charset usable to render str objects.
@@ -361,7 +362,8 @@ def from_bytes(
             )
             # Preparing those fallbacks in case we got nothing.
             if (
-                encoding_iana in ["ascii", "utf_8", specified_encoding]
+                enable_fallback
+                and encoding_iana in ["ascii", "utf_8", specified_encoding]
                 and not lazy_str_hard_failure
             ):
                 fallback_entry = CharsetMatch(
@@ -507,6 +509,7 @@ def from_fp(
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
+    enable_fallback: bool = True,
 ) -> CharsetMatches:
     """
     Same thing than the function from_bytes but using a file pointer that is already ready.
@@ -522,6 +525,7 @@ def from_fp(
         preemptive_behaviour,
         explain,
         language_threshold,
+        enable_fallback,
     )
 
 
@@ -535,6 +539,7 @@ def from_path(
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
+    enable_fallback: bool = True,
 ) -> CharsetMatches:
     """
     Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
@@ -551,4 +556,71 @@ def from_path(
             preemptive_behaviour,
             explain,
             language_threshold,
+            enable_fallback,
         )
+
+
+def is_binary(
+    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = False,
+) -> bool:
+    """
+    Detect if the given input (file, bytes, or path) points to a binary file, i.e. content that is not text.
+    Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
+    are disabled, to be stricter around payloads that are ASCII-compatible but unlikely to be a string.
+    """
+    if isinstance(fp_or_path_or_payload, (str, PathLike)):
+        guesses = from_path(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    elif isinstance(
+        fp_or_path_or_payload,
+        (
+            bytes,
+            bytearray,
+        ),
+    ):
+        guesses = from_bytes(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    else:
+        guesses = from_fp(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+
+    return not guesses
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
@@ -294,14 +294,25 @@ def feed(self, character: str) -> None:
             if buffer_length >= 4:
                 if self._buffer_accent_count / buffer_length > 0.34:
                     self._is_current_word_bad = True
-                # Word/Buffer ending with a upper case accentuated letter are so rare,
+                # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                 if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
             if buffer_length >= 24 and self._foreign_long_watch:
-                self._foreign_long_count += 1
-                self._is_current_word_bad = True
+                camel_case_dst = [
+                    i
+                    for c, i in zip(self._buffer, range(0, buffer_length))
+                    if c.isupper()
+                ]
+                probable_camel_cased: bool = False
+
+                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
+                    probable_camel_cased = True
+
+                if not probable_camel_cased:
+                    self._foreign_long_count += 1
+                    self._is_current_word_bad = True
 
             if self._is_current_word_bad:
                 self._bad_word_count += 1

diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
@@ -120,12 +120,12 @@ def is_emoticon(character: str) -> bool:
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_separator(character: str) -> bool:
-    if character.isspace() or character in {"｜", "+", ",", ";", "<", ">"}:
+    if character.isspace() or character in {"｜", "+", "<", ">"}:
         return True
 
     character_category: str = unicodedata.category(character)
 
-    return "Z" in character_category
+    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.1.1.dev0"
+__version__ = "3.2.0"
 VERSION = __version__.split(".")
diff --git a/docs/api.rst b/docs/api.rst
@@ -13,6 +13,7 @@ Those functions are publicly exposed and are protected through our BC guarantee.
 .. autofunction:: from_bytes
 .. autofunction:: from_fp
 .. autofunction:: from_path
+.. autofunction:: is_binary
 
 .. autoclass:: charset_normalizer.models.CharsetMatches
     :inherited-members:
@@ -100,5 +101,3 @@ Some reusable functions used across the project. We do not guarantee the BC in t
 
 
 .. class:: os.PathLike
-
-   Used as a generic way to accept AnyStr for paths.
diff --git a/docs/index.rst b/docs/index.rst
@@ -51,6 +51,7 @@ Features
 - Transpose any encoded content to Unicode the best we can.
 - Detect spoken language in text.
 - Ship with a great CLI.
+- Also, detect binaries.
 
 Start Guide
 -----------

diff --git a/docs/user/miscellaneous.rst b/docs/user/miscellaneous.rst
@@ -44,3 +44,21 @@ On `DEBUG` only one entry will be observed and that is about the detection resul
 
 Then regarding the others log entries, they will be pushed as `Level 5`. Commonly known as TRACE level, but we do
 not register it globally.
+
+
+Detect binaries
+---------------
+
+This package offers a neat way to detect files that can be considered 'binaries',
+meaning that they are not likely to be text files.
+
+ ::
+
+    from charset_normalizer import is_binary
+
+    # It accepts a path, bytes, or even a file pointer.
+    result = is_binary("./my-file.ext")
+
+    # This should print 'True' or 'False'
+    print(result)
+
diff --git a/setup.cfg b/setup.cfg
@@ -24,6 +24,7 @@ classifiers =
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
     Programming Language :: Python :: Implementation :: PyPy
     Topic :: Text Processing :: Linguistic
     Topic :: Utilities

diff --git a/tests/test_isbinary.py b/tests/test_isbinary.py
@@ -0,0 +1,28 @@
+import pytest
+import typing
+from io import BytesIO
+from base64 import b64decode
+from charset_normalizer import is_binary
+from os import path, pardir
+
+DIR_PATH = path.join(
+    path.dirname(path.realpath(__file__)),
+    pardir
+)
+
+
+@pytest.mark.parametrize(
+    "raw, expected",
+    [
+        (b'\x00\x5f\x2f\xff'*50, True),
+        (b64decode("R0lGODlhAQABAAAAACw="), True),
+        (BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True),
+        ('sample-polish.txt', False),
+        ('sample-arabic.txt', False)
+    ]
+)
+def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None:
+    if isinstance(raw, str):
+        raw = DIR_PATH + "/data/{}".format(raw)
+
+    assert is_binary(raw) is expected