From 782885e789f0e97fbda796255a3bac5ec4cd656f Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Fri, 7 Jul 2023 18:54:04 +0100 Subject: [PATCH] Fix issue 289, add function is_binary, add explicit support py 3.12 (#306) Release 3.2.0 --- .github/workflows/cd.yml | 3 +- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 11 ++++- charset_normalizer/__init__.py | 3 +- charset_normalizer/api.py | 76 +++++++++++++++++++++++++++++++++- charset_normalizer/md.py | 17 ++++++-- charset_normalizer/utils.py | 4 +- charset_normalizer/version.py | 2 +- docs/api.rst | 3 +- docs/index.rst | 1 + docs/user/miscellaneous.rst | 18 ++++++++ setup.cfg | 1 + tests/test_isbinary.py | 28 +++++++++++++ 13 files changed, 155 insertions(+), 14 deletions(-) create mode 100644 tests/test_isbinary.py diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index e9adecd0..48d59c89 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -12,6 +12,7 @@ permissions: jobs: pre_flight_check: + name: Preflight Checks uses: ./.github/workflows/ci.yml universal-wheel: @@ -127,7 +128,7 @@ jobs: id-token: write contents: write with: - subject-base64: ${{ needs.checksum.outputs.hashes }} + base64-subjects: ${{ needs.checksum.outputs.hashes }} upload-assets: true deploy: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4abb414d..f3b24cb3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -174,7 +174,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] + python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] # , "3.12-dev" os: [ ubuntu-latest, macos-latest, windows-latest ] env: PYTHONIOENCODING: utf8 # only needed for Windows (console IO output encoding) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf458c04..935bc4d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,19 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [3.1.1.dev0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-05-??) 
+## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
 
 ### Changed
 - Typehint for function `from_path` no longer enforce `PathLike` as its first argument
+- Minor improvement to the overall detection reliability
+
+### Added
+- Introduce function `is_binary`, built on the main detection capabilities and optimized to detect binaries
+- Propagate the `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default: True)
+- Explicit support for Python 3.12
+
+### Fixed
+- Edge case detection failure where a file contains a very long camel-cased word (Issue #289)
 
 ## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
 
diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py
index ebb5da89..55991fc3 100644
--- a/charset_normalizer/__init__.py
+++ b/charset_normalizer/__init__.py
@@ -21,7 +21,7 @@
 """
 import logging
 
-from .api import from_bytes, from_fp, from_path
+from .api import from_bytes, from_fp, from_path, is_binary
 from .legacy import detect
 from .models import CharsetMatch, CharsetMatches
 from .utils import set_logging_handler
@@ -31,6 +31,7 @@
     "from_fp",
     "from_path",
     "from_bytes",
+    "is_binary",
     "detect",
     "CharsetMatch",
     "CharsetMatches",
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index cf144e46..0ba08e3a 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -31,7 +31,7 @@
 
 
 def from_bytes(
-    sequences: bytes,
+    sequences: Union[bytes, bytearray],
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
@@ -40,6 +40,7 @@ def from_bytes(
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
+    enable_fallback: bool = True,
 ) -> CharsetMatches:
     """
     Given a raw bytes sequence, return the best possibles charset usable to render str objects.
@@ -361,7 +362,8 @@ def from_bytes(
             )
         # Preparing those fallbacks in case we got nothing.
         if (
-            encoding_iana in ["ascii", "utf_8", specified_encoding]
+            enable_fallback
+            and encoding_iana in ["ascii", "utf_8", specified_encoding]
             and not lazy_str_hard_failure
         ):
             fallback_entry = CharsetMatch(
@@ -507,6 +509,7 @@ def from_fp(
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
+    enable_fallback: bool = True,
 ) -> CharsetMatches:
     """
     Same thing than the function from_bytes but using a file pointer that is already ready.
@@ -522,6 +525,7 @@ def from_fp(
         preemptive_behaviour,
         explain,
         language_threshold,
+        enable_fallback,
     )
 
 
@@ -535,6 +539,7 @@ def from_path(
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
+    enable_fallback: bool = True,
 ) -> CharsetMatches:
     """
     Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
@@ -551,4 +556,71 @@ def from_path(
         preemptive_behaviour,
         explain,
         language_threshold,
+        enable_fallback,
     )
+
+
+def is_binary(
+    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = False,
+) -> bool:
+    """
+    Detect whether the given input (file pointer, raw bytes, or file path) points to binary content, i.e. not text.
+    It relies on the same main heuristics and default kwargs, with the sole exception that fallback matches
+    are disabled, making the verdict stricter for content that is ASCII-compatible but unlikely to be text.
+    """
+    if isinstance(fp_or_path_or_payload, (str, PathLike)):
+        guesses = from_path(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    elif isinstance(
+        fp_or_path_or_payload,
+        (
+            bytes,
+            bytearray,
+        ),
+    ):
+        guesses = from_bytes(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    else:
+        guesses = from_fp(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+
+    return not guesses
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
index 56e9321a..13aa062e 100644
--- a/charset_normalizer/md.py
+++ b/charset_normalizer/md.py
@@ -294,14 +294,25 @@ def feed(self, character: str) -> None:
             if buffer_length >= 4:
                 if self._buffer_accent_count / buffer_length > 0.34:
                     self._is_current_word_bad = True
-                # Word/Buffer ending with a upper case accentuated letter are so rare,
+                # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
 
             if buffer_length >= 24 and self._foreign_long_watch:
-                self._foreign_long_count += 1
-                self._is_current_word_bad = True
+                camel_case_dst = [
+                    i
+                    for c, i in zip(self._buffer, range(0, buffer_length))
+                    if c.isupper()
+                ]
+                probable_camel_cased: bool = False
+
+                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
+                    probable_camel_cased = True
+
+                if not probable_camel_cased:
+                    self._foreign_long_count += 1
+                    self._is_current_word_bad = True
 
         if self._is_current_word_bad:
             self._bad_word_count += 1
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 76eafc64..bf2767a0 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -120,12 +120,12 @@ def is_emoticon(character: str) -> bool:
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_separator(character: str) -> bool:
-    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
+    if character.isspace() or character in {"|", "+", "<", ">"}:
         return True
 
     character_category: str = unicodedata.category(character)
 
-    return "Z" in character_category
+    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index 3d900f65..5eed49a4 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.1.1.dev0"
+__version__ = "3.2.0"
 VERSION = __version__.split(".")
diff --git a/docs/api.rst b/docs/api.rst
index 48b74951..d75dd3ac 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -13,6 +13,7 @@ Those functions are publicly exposed and are protected through our BC guarantee.
 .. autofunction:: from_bytes
 .. autofunction:: from_fp
 .. autofunction:: from_path
+.. autofunction:: is_binary
 
 .. autoclass:: charset_normalizer.models.CharsetMatches
    :inherited-members:
@@ -100,5 +101,3 @@ Some reusable functions used across the project. We do not guarantee the BC in t
 
 
 .. class:: os.PathLike
-
-    Used as a generic way to accept AnyStr for paths.
diff --git a/docs/index.rst b/docs/index.rst
index 05d5f98a..ee0595a6 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -51,6 +51,7 @@ Features
 - Transpose any encoded content to Unicode the best we can.
 - Detect spoken language in text.
 - Ship with a great CLI.
+- Detect binary files.
 
 Start Guide
 -----------
diff --git a/docs/user/miscellaneous.rst b/docs/user/miscellaneous.rst
index 281b0b23..c6251396 100644
--- a/docs/user/miscellaneous.rst
+++ b/docs/user/miscellaneous.rst
@@ -44,3 +44,21 @@ On `DEBUG` only one entry will be observed and that is about the detection resul
 
 Then regarding the others log entries, they will be pushed as `Level 5`. Commonly known as TRACE level, but
 we do not register it globally.
+
+
+Detect binaries
+---------------
+
+This package offers a neat way to detect files that can be considered 'binary',
+meaning that they are unlikely to be text files.
+
+ ::
+
+    from charset_normalizer import is_binary
+
+    # It accepts a file path, raw bytes, or even a file pointer.
+    result = is_binary("./my-file.ext")
+
+    # This will print either True or False
+    print(result)
+
diff --git a/setup.cfg b/setup.cfg
index 38a8288d..e45efafe 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,6 +24,7 @@ classifiers =
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
     Programming Language :: Python :: Implementation :: PyPy
     Topic :: Text Processing :: Linguistic
     Topic :: Utilities
diff --git a/tests/test_isbinary.py b/tests/test_isbinary.py
new file mode 100644
index 00000000..b134a8ac
--- /dev/null
+++ b/tests/test_isbinary.py
@@ -0,0 +1,28 @@
+import pytest
+import typing
+from io import BytesIO
+from base64 import b64decode
+from charset_normalizer import is_binary
+from os import path, pardir
+
+DIR_PATH = path.join(
+    path.dirname(path.realpath(__file__)),
+    pardir
+)
+
+
+@pytest.mark.parametrize(
+    "raw, expected",
+    [
+        (b'\x00\x5f\x2f\xff'*50, True),
+        (b64decode("R0lGODlhAQABAAAAACw="), True),
+        (BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True),
+        ('sample-polish.txt', False),
+        ('sample-arabic.txt', False)
+    ]
+)
+def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None:
+    if isinstance(raw, str):
+        raw = DIR_PATH + "/data/{}".format(raw)
+
+    assert is_binary(raw) is expected
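
A minimal usage sketch of the new `is_binary` entry point and the `enable_fallback` knob
introduced above. The file name is a placeholder; the binary case mirrors the test suite in
this patch, while the ASCII case is an assumption based on the default detection behaviour.

 ::

    from io import BytesIO

    from charset_normalizer import from_bytes, is_binary

    # is_binary accepts raw bytes, a binary file pointer, or a path (str / PathLike).
    print(is_binary(b"\x00\x5f\x2f\xff" * 50))    # True: same payload as tests/test_isbinary.py
    print(is_binary(BytesIO(b"Hello, world!")))   # False: plain ASCII text (assumed)
    # is_binary("./my-file.ext")                  # path form; placeholder file name

    # enable_fallback is also exposed on from_bytes / from_fp / from_path. Turning it
    # off removes the ascii/utf_8 fallback entries, so hopeless content yields an
    # empty CharsetMatches instead of a best-effort guess; is_binary relies on this
    # by defaulting to enable_fallback=False.
    strict = from_bytes(b"\x00\x5f\x2f\xff" * 50, enable_fallback=False)
    print(len(strict))                            # 0: nothing plausibly matched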
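The md.py hunk is the actual fix for issue #289; restated below as a standalone sketch
(`looks_camel_cased` is a hypothetical helper, not part of the library): a buffer of 24+
characters under foreign-long watch is now flagged only when it does not look camel-cased,
i.e. when uppercase letters are absent or make up more than 30% of the buffer.

 ::

    def looks_camel_cased(buffer: str) -> bool:
        # Mirrors the camel_case_dst check added in md.py: at least one upper
        # case letter, but no more than 30% of the buffer.
        upper_positions = [i for i, c in enumerate(buffer) if c.isupper()]
        return bool(upper_positions) and len(upper_positions) / len(buffer) <= 0.3

    print(looks_camel_cased("getTextFromDocumentBodyParagraph"))  # True:  no longer penalized
    print(looks_camel_cased("abcdefghijklmnopqrstuvwxyz"))        # False: still counts as foreign-long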
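The utils.py hunk drops ',' and ';' from the hardcoded separator set because the widened
category check (Po/Pd/Pc, plus the existing Z* separators) already covers them. A quick
standard-library check of the Unicode categories involved:

 ::

    import unicodedata

    # ',' and ';' are Po, '-' is Pd, '_' is Pc, ' ' is Zs; '|' is Sm,
    # which is why it stays in the hardcoded set.
    for ch in (",", ";", "-", "_", " ", "|"):
        print(repr(ch), unicodedata.category(ch))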