diff --git a/.github/workflows/deploy-package-to-pypi.yml b/.github/workflows/deploy-package-to-pypi.yml index 8257c1a..e8bd265 100644 --- a/.github/workflows/deploy-package-to-pypi.yml +++ b/.github/workflows/deploy-package-to-pypi.yml @@ -5,29 +5,161 @@ on: types: [published] jobs: - build: + build-source: + name: Build source package runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-python@v3 + - name: Update version in pyproject.toml from current git tag + run: | + sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF_NAME#v}/g" pyproject.toml + + - uses: actions/setup-python@v4 + with: + python-version: "3.13" + + - name: Build package + run: | + pip install build + python -m build --sdist + + - uses: actions/upload-artifact@v4 with: - python-version: 3.11 + name: source + path: ./dist + + build-linux: + name: Build Linux wheels + runs-on: ubuntu-latest + strategy: + matrix: + image: + - "manylinux2014_x86_64" + - "musllinux_1_1_x86_64" + - "manylinux2014_aarch64" + - "musllinux_1_1_aarch64" + folder: + - "cp37-cp37m" + - "cp38-cp38" + - "cp39-cp39" + - "cp310-cp310" + - "cp311-cp311" + - "cp312-cp312" + - "cp313-cp313" + + steps: + - uses: actions/checkout@v4 - name: Update version in pyproject.toml from current git tag + run: | + sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF_NAME#v}/g" pyproject.toml + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + with: + platforms: arm64 + + - name: Build packages run: >- - sed -i "s/0\\.0\\.0\\.dev0/${GITHUB_REF/refs\/tags\/v/}/g" pyproject.toml + docker run --rm -v ${{ github.workspace }}:/app quay.io/pypa/${{ matrix.image }} bash -c ' + cd /app && + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && + . 
"$HOME/.cargo/env" && + /opt/python/${{ matrix.folder }}/bin/python -m build --wheel && + auditwheel repair $(ls dist/*.whl) && + rm dist/*.whl && + cp wheelhouse/*.whl dist + ' - - run: | + - uses: actions/upload-artifact@v4 + with: + name: linux-${{ matrix.image }}-${{ matrix.folder }} + path: ./dist + + build-macos: + name: Build macOS wheels + strategy: + matrix: + os: + - "macos-12" + - "macos-13" + - "macos-14" # ARM + python-version: + - "3.7.1" + - "3.8.10" + - "3.9.13" + - "3.10.11" + - "3.11.9" + - "3.12.6" + - "3.13.0" + exclude: + - python-version: "3.7.1" + os: "macos-14" + runs-on: '${{ matrix.os }}' + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '${{ matrix.python-version }}' + + - name: Update version in pyproject.toml from current git tag + run: | + sed -i "" "s/0\\.0\\.0\\.dev0/${GITHUB_REF_NAME#v}/g" pyproject.toml + + - name: Build package + run: | + pip install build + python -m build --wheel + + - uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.os }}-${{ matrix.python-version }} + path: ./dist + + build-windows: + name: Build Windows wheels + strategy: + matrix: + os: + - "windows-2019" + python-version: + - "3.7.1" + - "3.8.0" + - "3.9.0" + - "3.10.0" + - "3.11.0" + - "3.12.0" + - "3.13.0" + runs-on: '${{ matrix.os }}' + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '${{ matrix.python-version }}' + + - name: Update version in pyproject.toml from current git tag + run: | + (Get-Content pyproject.toml).Replace('0.0.0.dev0', ($Env:GITHUB_REF_NAME -replace '^v', '')) | Set-Content pyproject.toml + + - name: Build package + run: | pip install build - python -m build + python -m build --wheel - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: ${{ matrix.os }}-${{ matrix.python-version }} path: ./dist deploy: - needs: [build] + needs: + - build-source + - build-linux + - build-macos + - build-windows environment: name: 
pypi url: https://pypi.org/project/stream-unzip/ @@ -37,9 +169,20 @@ jobs: permissions: id-token: write steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 + with: + path: ./dist + + # The "merge-multiple" option of download-artifact seems to cause corruption when there are + # multiple files of the same name, which happens because some different macOS versions + # make the exact same Python package. So we avoid that and do a manual move of packages + # to the top level for upload + - name: Move packages to top level + run: | + find ./dist -mindepth 2 -type f -exec mv -t ./dist -i '{}' + + rm -R -- ./dist/*/ - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: - packages_dir: artifact/ + packages_dir: ./dist/ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f878c3f..791efd5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,8 +14,6 @@ jobs: # If changing how many times tests are run, must also change in codecov.yml # to ensure test coverage is reported only after all tests have finished include: - - python-version: "3.6.7" - os: "ubuntu-20.04" - python-version: "3.7.1" os: "ubuntu-20.04" - python-version: "3.7.2" diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..315f34b --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,178 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "crc32-v2" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f546fcecc3490696c3bea070d8949208279bbc220a5a7738573a10f584cda51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "libc" +version = "0.2.159" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "portable-atomic" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" + +[[package]] +name = "proc-macro2" +version = "1.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "stream_unzip_zipcrypto_decrypt" +version = "0.1.0" +dependencies = [ + "crc32-v2", + "pyo3", +] + +[[package]] +name = "syn" +version = "2.0.79" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f3553b1 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "stream_unzip_zipcrypto_decrypt" +version = "0.1.0" +edition = "2021" + +[lib] +name = "stream_unzip_zipcrypto_decrypt" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.22.5", features = ["extension-module", "gil-refs"] } +crc32-v2 = "0.0.4" diff --git a/README.md b/README.md index 22ad42d..75b9192 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ In addition to being memory efficient, stream-unzip supports: - WinZip-style AES-encrypted / password-protected ZIPs. Python's zipfile module cannot open AES-encrypted ZIPs. -- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. +- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. Decrypting ZipCrypto with stream-unzip is approximately 10 times faster than Python's zipfile module. - ZIP files created by Java's ZipOutputStream that are larger than 4GiB. At the time of writing libarchive-based stream readers cannot read these without error. 
diff --git a/docs/async-interface.md b/docs/async-interface.md index 6418461..ffa8690 100644 --- a/docs/async-interface.md +++ b/docs/async-interface.md @@ -44,5 +44,3 @@ The async interface is compatible with both [asyncio](https://docs.python.org/3/ > 2. The [contextvars](https://docs.python.org/3/library/contextvars.html) context available in the async iterables of files or data is a shallow copy of the context where async_stream_unzip is called from. > > This means that existing context variables are available inside the input iterable, but any changes made to the context itself from inside the iterable will not propagate out to the original context. Changes made to mutable data structures that are part of the context, for example dictionaries, will propagate out. -> -> This does not affect Python 3.6, because contextvars is not available. diff --git a/docs/features.md b/docs/features.md index 54a073f..fd3fe3f 100644 --- a/docs/features.md +++ b/docs/features.md @@ -15,7 +15,7 @@ In addition to being memory efficient, stream-unzip supports: - WinZip-style AES-encrypted / password-protected ZIPs. Python's zipfile module cannot open AES-encrypted ZIPs. -- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. +- Legacy-encrypted / password-protected ZIP files. This is also known as ZipCrypto/Zip 2.0. Decrypting ZipCrypto with stream-unzip is approximately 10 times faster than Python's zipfile module. - ZIP files created by Java's ZipOutputStream that are larger than 4GiB. At the time of writing libarchive-based stream readers cannot read these without error. 
diff --git a/docs/get-started.md b/docs/get-started.md index b00c484..84aa1b7 100644 --- a/docs/get-started.md +++ b/docs/get-started.md @@ -7,7 +7,7 @@ title: Get started ## Prerequisites -Python 3.6.7+ +Python 3.7.1+ ## Installation diff --git a/pyproject.toml b/pyproject.toml index 91d8bc1..5bd7158 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" +requires = ["maturin>=0.12,<0.13"] +build-backend = "maturin" [project] name = "stream-unzip" @@ -10,7 +10,7 @@ authors = [ ] description = "Python function to stream unzip all the files in a ZIP archive, without loading the entire ZIP file into memory or any of its uncompressed files" readme = "README.md" -requires-python = ">=3.6.7" +requires-python = ">=3.7.1" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", @@ -37,7 +37,8 @@ ci = [ "Documentation" = "https://stream-unzip.docs.trade.gov.uk/" "Source" = "https://github.com/uktrade/stream-unzip" -[tool.hatch.build] +[tool.maturin] include = [ "stream_unzip.py", + "src/**" ] diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..d18d1a5 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,88 @@ +use pyo3::prelude::*; +use pyo3::types::PyBytes; +use crc32_v2::crc32; + +// ZipCrypto key initialization vector and constants +const ZIPCRYPTO_KEY_0: u32 = 0x12345678; +const ZIPCRYPTO_KEY_1: u32 = 0x23456789; +const ZIPCRYPTO_KEY_2: u32 = 0x34567890; + +#[derive(Clone)] +struct ZipCrypto { + key_0: u32, + key_1: u32, + key_2: u32, +} + +impl ZipCrypto { + fn new() -> Self { + ZipCrypto { + key_0: ZIPCRYPTO_KEY_0, + key_1: ZIPCRYPTO_KEY_1, + key_2: ZIPCRYPTO_KEY_2, + } + } + + #[inline(always)] + fn init_password(&mut self, password: &[u8]) { + for &b in password { + self.update_keys(b); + } + } + + #[inline(always)] + fn update_keys(&mut self, byte: u8) { + self.key_0 = !crc32(!self.key_0, &[byte]); + + self.key_1 = self + .key_1 + 
.wrapping_add(self.key_0 & 0xFF) + .wrapping_mul(134775813) + .wrapping_add(1); + + let temp_byte = (self.key_1 >> 24) as u8; + self.key_2 = !crc32(!self.key_2, &[temp_byte]); + } + + #[inline(always)] + fn decrypt_byte(&mut self, byte: u8) -> u8 { + let temp = (self.key_2 | 2) as u16; + let key = (((temp.wrapping_mul(temp ^ 1)) >> 8) & 0xFF) as u8; + let decrypted = byte ^ key; + self.update_keys(decrypted); + decrypted + } + + #[inline(always)] + fn decrypt_chunk(&mut self, chunk: &[u8]) -> Vec<u8> { + chunk.iter().map(|&b| self.decrypt_byte(b)).collect() + } +} + +#[pyclass(name = "zipcrypto_decryptor")] +struct StreamUnzipZipCryptoDecryptor { + zipcrypto: ZipCrypto, +} + +#[pymethods] +impl StreamUnzipZipCryptoDecryptor { + #[new] + fn new(password: &[u8]) -> Self { + let mut zipcrypto = ZipCrypto::new(); + zipcrypto.init_password(password); + StreamUnzipZipCryptoDecryptor { zipcrypto } + } + + // Decrypts a single chunk and returns the decrypted result + fn __call__<'py>(&mut self, py: Python<'py>, chunk: Vec<u8>) -> PyResult<&'py PyBytes> { + let result = self.zipcrypto.decrypt_chunk(&chunk); + // Return the decrypted result as a Python bytes object so it can be used in Python code + Ok(PyBytes::new(py, &result)) + } +} + +#[pymodule] +fn stream_unzip_zipcrypto_decrypt(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::<StreamUnzipZipCryptoDecryptor>()?; + Ok(()) +} diff --git a/stream_unzip.py b/stream_unzip.py index 0cbe147..ea649ae 100644 --- a/stream_unzip.py +++ b/stream_unzip.py @@ -1,6 +1,6 @@ -from functools import partial from struct import Struct import asyncio +import contextvars import bz2 import zlib @@ -11,6 +11,8 @@ from stream_inflate import stream_inflate64 +from stream_unzip_zipcrypto_decrypt import zipcrypto_decryptor + NO_ENCRYPTION = object() ZIP_CRYPTO = object() @@ -226,39 +228,7 @@ def get_extra_value(extra, if_true, signature, exception_if_missing, min_length, return value def decrypt_weak_decompress(chunks, decompress, is_done, num_unused): - # There are a few 
optimisations that make this code unusual: - # - There is code repetition (to avoid function calls inside loops) - # - We assign global variables to local (to avoid the dictionary lookups globals involve) - # - Use bytearray rather than bytes (to avoid allocating memory) - # - Avoids intermediate statements/variables (to minimise unnecessary operations) - # From some light tests these make it ~5%-10% faster than Python's zipfile (although it - # does use similar optimisations from what I can tell) - key_0 = 305419896 - key_1 = 591751049 - key_2 = 878082192 - crc32 = zlib.crc32 - bytearray_byte = bytearray(1) - - def decrypt(chunk): - nonlocal key_0, key_1, key_2 - chunk = bytearray(chunk) - for i, byte in enumerate(chunk): - temp = key_2 | 2 - byte ^= ((temp * (temp ^ 1)) >> 8) & 0xFF - bytearray_byte[0] = byte - key_0 = ~crc32(bytearray_byte, ~key_0) & 0xFFFFFFFF - key_1 = ((((key_1 + (key_0 & 0xFF)) & 0xFFFFFFFF) * 134775813) + 1) & 0xFFFFFFFF - bytearray_byte[0] = key_1 >> 24 - key_2 = ~crc32(bytearray_byte, ~key_2) & 0xFFFFFFFF - chunk[i] = byte - return chunk - - for byte in password: - bytearray_byte[0] = byte - key_0 = ~crc32(bytearray_byte, ~key_0) & 0xFFFFFFFF - key_1 = ((((key_1 + (key_0 & 0xFF)) & 0xFFFFFFFF) * 134775813) + 1) & 0xFFFFFFFF - bytearray_byte[0] = key_1 >> 24 - key_2 = ~crc32(bytearray_byte, ~key_2) & 0xFFFFFFFF + decrypt = zipcrypto_decryptor(password) encryption_header = decrypt(get_num(12)) check_password_byte = \ @@ -526,14 +496,7 @@ async def to_async_iterable(sync_iterable): # propagated by run_in_executor, so we use a sentinel to detect the end of the iterable done = object() it = iter(sync_iterable) - - # contextvars are not available until Python 3.7 - try: - import contextvars - except ImportError: - get_args = lambda: (next, it, done) - else: - get_args = lambda: (contextvars.copy_context().run, next, it, done) + get_args = lambda: (contextvars.copy_context().run, next, it, done) while True: if trio is not None: @@ -557,17 
+520,12 @@ def to_sync_iterable(async_iterable): break yield value - # A slightly complex dance to both find the asyncio event loop in various versions of Python, - # but also to work out if we're not in an asyncio event loop and instead in trio - # Note that get_running_loop is preferred, but isn't available until Python 3.7 + # A slightly complex dance to both find the asyncio event loop and to work out if we're not in + # an asyncio event loop and instead in trio trio = None loop = None try: loop = asyncio.get_running_loop() - except AttributeError: - loop = asyncio.get_event_loop() - if not loop.is_running(): - loop = None except RuntimeError: loop = None