Skip to content

Commit

Permalink
Fix Zip extraction UTF-8 handling for Python 2.7. (#2546)
Browse files Browse the repository at this point in the history
Previously, zips with non-ASCII file names could cause extraction errors
under Python 2.7 on Linux.

Fixes #298
  • Loading branch information
jsirois authored Oct 1, 2024
1 parent 3f96d54 commit f28046b
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 40 deletions.
9 changes: 9 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# Release Notes

## 2.20.2

This release fixes an old bug handling certain sdist zips under
Python 2.7 as well missing support for Python 3.13's `PYTHON_COLORS`
env var.

* Fix Zip extraction UTF-8 handling for Python 2.7. (#2546)
* Add repl support for `PYTHON_COLORS`. (#2545)

## 2.20.1

This release fixes Pex `--interpreter-constraint` handling such that
Expand Down
119 changes: 97 additions & 22 deletions pex/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
from collections import defaultdict, namedtuple
from datetime import datetime
from uuid import uuid4
from zipfile import ZipFile, ZipInfo

from pex.enum import Enum
from pex.typing import TYPE_CHECKING
from pex.typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
from typing import (
Expand All @@ -43,7 +44,7 @@
Union,
)

_Text = TypeVar("_Text", str, Text)
_Text = TypeVar("_Text", bytes, str, Text)

# We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of
# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT).
Expand Down Expand Up @@ -170,14 +171,25 @@ def teardown(self):
_MKDTEMP_SINGLETON = MktempTeardownRegistry()


class PermPreservingZipFile(zipfile.ZipFile, object):
"""A ZipFile that works around https://bugs.python.org/issue15795."""
class ZipFileEx(ZipFile):
"""A ZipFile that works around several issues in the stdlib.
See:
+ https://bugs.python.org/issue15795
+ https://github.com/pex-tool/pex/issues/298
"""

class ZipEntry(namedtuple("ZipEntry", ["info", "data"])):
pass

@classmethod
def zip_entry_from_file(cls, filename, arcname=None, date_time=None):
def zip_entry_from_file(
cls,
filename, # type: str
arcname=None, # type: Optional[str]
date_time=None, # type: Optional[time.struct_time]
):
# type: (...) -> ZipEntry
"""Construct a ZipEntry for a file on the filesystem.
Usually a similar `zip_info_from_file` method is provided by `ZipInfo`, but it is not
Expand All @@ -196,38 +208,101 @@ def zip_entry_from_file(cls, filename, arcname=None, date_time=None):
arcname += "/"
if date_time is None:
date_time = time.localtime(st.st_mtime)
zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6])
zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes
zip_info = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6])
zip_info.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes
if isdir:
zinfo.file_size = 0
zinfo.external_attr |= 0x10 # MS-DOS directory flag
zinfo.compress_type = zipfile.ZIP_STORED
zip_info.file_size = 0
zip_info.external_attr |= 0x10 # MS-DOS directory flag
zip_info.compress_type = zipfile.ZIP_STORED
data = b""
else:
zinfo.file_size = st.st_size
zinfo.compress_type = zipfile.ZIP_DEFLATED
zip_info.file_size = st.st_size
zip_info.compress_type = zipfile.ZIP_DEFLATED
with open(filename, "rb") as fp:
data = fp.read()
return cls.ZipEntry(info=zinfo, data=data)
return cls.ZipEntry(info=zip_info, data=data)

def _extract_member(self, member, targetpath, pwd):
result = super(PermPreservingZipFile, self)._extract_member(member, targetpath, pwd)
def _extract_member(
self,
member, # type: Union[str, ZipInfo]
targetpath, # type: str
pwd, # type: Optional[bytes]
):
# type: (...) -> str

# MyPy doesn't see the superclass private method.
result = super(ZipFileEx, self)._extract_member( # type: ignore[misc]
member, targetpath, pwd
)
info = member if isinstance(member, zipfile.ZipInfo) else self.getinfo(member)
self._chmod(info, result)
return result
return cast(str, result)

@staticmethod
def _chmod(
info, # type: ZipInfo
path, # type: Text
):
# type: (...) -> None

def _chmod(self, info, path):
# This magic works to extract perm bits from the 32 bit external file attributes field for
# unix-created zip files, for the layout, see:
# https://www.forensicswiki.org/wiki/ZIP#External_file_attributes
if info.external_attr > 0xFFFF:
attr = info.external_attr >> 16
os.chmod(path, attr)

# Python 3 also takes PathLike[str] for the path arg, but we only ever pass str since we support
# Python 2.7 and don't use pathlib as a result.
def extractall( # type: ignore[override]
self,
path=None, # type: Optional[str]
members=None, # type: Optional[Iterable[Union[str, ZipInfo]]]
pwd=None, # type: Optional[bytes]
):
# type: (...) -> None
if sys.version_info[0] != 2:
return super(ZipFileEx, self).extractall(path=path, members=members, pwd=pwd)

# Under Python 2.7, ZipFile does not handle Zip entry name encoding correctly. Older Zip
# standards supported IBM code page 437 and newer support UTF-8. The newer support is
# indicated by the bit 11 flag in the file header.
# From https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT section
# "4.4.4 general purpose bit flag: (2 bytes)":
#
# Bit 11: Language encoding flag (EFS). If this bit is set,
# the filename and comment fields for this file
# MUST be encoded using UTF-8. (see APPENDIX D)
#
# N.B.: MyPy fails to see this code can be reached for Python 2.7.
efs_bit = 1 << 11 # type: ignore[unreachable]

target_path = path or os.getcwd()
for member in members or self.infolist():
info = member if isinstance(member, ZipInfo) else self.getinfo(member)
encoding = "utf-8" if info.flag_bits & efs_bit else "cp437"
member_path = info.filename.encode(encoding)
target = target_path.encode(encoding)

rel_dir = os.path.dirname(member_path)
abs_dir = os.path.join(target, rel_dir)
abs_path = os.path.join(abs_dir, os.path.basename(member_path))
if member_path.endswith(b"/"):
safe_mkdir(abs_path)
else:
safe_mkdir(abs_dir)
with open(abs_path, "wb") as tfp, self.open(info) as zf_entry:
shutil.copyfileobj(zf_entry, tfp)
self._chmod(info, abs_path)


@contextlib.contextmanager
def open_zip(path, *args, **kwargs):
# type: (Text, *Any, **Any) -> Iterator[PermPreservingZipFile]
def open_zip(
path, # type: Text
*args, # type: Any
**kwargs # type: Any
):
# type: (...) -> Iterator[ZipFileEx]
"""A contextmanager for zip files.
Passes through positional and kwargs to zipfile.ZipFile.
Expand All @@ -237,8 +312,8 @@ def open_zip(path, *args, **kwargs):
# extensions across all Pex supported Pythons.
kwargs.setdefault("allowZip64", True)

with contextlib.closing(PermPreservingZipFile(path, *args, **kwargs)) as zip:
yield zip
with contextlib.closing(ZipFileEx(path, *args, **kwargs)) as zip_fp:
yield zip_fp


def deterministic_walk(*args, **kwargs):
Expand Down Expand Up @@ -336,7 +411,7 @@ def safe_delete(filename):


def safe_rmtree(directory):
# type: (Text) -> None
# type: (_Text) -> None
"""Delete a directory if it's present.
If it's not present, no-op.
Expand Down
11 changes: 2 additions & 9 deletions pex/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,7 @@
from pex.atomic_directory import atomic_directory
from pex.cache import access as cache_access
from pex.cache.dirs import CacheDir
from pex.common import (
PermPreservingZipFile,
is_script,
open_zip,
safe_copy,
safe_mkdir,
safe_mkdtemp,
)
from pex.common import ZipFileEx, is_script, open_zip, safe_copy, safe_mkdir, safe_mkdtemp
from pex.enum import Enum
from pex.tracer import TRACER
from pex.typing import TYPE_CHECKING
Expand Down Expand Up @@ -395,7 +388,7 @@ def close(self):
def open(self):
# type: () -> zipfile.ZipFile
if self._zfp is None:
self._zfp = PermPreservingZipFile(self.path)
self._zfp = ZipFileEx(self.path)
return self._zfp

def __getstate__(self):
Expand Down
2 changes: 1 addition & 1 deletion pex/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2015 Pex project contributors.
# Licensed under the Apache License, Version 2.0 (see LICENSE).

__version__ = "2.20.1"
__version__ = "2.20.2"
84 changes: 84 additions & 0 deletions tests/integration/test_issue_298.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Pex project contributors.
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import absolute_import

import os.path
import shutil
import subprocess
from textwrap import dedent

from pex.common import safe_open
from pex.fetcher import URLFetcher
from pex.typing import TYPE_CHECKING
from testing.docker import skip_unless_docker

if TYPE_CHECKING:
from typing import Any


@skip_unless_docker
def test_confounding_encoding(
tmpdir, # type: Any
pex_project_dir, # type: str
):
# type: (...) -> None

sdists = os.path.join(str(tmpdir), "sdists")
with safe_open(
os.path.join(sdists, "CherryPy-7.1.0.zip"), "wb"
) as write_fp, URLFetcher().get_body_stream(
"https://files.pythonhosted.org/packages/"
"b7/dd/e95de2d7042bd53009e8673ca489effebd4a35d9b64b75ecfcca160efaf6/CherryPy-7.1.0.zip"
) as read_fp:
shutil.copyfileobj(read_fp, write_fp)

dest = os.path.join(str(tmpdir), "dest")
subprocess.check_call(
args=[
"docker",
"run",
"--rm",
"-v",
"{code}:/code".format(code=pex_project_dir),
"-w",
"/code",
"-v",
"{sdists}:/sdists".format(sdists=sdists),
"-v",
"{dest}:/dest".format(dest=dest),
"-e",
"LANG=en_US.ISO-8859-1",
"python:2.7-slim",
"python2.7",
"-c",
dedent(
"""\
from __future__ import absolute_import
import os
import zipfile
from pex.common import open_zip
with zipfile.ZipFile("/sdists/CherryPy-7.1.0.zip") as zf:
try:
zf.extractall("/dest")
raise AssertionError(
"Expected standard Python 2.7 ZipFile.extractall to fail."
)
except UnicodeEncodeError as e:
pass
with open_zip("/sdists/CherryPy-7.1.0.zip") as zf:
zf.extractall("/dest")
"""
),
]
)

assert os.path.isfile(
os.path.join(dest, "CherryPy-7.1.0", "cherrypy", "test", "static", "Слава Україні.html")
)
10 changes: 5 additions & 5 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from pex.common import (
Chroot,
PermPreservingZipFile,
ZipFileEx,
can_write_dir,
chmod_plus_x,
deterministic_walk,
Expand Down Expand Up @@ -51,7 +51,7 @@ def zip_fixture():
assert extract_perms(one) != extract_perms(two)

zip_file = os.path.join(target_dir, "test.zip")
with contextlib.closing(PermPreservingZipFile(zip_file, "w")) as zf:
with contextlib.closing(ZipFileEx(zip_file, "w")) as zf:
zf.write(one, "one")
zf.write(two, "two")

Expand All @@ -61,7 +61,7 @@ def zip_fixture():
def test_perm_preserving_zipfile_extractall():
# type: () -> None
with zip_fixture() as (zip_file, extract_dir, one, two):
with contextlib.closing(PermPreservingZipFile(zip_file)) as zf:
with contextlib.closing(ZipFileEx(zip_file)) as zf:
zf.extractall(extract_dir)

assert extract_perms(one) == extract_perms(os.path.join(extract_dir, "one"))
Expand All @@ -71,7 +71,7 @@ def test_perm_preserving_zipfile_extractall():
def test_perm_preserving_zipfile_extract():
# type: () -> None
with zip_fixture() as (zip_file, extract_dir, one, two):
with contextlib.closing(PermPreservingZipFile(zip_file)) as zf:
with contextlib.closing(ZipFileEx(zip_file)) as zf:
zf.extract("one", path=extract_dir)
zf.extract("two", path=extract_dir)

Expand All @@ -98,7 +98,7 @@ def assert_chroot_perms(copyfn):
zip_path = os.path.join(src, "chroot.zip")
chroot.zip(zip_path)
with temporary_dir() as extract_dir:
with contextlib.closing(PermPreservingZipFile(zip_path)) as zf:
with contextlib.closing(ZipFileEx(zip_path)) as zf:
zf.extractall(extract_dir)

assert extract_perms(one) == extract_perms(os.path.join(extract_dir, "one"))
Expand Down
6 changes: 3 additions & 3 deletions tests/test_enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest

from pex.atomic_directory import AtomicDirectory
from pex.common import PermPreservingZipFile
from pex.common import ZipFileEx
from pex.compatibility import PY2
from pex.enum import Enum, qualified_name

Expand Down Expand Up @@ -116,9 +116,9 @@ def test_qualified_name():
AtomicDirectory.work_dir
), "Expected @property to be handled."

expected_prefix = "pex.common." if PY2 else "pex.common.PermPreservingZipFile."
expected_prefix = "pex.common." if PY2 else "pex.common.ZipFileEx."
assert expected_prefix + "zip_entry_from_file" == qualified_name(
PermPreservingZipFile.zip_entry_from_file
ZipFileEx.zip_entry_from_file
), "Expected @classmethod to be handled."

class Test(object):
Expand Down

0 comments on commit f28046b

Please sign in to comment.