Skip to content

Commit

Permalink
chore(python): WIP backport of changes from skalt/pagefind_python
Browse files Browse the repository at this point in the history
  • Loading branch information
SKalt committed Aug 20, 2024
1 parent 6185637 commit adf5168
Show file tree
Hide file tree
Showing 29 changed files with 725 additions and 56 deletions.
4 changes: 4 additions & 0 deletions .shellcheckrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
external-sources=true
source-path=SCRIPTDIR
disable=SC2002
# SC2002: ignore "useless cat" warning: starting pipes with `cat` improves composability
4 changes: 2 additions & 2 deletions test_ci.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

set -eu
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
cd "$SCRIPT_DIR"

PAGEFIND=$(realpath "$SCRIPT_DIR/target/$1/pagefind")
REPO_WD=$(realpath "$SCRIPT_DIR")
Expand Down
2 changes: 2 additions & 0 deletions wrappers/python/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
output
# ^ from src/tests/integration.py
2 changes: 2 additions & 0 deletions wrappers/python/poetry.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[virtualenvs]
in-project = true
Empty file.
16 changes: 16 additions & 0 deletions wrappers/python/scripts/build/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import logging
import os
from pathlib import Path

# Locations of this build package and of the directories it reads/writes.
this_file = Path(__file__)
this_dir = this_file.parent
# two levels above this package directory
python_root = this_dir.parent.parent.resolve().absolute()
upstream_version_file = python_root / "pagefind_version.txt"
dist_dir = python_root / "dist"
vendor_dir = python_root / "vendor"


def setup_logging() -> None:
    """Configure the root logger for the build scripts.

    The level is read from the ``PAGEFIND_PYTHON_LOG_LEVEL`` environment
    variable; when that is unset or empty, ``logging.INFO`` is used.
    """
    requested_level = os.environ.get("PAGEFIND_PYTHON_LOG_LEVEL")
    effective_level = requested_level or logging.INFO
    logging.basicConfig(level=effective_level)
70 changes: 70 additions & 0 deletions wrappers/python/scripts/build/all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import tarfile
import tempfile
from pathlib import Path
from typing import List, Optional

from . import dist_dir, setup_logging
from .binary_only_wheel import (
    LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS,
    write_pagefind_bin_only_wheel,
)
from .get_pagefind_release import download

# file names that find_bin() accepts as the Pagefind executable
__candidates = (
    "pagefind",
    "pagefind.exe",
    "pagefind_extended",
    "pagefind_extended.exe",
)


def find_bin(dir: Path) -> Path:
    """Return the first regular file directly inside ``dir`` with a known name.

    Only the immediate children of ``dir`` are inspected, in the order
    ``Path.iterdir`` yields them.

    Raises:
        FileNotFoundError: if no child is a file named like a Pagefind binary.
    """
    matching = (
        entry
        for entry in dir.iterdir()
        if entry.is_file() and entry.name in __candidates
    )
    found = next(matching, None)
    if found is None:
        raise FileNotFoundError(f"Could not find any of {__candidates} in {dir}")
    return found


def get_llvm_triple(tar_gz: Path, tag_name: Optional[str] = None) -> str:
    """Parse the LLVM target triple out of a release archive's file name.

    The archive name is expected to look like
    ``pagefind-<tag>-<triple>.tar.gz`` or
    ``pagefind_extended-<tag>-<triple>.tar.gz``; the ``.tar.gz`` suffix and
    whichever of those two prefixes is present are stripped.

    Args:
        tar_gz: path of the archive; only its file name is inspected.
        tag_name: release tag embedded in the archive name (e.g. ``"v1.0.0"``).
            When omitted, the module-level ``tag_name`` set by the script
            entry point is used, which keeps existing one-argument calls
            working.

    Returns:
        The remainder of the file name, i.e. the LLVM triple.

    Raises:
        ValueError: if the file name does not end in ``.tar.gz``, or if no
            tag name was passed and no module-level ``tag_name`` exists.
    """
    archive_suffix = ".tar.gz"
    # raise instead of assert: asserts are stripped when running with -O
    if not tar_gz.name.endswith(archive_suffix):
        raise ValueError(f"Expected a .tar.gz archive, got: {tar_gz.name}")
    if tag_name is None:
        # The parameter shadows the global of the same name, so look the
        # global up explicitly instead of relying on implicit name resolution.
        tag_name = globals().get("tag_name")
        if tag_name is None:
            raise ValueError(
                "tag_name was not provided and no module-level tag_name is set"
            )
    llvm_triple = tar_gz.name.removesuffix(archive_suffix)
    for prefix in (f"pagefind-{tag_name}-", f"pagefind_extended-{tag_name}-"):
        llvm_triple = llvm_triple.removeprefix(prefix)
    return llvm_triple


def check_platforms(certified: List[Path]) -> None:
    """Verify that every archive targets a platform we can build a wheel for.

    Each archive name is reduced to its LLVM triple, which must have an entry
    in ``LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS``.

    Raises:
        ValueError: for the first archive whose triple has no known wheel
            platform.
    """
    for archive in certified:
        triple = get_llvm_triple(archive)
        if LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS.get(triple) is None:
            raise ValueError(f"Unsupported platform: {triple}")


if __name__ == "__main__":
    setup_logging()
    # NOTE: `tag_name` has to stay a module-level name, because
    # get_llvm_triple() looks it up as a global.
    # presumably `certified` is the list of verified release archives and
    # `tag_name` the release tag -- confirm against get_pagefind_release.
    certified, tag_name = download("latest", dry_run=False)
    # validate every archive up front, before any wheel is written
    check_platforms(certified)
    dist_dir.mkdir(exist_ok=True)
    for tar_gz in certified:
        triple = get_llvm_triple(tar_gz)
        wheel_platform = LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS.get(triple)
        if wheel_platform is None:
            raise ValueError(f"Unsupported platform: {triple}")

        # FIXME: avoid writing the extracted bin to disk
        # unpack the tar.gz archive into a temp directory that only lives
        # for the duration of this iteration
        archive_stem = tar_gz.name.removesuffix(".tar.gz")
        with tempfile.TemporaryDirectory(prefix=archive_stem + "~") as scratch:
            with tarfile.open(tar_gz, "r:gz") as archive:
                archive.extractall(scratch)
            extracted_bin = find_bin(Path(scratch))
            write_pagefind_bin_only_wheel(
                executable=extracted_bin,
                output_dir=dist_dir,
                version=tag_name.removeprefix("v"),
                platform=wheel_platform,
            )
28 changes: 28 additions & 0 deletions wrappers/python/scripts/build/api_package.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# HACK: This script is a hack to build the API package without using poetry to lock the
# optional dependencies. It might be preferable to use setuptools directly rather than
# work around poetry.

from . import python_root, setup_logging
import subprocess

# the poetry project file that main() temporarily rewrites before building
pyproject_toml = python_root / "pyproject.toml"


def main() -> None:
    """Build the API package with the ``#!!opt`` lines of pyproject.toml enabled.

    Every line of ``pyproject.toml`` that ends with the ``#!!opt`` marker has
    a leading ``"# "`` removed (i.e. it is un-commented), ``poetry build`` is
    run against the rewritten file, and the original file content is written
    back afterwards.

    Raises:
        subprocess.CalledProcessError: if ``poetry build`` exits non-zero.
            The original ``pyproject.toml`` is restored before the error
            propagates.
    """
    original = pyproject_toml.read_text()
    temp = "".join(
        (line.removeprefix("# ") if line.endswith("#!!opt") else line) + "\n"
        for line in original.splitlines()
    )
    try:
        pyproject_toml.write_text(temp)
        subprocess.run(["poetry", "build"], check=True)
    finally:
        # Always put the original file back, even when the write or the build
        # fails; otherwise the working tree is left with the rewritten file.
        pyproject_toml.write_text(original)


# Script entry point: configure logging, then build the API package.
if __name__ == "__main__":
    setup_logging()
    main()
254 changes: 254 additions & 0 deletions wrappers/python/scripts/build/binary_only_wheel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
#!/usr/bin/env python3
# Adapted from https://github.com/ziglang/zig-pypi/blob/a0ca0d8b2d5104498f4eececff09ed2b1ede2d0b/make_wheels.py
# See also https://simonwillison.net/2022/May/23/bundling-binary-tools-in-python-wheels/
#
# Note that this script assumes that the relevant files are on disk and that
# either the files' hashes have been verified or we trust the files.
import argparse
import logging
from email.message import EmailMessage
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
from zipfile import ZIP_DEFLATED, ZipInfo

import wheel # type: ignore
import wheel.wheelfile # type: ignore

from . import python_root, setup_logging

# module-level logger for this script
log = logging.getLogger(__name__)
# constants
# URLs written into the wheel's "Project-URL" metadata
HOMEPAGE = "https://pagefind.app"
# NOTE: REPO ends with "/"; strip it before appending further path segments
REPO = "https://github.com/CloudCannon/pagefind/"
# written into the wheel's "Requires-Python" metadata
REQUIRED_PYTHON_VERSION = "~=3.9"


# directory holding the python sources and README that are copied into each wheel
src_dir = python_root / "src" / "pagefind_python_bin"
# checked at import time: importing this module fails if the directory is missing
assert src_dir.is_dir(), f"{src_dir} is not a directory"


# as of the time of writing, these are the supported platforms:
# See https://doc.rust-lang.org/nightly/rustc/platform-support.html
# wheel name format: {dist}-{version}(-{build})?-{python}-{abi}-{platform}.whl
# this dict helps look up the last part of the wheel name: ^^^^^^^^^^
LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS = {
    # LLVM triple: Python platform
    # only the LLVM triples that are produced in CI are listed here; see
    # https://github.com/CloudCannon/pagefind/releases/latest
    # the python platform mapping is copied from zig-pypi's script.
    # See also: https://github.com/PyO3/maturin/blob/main/src/auditwheel/manylinux-policy.json
    # See also: https://github.com/PyO3/maturin/blob/main/src/auditwheel/musllinux-policy.json
    # TODO: check the python platforms are correct.
    # Every value must be unique: the platform tag is part of the wheel file
    # name, so two triples sharing a tag would overwrite each other's wheel.
    "aarch64-apple-darwin": "macosx_12_0_arm64",
    "aarch64-unknown-linux-musl": "manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64",
    # Intel macOS binaries need the x86_64 tag, not the arm64 one.
    "x86_64-apple-darwin": "macosx_12_0_x86_64",
    "x86_64-pc-windows-msvc": "win_amd64",
    "x86_64-unknown-linux-musl": "manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64",
}


def as_zip_info(file: Path, *, alias: str) -> Tuple[ZipInfo, bytes]:
    """Read ``file`` and describe it as a zip archive member.

    Args:
        file: path of the file on disk.
        alias: name of the member inside the archive; when empty, the file's
            own name is used.

    Returns:
        A ``(ZipInfo, data)`` pair. The ZipInfo carries a fixed 1980-01-01
        timestamp, the file's ``st_mode`` in the upper 16 bits of
        ``external_attr``, and the length of ``data`` as ``file_size``.
    """
    arc_name = alias if alias else file.name
    entry = ZipInfo(arc_name, (1980, 1, 1, 0, 0, 0))
    entry.external_attr = file.stat().st_mode << 16
    payload = file.read_bytes()
    entry.file_size = len(payload)
    return entry, payload


class ReproducibleWheelFile(wheel.wheelfile.WheelFile):  # type: ignore
    """WheelFile that writes every member with fixed, input-independent metadata."""

    def writestr(
        self,
        zip_info_or_arc_name: Union[ZipInfo, str],
        data: Any,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Write ``data`` as an archive member with normalized attributes.

        A ready-made ``ZipInfo`` is used as given; a plain member name gets a
        fresh ``ZipInfo`` with mode 0o644 (0o664 for ``.dist-info/RECORD``).
        In both cases the compression type, timestamp and ``create_system``
        are overwritten with fixed values before delegating to
        ``WheelFile.writestr``.
        """
        if not isinstance(zip_info_or_arc_name, ZipInfo):
            assert isinstance(zip_info_or_arc_name, str)
            arc_name = zip_info_or_arc_name
            entry = ZipInfo(arc_name)
            entry.file_size = len(data)
            is_record = arc_name.endswith(".dist-info/RECORD")
            entry.external_attr = (0o0664 if is_record else 0o0644) << 16
        else:
            entry = zip_info_or_arc_name

        entry.compress_type = ZIP_DEFLATED
        entry.date_time = (1980, 1, 1, 0, 0, 0)
        entry.create_system = 3
        wheel.wheelfile.WheelFile.writestr(self, entry, data, *args, **kwargs)


def make_message(
    headers: Dict[str, Union[str, List[str]]],
    payload: Optional[Union[str, bytes]] = None,
) -> EmailMessage:
    """Build an ``EmailMessage`` from a header mapping and an optional body.

    Args:
        headers: header name -> value. A list value adds the header once per
            list item, in order.
        payload: message body; skipped when ``None`` or empty.

    Returns:
        The populated message.
    """
    message = EmailMessage()
    for header, raw in headers.items():
        # treat a single value as a one-element list so both cases share a loop
        entries = raw if isinstance(raw, list) else [raw]
        for entry in entries:
            message[header] = entry
    if payload:
        message.set_payload(payload)
    return message


def write_wheel_file(
    filename: Path,
    contents: Mapping[
        Union[str, ZipInfo], Union[str, bytes, EmailMessage, ZipInfo, Path]
    ],
) -> Path:
    """Write ``contents`` into a new wheel archive at ``filename``.

    Each value is converted to bytes according to its type: ``str`` is
    UTF-8 encoded, ``bytes`` is used as-is, an ``EmailMessage`` is serialized
    with ``"\\n"`` line endings, and a ``Path`` is read from disk (its key,
    which must be a ``str``, becomes the member name).

    Returns:
        ``filename``, unchanged.

    Raises:
        ValueError: if a value has any other type.
    """
    with ReproducibleWheelFile(filename, "w") as archive:
        for entry, source in contents.items():
            if isinstance(source, Path):
                assert type(entry) is str
                entry, payload = as_zip_info(source, alias=entry)
            elif isinstance(source, EmailMessage):
                lf_policy = source.policy.clone(linesep="\n")
                payload = source.as_bytes(policy=lf_policy, unixfrom=False)
            elif isinstance(source, bytes):
                payload = source
            elif isinstance(source, str):
                payload = source.encode("utf-8")
            else:
                raise ValueError(f"unexpected content: {type(source)}")
            archive.writestr(entry, payload)
    return filename


def write_wheel(
    out_dir: Path,
    *,
    name: str,
    version: str,
    tag: str,
    metadata: Dict[str, Any],
    description: str,
    contents: Mapping[
        Union[str, ZipInfo], Union[str, bytes, EmailMessage, ZipInfo, Path]
    ],
) -> Path:
    """Write ``<name>-<version>-<tag>.whl`` into ``out_dir``.

    The archive holds ``contents`` followed by the generated
    ``<name>-<version>.dist-info/METADATA`` and ``.../WHEEL`` members.

    Args:
        out_dir: directory that receives the wheel.
        name: distribution name.
        version: distribution version.
        tag: wheel tag, also recorded in the WHEEL file.
        metadata: extra METADATA headers, merged after Metadata-Version,
            Name and Version (so they may override those).
        description: body of the METADATA file.
        contents: archive members, in the form ``write_wheel_file`` accepts.

    Returns:
        The path of the written wheel.
    """
    wheel_path = out_dir / f"{name}-{version}-{tag}.whl"
    dist_info = f"{name}-{version}.dist-info"

    members = dict(contents)
    # see https://packaging.python.org/en/latest/specifications/core-metadata/
    members[f"{dist_info}/METADATA"] = make_message(
        {
            "Metadata-Version": "2.1",
            "Name": name,
            "Version": version,
            **metadata,
        },
        description,
    )
    # Root-Is-Purelib: see https://packaging.python.org/en/latest/specifications/binary-distribution-format/#what-s-the-deal-with-purelib-vs-platlib
    members[f"{dist_info}/WHEEL"] = make_message(
        {
            "Wheel-Version": "1.0",
            "Generator": "build_binary_only_wheel.py",
            "Root-Is-Purelib": "false",
            "Tag": tag,
        }
    )
    return write_wheel_file(wheel_path, members)


def write_pagefind_bin_only_wheel(
    *,
    executable: Path,
    output_dir: Path,
    version: str,
    platform: str,
) -> Path:
    """Package a Pagefind executable as a binary-only wheel.

    The wheel is named ``pagefind_bin``, or ``pagefind_bin_extended`` when the
    executable's file name contains ``"extended"``. It contains the
    ``__init__.py`` and ``__main__.py`` from ``src_dir`` plus the executable
    itself, and uses ``src_dir / "README.md"`` (with ``pagefind_bin`` replaced
    by the actual package name) as its long description.

    Args:
        executable: path of the Pagefind binary to embed.
        output_dir: directory that receives the wheel.
        version: version recorded in the wheel name and metadata.
        platform: python platform tag; the wheel tag is ``py3-none-<platform>``.

    Returns:
        The path of the written wheel.
    """
    # FIXME: update when package support is stabilized
    name = "pagefind_bin"
    if "extended" in executable.name:
        name += "_extended"
    contents: Mapping[Union[str, ZipInfo], Path] = {
        f"{name}/__init__.py": (src_dir / "__init__.py"),
        f"{name}/__main__.py": (src_dir / "__main__.py"),
        f"{name}/{executable.name}": executable,
    }

    # Load in static files. Decode explicitly as UTF-8 so the description does
    # not depend on the locale encoding of the machine building the wheel.
    readme = (src_dir / "README.md").read_text(encoding="utf-8")
    description = readme.replace("pagefind_bin", name)

    # REPO may end with "/"; strip it so the issues URL has no "//" in its path.
    repo_base = REPO.rstrip("/")

    return write_wheel(
        output_dir,
        name=name,
        version=version,
        tag=f"py3-none-{platform}",
        metadata={
            "Summary": "Pagefind is a library for performant, low-bandwidth, fully static search.",
            "Description-Content-Type": "text/markdown",
            "License": "MIT",
            "Author": "CloudCannon",
            "Classifier": [
                "License :: OSI Approved :: MIT License",
                "Development Status :: 3 - Alpha",  # FIXME: update when package name stabilized
                "Intended Audience :: Developers",
            ],
            "Project-URL": [
                f"Homepage, {HOMEPAGE}",
                f"Source Code, {REPO}",
                f"Bug Tracker, {repo_base}/issues",
            ],
            "Requires-Python": REQUIRED_PYTHON_VERSION,
        },
        description=description,
        contents=contents,
    )


def get_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    Options: ``--version`` (default ``None``), ``--suffix`` (default ``""``),
    ``--bin-path`` (required), ``--output-dir`` (default ``"dist/"``) and
    ``--llvm-triple`` (required; restricted to the keys of
    ``LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS``).
    """
    cli = argparse.ArgumentParser(
        description="Repackage Pagefind binaries as Python wheels",
        prog=__file__,
    )
    cli.add_argument("--version", help="version to package", default=None)
    cli.add_argument("--suffix", help="wheel version suffix", default="")
    cli.add_argument(
        "--bin-path",
        required=True,
        help="path to the binary to embed",
    )
    cli.add_argument(
        "--output-dir",
        help="Output directory in which to place the built wheel",
        default="dist/",
    )
    supported_triples = list(LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS)
    cli.add_argument(
        "--llvm-triple",
        choices=supported_triples,
        required=True,
        help="platform to build for",
    )
    return cli


def main() -> None:
    """Command-line entry point: repackage one Pagefind binary as a wheel.

    Parses the arguments defined by ``get_arg_parser``, maps the LLVM triple
    to a python platform tag and writes the wheel into the output directory
    (created if missing). The wheel version is ``--version`` followed by
    ``--suffix``.

    Raises:
        SystemExit: (via ``parser.error``) when ``--version`` is missing.
        ValueError: when the LLVM triple has no known python platform.
    """
    setup_logging()
    parser = get_arg_parser()
    args = parser.parse_args()
    if args.version is None:
        # --version defaults to None; passing that on would produce a wheel
        # named "<name>-None-<tag>.whl", so fail early with a usage error.
        parser.error("--version is required")
    platform = LLVM_TRIPLES_TO_PYTHON_WHEEL_PLATFORMS.get(args.llvm_triple)
    if platform is None:
        raise ValueError(f"Unsupported platform: {args.llvm_triple}")

    logging.getLogger(wheel.__name__).setLevel(logging.WARNING)
    output_dir = Path(args.output_dir)
    # the wheel cannot be created inside a directory that does not exist yet
    output_dir.mkdir(parents=True, exist_ok=True)
    write_pagefind_bin_only_wheel(
        output_dir=output_dir,
        executable=Path(args.bin_path),
        # --suffix defaults to "", so the version is unchanged unless one is given
        version=args.version + args.suffix,
        platform=platform,
    )


# Script entry point; main() sets up logging itself.
if __name__ == "__main__":
    main()
Loading

0 comments on commit adf5168

Please sign in to comment.