-
-
Notifications
You must be signed in to change notification settings - Fork 14.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
python-doctr[torch]: init at 0.7.0 #268285
Open
DGollings
wants to merge
1
commit into
NixOS:master
Choose a base branch
from
DGollings:init-python-doctr
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
173 changes: 173 additions & 0 deletions
173
pkgs/development/python-modules/python-doctr/default.nix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
{ lib, buildPythonPackage, fetchPypi, fetchFromGitHub, fetchurl, fetchgit, git, glibc, python3Packages }: | ||
|
||
let | ||
# mplcursors 0.3 built from the PyPI source distribution. | ||
mplcursors = buildPythonPackage rec { | ||
pname = "mplcursors"; | ||
version = "0.3"; | ||
|
||
src = fetchPypi { | ||
inherit pname version; | ||
sha256 = "sha256-DjLBxhP4g6Q21TrWWMA3er2ErAI12UDBNeEclKG679A"; | ||
}; | ||
|
||
# Build-time dependencies | ||
nativeBuildInputs = [ python3Packages.setuptools_scm ]; | ||
|
||
# Run-time dependencies; importlib-metadata is only added for Python older than 3.8. | ||
# NOTE(review): pytest and weasyprint look like test-only inputs rather than run-time ones. | ||
# They are kept here because dependants of this package may rely on them being propagated - confirm before moving them. | ||
propagatedBuildInputs = [ | ||
python3Packages.matplotlib | ||
python3Packages.pytest | ||
python3Packages.weasyprint | ||
] ++ lib.optional (python3Packages.pythonOlder "3.8") python3Packages.importlib-metadata; | ||
|
||
meta = with lib; { | ||
description = "Interactive cursors for Matplotlib"; | ||
# Upstream repository instead of the example.com placeholder. | ||
homepage = "https://github.com/anntzer/mplcursors"; | ||
license = licenses.mit; | ||
}; | ||
}; | ||
|
||
# ctypesgen built from the pypdfium2-team fork; it is used below as a build tool for pypdfium2. | ||
ctypesgen = buildPythonPackage rec { | ||
pname = "ctypesgen"; | ||
# NOTE(review): the version is the fork's branch name, not a release number. | ||
version = "pypdfium2"; | ||
|
||
src = fetchFromGitHub { | ||
owner = "pypdfium2-team"; | ||
repo = "ctypesgen"; | ||
# NOTE(review): "pypdfium2" appears to be a branch, so the fixed hash below will stop matching once the branch moves - pin a commit. | ||
rev = "pypdfium2"; | ||
sha256 = "sha256-klc6mouJ8w/xIgx8xmDXrui5Ebyicg++KIgr+b5ozbk="; | ||
}; | ||
|
||
# Specify native build inputs | ||
nativeBuildInputs = with python3Packages; [ | ||
setuptools | ||
wheel | ||
setuptools_scm | ||
tomli | ||
]; | ||
|
||
buildInputs = [ glibc ]; | ||
|
||
# Custom patching steps: setuptools_scm has no git metadata to read in the fetched source, | ||
# so a fixed version is exported for it, and the dist directory is created up front. | ||
postPatch = '' | ||
export SETUPTOOLS_SCM_PRETEND_VERSION=1.0.0 # fake version | ||
mkdir -p dist | ||
''; | ||
|
||
# The test suite is not run for this package. | ||
doCheck = false; | ||
|
||
# Run-time dependencies | ||
propagatedBuildInputs = [ python3Packages.wheel python3Packages.toml ]; | ||
|
||
meta = with lib; { | ||
description = "Python bindings generator for C libraries"; | ||
homepage = "https://github.com/pypdfium2-team/ctypesgen"; | ||
# ctypesgen is distributed under the 2-clause BSD license, not MIT. | ||
license = licenses.bsd2; | ||
}; | ||
}; | ||
|
||
# pypdfium2 4.24.0 built from PyPI, with the PDFium headers and the prebuilt linux-x64 binary | ||
# supplied as fixed-output fetches and copied into the source tree in postPatch. | ||
pypdfium2 = buildPythonPackage rec { | ||
pname = "pypdfium2"; | ||
version = "4.24.0"; | ||
|
||
src = fetchPypi { | ||
inherit pname version; | ||
sha256 = "sha256-YnBsBrxb45qnolMa+AJCBCm2xMR0mO69JSGvfpiNCEg="; | ||
}; | ||
|
||
# PDFium public headers at a fixed upstream commit | ||
headers = fetchurl { | ||
url = "https://pdfium.googlesource.com/pdfium/+archive/7233e99fcaeb18adbf048be2df0b1cca355abc70/public.tar.gz"; | ||
sha256 = "sha256-920OK/8UXrwwlf+FBrIKdTl3Q35W1li/BEpGknbtRlU="; | ||
}; | ||
|
||
# Prebuilt PDFium for linux-x64 only (release chromium/6124) | ||
binaries = fetchurl { | ||
url = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F6124/pdfium-linux-x64.tgz"; | ||
sha256 = "sha256-nFIwGgpwFV31rgu6ZFZtrcAAEltBNPgoVy5hR7evbA8="; | ||
}; | ||
|
||
# The patch file shipped next to this expression is named pypdfium2-get-binaries.patch; | ||
# the previous spelling (pypdfdfium2) pointed at a path that does not exist. | ||
patches = [ ./pypdfium2-get-binaries.patch ]; | ||
|
||
# Place headers and binary downloads in the expected locations | ||
postPatch = '' | ||
mkdir -p data/bindings/headers | ||
tar -xzf ${headers} -C data/bindings/headers | ||
mkdir -p data/linux_x64 | ||
cp ${binaries} data/linux_x64/pdfium-linux-x64.tgz | ||
cp ${binaries} pdfium-linux-x64.tgz | ||
''; | ||
|
||
# Fetching pdfium binaries | ||
# NOTE(review): this checkout is not referenced by any phase visible here - confirm it is needed. | ||
pdfium-binaries = fetchgit { | ||
url = "https://github.com/bblanchon/pdfium-binaries.git"; | ||
rev = "chromium/6124"; | ||
sha256 = "sha256-2GfuqI95RLLhSC13Qc97wK/XrAqPxnDNfiFD2hNK4+A="; | ||
}; | ||
|
||
# Native build inputs | ||
nativeBuildInputs = [ git ctypesgen ]; | ||
|
||
meta = with lib; { | ||
description = "Python bindings for the PDFium library"; | ||
# Upstream repository instead of the example.com placeholder. | ||
homepage = "https://github.com/pypdfium2-team/pypdfium2"; | ||
# NOTE(review): upstream appears to be Apache-2.0 or BSD-3-Clause rather than MIT - verify. | ||
license = licenses.mit; | ||
}; | ||
}; | ||
|
||
# docTR (Document Text Recognition) 0.7.0 from PyPI, without a deep-learning backend. | ||
python-doctr = buildPythonPackage rec { | ||
pname = "python-doctr"; | ||
version = "0.7.0"; | ||
|
||
src = fetchPypi { | ||
inherit pname version; | ||
sha256 = "sha256-4F7yC8WPxiyA0vOWjtOADLFXf8k1OkZTw6eyw+D2SFU="; | ||
}; | ||
|
||
# Build-time dependencies | ||
nativeBuildInputs = [ python3Packages.pip ]; | ||
|
||
# Run-time dependencies | ||
propagatedBuildInputs = [ | ||
python3Packages.opencv4 | ||
python3Packages.setuptools | ||
python3Packages.huggingface-hub | ||
python3Packages.unidecode | ||
python3Packages.rapidfuzz | ||
python3Packages.langdetect | ||
python3Packages.shapely | ||
python3Packages.pyclipper | ||
python3Packages.scipy | ||
python3Packages.h5py | ||
mplcursors | ||
pypdfium2 | ||
]; | ||
|
||
# The test suite is not run for this package. | ||
doCheck = false; | ||
|
||
meta = with lib; { | ||
# docTR is an OCR library, not a documentation tool. | ||
description = "Library for OCR-related tasks powered by deep learning"; | ||
homepage = "https://github.com/mindee/doctr"; | ||
# Upstream is released under Apache-2.0. | ||
license = licenses.asl20; | ||
}; | ||
}; | ||
|
||
# PyTorch flavour of python-doctr: the same package with torch and torchvision appended to its run-time dependencies | ||
python-doctr-pytorch = python3Packages.toPythonModule (python-doctr.overridePythonAttrs (prev: { | ||
propagatedBuildInputs = prev.propagatedBuildInputs ++ (with python3Packages; [ | ||
torch | ||
torchvision | ||
]); | ||
})); | ||
|
||
in | ||
# Result: an attribute set exposing both variants, with plain python-doctr as the x86_64-linux default. | ||
{ | ||
packages = { | ||
inherit python-doctr python-doctr-pytorch; | ||
}; | ||
|
||
defaultPackage = { | ||
x86_64-linux = python-doctr; | ||
}; | ||
} |
106 changes: 106 additions & 0 deletions
106
pkgs/development/python-modules/python-doctr/pypdfium2-get-binaries.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py | ||
index 8fa4a70..1df4ec6 100644 | ||
--- a/setupsrc/pypdfium2_setup/packaging_base.py | ||
+++ b/setupsrc/pypdfium2_setup/packaging_base.py | ||
@@ -115,6 +115,8 @@ LibnameForSystem = { | ||
BinaryPlatforms = list(ReleaseNames.keys()) | ||
BinarySystems = list(LibnameForSystem.keys()) | ||
|
||
+NixHardcodedVersion = 6124 | ||
+ | ||
|
||
class PdfiumVer: | ||
|
||
@@ -124,40 +126,20 @@ class PdfiumVer: | ||
@staticmethod | ||
@functools.lru_cache(maxsize=1) | ||
def get_latest(): | ||
- git_ls = run_cmd(["git", "ls-remote", f"{ReleaseRepo}.git"], cwd=None, capture=True) | ||
- tag = git_ls.split("\t")[-1] | ||
- return int( tag.split("/")[-1] ) | ||
+ return NixHardcodedVersion | ||
|
||
@classmethod | ||
def to_full(cls, v_short): | ||
- | ||
- # FIXME The ls-remote call is fairly expensive. While cached in memory for a process lifetime, it can cause a significant slowdown for consecutive process runs. | ||
- # There may be multiple ways to improve this, like adding some disk cache to ensure it would only be called once for a whole session, or maybe adding a second strategy that would parse the pdfium-binaries VERSION file, and use the chromium refs only for sourcebuild. | ||
- | ||
- v_short = int(v_short) | ||
- rc = cls._refs_cache | ||
- | ||
- if rc["lines"] is None: | ||
- print(f"Fetching chromium refs ...", file=sys.stderr) | ||
- ChromiumURL = "https://chromium.googlesource.com/chromium/src" | ||
- rc["lines"] = run_cmd(["git", "ls-remote", "--sort", "-version:refname", "--tags", ChromiumURL, '*.*.*.0'], cwd=None, capture=True).split("\n") | ||
- | ||
- if rc["cursor"] is None or rc["cursor"] > v_short: | ||
- for i, line in enumerate(rc["lines"]): | ||
- ref = line.split("\t")[-1].rsplit("/", maxsplit=1)[-1] | ||
- full_ver = cls.scheme(*[int(v) for v in ref.split(".")]) | ||
- rc["dict"][full_ver.build] = full_ver | ||
- if full_ver.build == v_short: | ||
- rc["cursor"] = full_ver.build | ||
- rc["lines"] = rc["lines"][i+1:] | ||
- break | ||
- | ||
- full_ver = rc["dict"][v_short] | ||
- print(f"Resolved {v_short} -> {full_ver}", file=sys.stderr) | ||
- | ||
+ # The full version is the Chromium tag whose build component equals NixHardcodedVersion; it can be found using | ||
+ # git ls-remote --sort -version:refname --tags https://chromium.googlesource.com/chromium/src '*.*.*.0' | awk -F '/' '{print $NF}' | grep $NixHardcodedVersion | ||
+ # Tags read major.minor.build.patch, so the build (third) component should match NixHardcodedVersion. | ||
+ # The removed upstream code filled cls.scheme positionally from such a tag and looked entries up by .build. | ||
+ # Reuse cls.scheme so callers get the same tuple type as before. | ||
+ | ||
+ # Hardcoded full version for the pinned build, i.e. 121.0.<NixHardcodedVersion>.0 | ||
+ full_ver = cls.scheme(121, 0, NixHardcodedVersion, 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. FWIW |
||
return full_ver | ||
|
||
- | ||
def read_json(fp): | ||
with open(fp, "r") as buf: | ||
return json.load(buf) | ||
diff --git a/setupsrc/pypdfium2_setup/update_pdfium.py b/setupsrc/pypdfium2_setup/update_pdfium.py | ||
index f4d1b3f..2fdb3de 100755 | ||
--- a/setupsrc/pypdfium2_setup/update_pdfium.py | ||
+++ b/setupsrc/pypdfium2_setup/update_pdfium.py | ||
@@ -36,17 +36,8 @@ def _get_package(pl_name, version, robust, use_v8): | ||
fn = prefix + f"{ReleaseNames[pl_name]}.tgz" | ||
fu = f"{ReleaseURL}{version}/{fn}" | ||
fp = pl_dir / fn | ||
- print(f"'{fu}' -> '{fp}'") | ||
- | ||
- try: | ||
- url_request.urlretrieve(fu, fp) | ||
- except Exception: | ||
- if robust: | ||
- traceback.print_exc() | ||
- return None, None | ||
- else: | ||
- raise | ||
|
||
+ print("OVERRIDE - using nix supplied package instead of downloading") | ||
return pl_name, fp | ||
|
||
|
||
@@ -69,7 +60,8 @@ def extract(archives, version, flags): | ||
|
||
for pl_name, arc_path in archives.items(): | ||
|
||
- with tarfile.open(arc_path) as tar: | ||
+ arc_path_override = os.path.basename(arc_path) | ||
+ with tarfile.open(arc_path_override) as tar: | ||
pl_dir = DataDir/pl_name | ||
system = plat_to_system(pl_name) | ||
libname = LibnameForSystem[system] | ||
@@ -77,9 +69,6 @@ def extract(archives, version, flags): | ||
tar_extract_file(tar, f"{tar_libdir}/{libname}", pl_dir/libname) | ||
write_pdfium_info(pl_dir, version, origin="pdfium-binaries", flags=flags) | ||
|
||
- arc_path.unlink() | ||
- | ||
- | ||
BinaryPlatforms = list(ReleaseNames.keys()) | ||
|
||
def main(platforms, version=None, robust=False, max_workers=None, use_v8=False): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually an interesting approach, just filling the data cache.
If we add caches for the tarball and the refs, that might be a viable alternative to entirely external data files management. Then offline callers could just hook in at any abstraction stage they like and reuse as much of our code as possible.
(I overlooked that possibility during discussion with @nh2, though shortly afterwards it came to my mind that programmatically we don't truly need the `prepared!` target and could go through the cache instead.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wait, though. Caches need version info to be correct, so it would probably add more complexity than it's worth and you might as well use external data files directly.
Also, this made me realize an issue (regression) in the cache logic - existing headers can't be used without a version check, otherwise this would fail to update. I'll fix this shortly, but the fix will break the assumption made by this code.