Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

python-doctr[torch]: init at 0.7.0 #268285

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions pkgs/development/python-modules/python-doctr/default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# Package expression for python-doctr. It locally defines mplcursors, ctypesgen,
# pypdfium2 and python-doctr, plus a variant of python-doctr with torch and
# torchvision added.
#
# NOTE(review): every Python dependency below is taken from `python3Packages`,
# while `buildPythonPackage` is injected separately. Presumably the two can refer
# to different interpreters when this file is instantiated from a non-default
# Python package set -- confirm, and prefer taking the individual packages as
# arguments instead.
{ lib, buildPythonPackage, fetchPypi, fetchFromGitHub, fetchurl, fetchgit, git, glibc, python3Packages }:

let
# mplcursors: interactive cursors for Matplotlib. Built locally in this file and
# consumed by python-doctr further down.
mplcursors = buildPythonPackage rec {
  pname = "mplcursors";
  version = "0.3";

  src = fetchPypi {
    inherit pname version;
    sha256 = "sha256-DjLBxhP4g6Q21TrWWMA3er2ErAI12UDBNeEclKG679A";
  };

  # Build-time only. `setuptools-scm` is the canonical attribute name;
  # `setuptools_scm` is a compatibility alias that package definitions inside
  # nixpkgs must not rely on.
  nativeBuildInputs = [ python3Packages.setuptools-scm ];

  # Run-time dependencies, propagated to everything that depends on mplcursors.
  propagatedBuildInputs = [
    python3Packages.matplotlib
    # NOTE(review): weasyprint does not look like a run-time requirement of
    # mplcursors itself. It stays propagated because other packages in this
    # file may be relying on receiving it through this list -- confirm before
    # moving it.
    python3Packages.weasyprint
  ] ++ lib.optional (python3Packages.pythonOlder "3.8") python3Packages.importlib-metadata;

  # pytest is a test runner, not something importers of mplcursors need, so it
  # is only made available to the check phase instead of being propagated.
  nativeCheckInputs = [ python3Packages.pytest ];

  meta = with lib; {
    description = "Interactive cursors for Matplotlib";
    homepage = "https://github.com/anntzer/mplcursors";
    # NOTE(review): confirm against the upstream LICENSE file.
    license = licenses.mit;
  };
};

# ctypesgen (pypdfium2-team fork): generator of Python ctypes bindings for C
# libraries. Built locally and used as a build-time tool by pypdfium2 in this file.
ctypesgen = buildPythonPackage rec {
  pname = "ctypesgen";
  # NOTE(review): `version` and `rev` are both the branch name "pypdfium2", not
  # a release or a commit. A branch can move, after which the fixed hash below
  # no longer matches -- pin `rev` to a commit and use a real version string.
  version = "pypdfium2";

  src = fetchFromGitHub {
    owner = "pypdfium2-team";
    repo = "ctypesgen";
    rev = "pypdfium2";
    sha256 = "sha256-klc6mouJ8w/xIgx8xmDXrui5Ebyicg++KIgr+b5ozbk=";
  };

  # Build-time tooling. `setuptools-scm` is the canonical attribute name; the
  # underscore spelling is only a compatibility alias.
  nativeBuildInputs = with python3Packages; [
    setuptools
    wheel
    setuptools-scm
    tomli
  ];

  buildInputs = [ glibc ];

  # The fetched source carries no git metadata for setuptools-scm to derive a
  # version from, so a placeholder is supplied. Setting it as a derivation
  # attribute exports it to every build phase, instead of depending on a shell
  # `export` issued from inside postPatch.
  SETUPTOOLS_SCM_PRETEND_VERSION = "1.0.0"; # fake version

  postPatch = ''
    mkdir -p dist
  '';

  # The check phase is skipped.
  doCheck = false;

  # Run-time dependencies
  propagatedBuildInputs = [ python3Packages.wheel python3Packages.toml ];

  meta = with lib; {
    description = "Python bindings generator for C libraries";
    homepage = "https://github.com/pypdfium2-team/ctypesgen";
    # Upstream describes itself as distributed under the 2-clause BSD license.
    license = licenses.bsd2;
  };
};

# pypdfium2: Python bindings for the PDFium library, built from the PyPI source
# distribution. The PDFium headers and a prebuilt PDFium archive are fetched by
# Nix and copied into the source tree by postPatch, and a local patch is applied.
pypdfium2 = buildPythonPackage rec {
pname = "pypdfium2";
version = "4.24.0";

src = fetchPypi {
inherit pname version;
sha256 = "sha256-YnBsBrxb45qnolMa+AJCBCm2xMR0mO69JSGvfpiNCEg=";
};

# PDFium public headers as a tarball, pinned to a single upstream commit.
# Unpacked into data/bindings/headers by postPatch.
headers = fetchurl {
url = "https://pdfium.googlesource.com/pdfium/+archive/7233e99fcaeb18adbf048be2df0b1cca355abc70/public.tar.gz";
sha256 = "sha256-920OK/8UXrwwlf+FBrIKdTl3Q35W1li/BEpGknbtRlU=";
};

# Prebuilt PDFium archive from the pdfium-binaries release chromium/6124.
# NOTE(review): only the linux-x64 archive is fetched, so this presumably works
# on x86_64-linux only -- confirm and restrict meta.platforms accordingly.
binaries = fetchurl {
url = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F6124/pdfium-linux-x64.tgz";
sha256 = "sha256-nFIwGgpwFV31rgu6ZFZtrcAAEltBNPgoVy5hR7evbA8=";
};

# Local patch applied to the unpacked source.
# NOTE(review): the file name is spelled "pypdfdfium2" (extra "df") -- confirm it
# matches the name of the patch file committed next to this expression.
patches = [ ./pypdfdfium2-get-binaries.patch ];

# Unpacks the headers into data/bindings/headers and copies the PDFium archive
# to two places: data/linux_x64/ and the top of the source tree.
# NOTE(review): in this listing, the text between the last `cp` line and the
# closing '' is review discussion captured from the pull-request page, not shell
# code; it appears here only because of how the page was copied.
postPatch = ''
mkdir -p data/bindings/headers
tar -xzf ${headers} -C data/bindings/headers
mkdir -p data/linux_x64
cp ${binaries} data/linux_x64/pdfium-linux-x64.tgz
cp ${binaries} pdfium-linux-x64.tgz
Copy link

@mara004 mara004 Nov 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually an interesting approach, just filling the data cache.
If we add caches for the tarball and the refs, that might be a viable alternative to entirely external data files management. Then offline callers could just hook in at any abstraction stage they like and reuse as much of our code as possible.

(I overlooked that possibility during discussion with @nh2, though shortly afterwards it came to my mind that programatically we don't truly need the prepared! target and could go through the cache instead.)

Copy link

@mara004 mara004 Nov 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, though. Caches need version info to be correct, so it would probably add more complexity than it's worth and you might as well use external data files directly.

Also, this made me realize an issue (regression) in the cache logic - existing headers can't be used without a version check, otherwise this would fail to update. I'll fix this shortly, but the fix will break the assumption made by this code.

'';

# Git checkout of the pdfium-binaries repository at chromium/6124.
# NOTE(review): this attribute is not referenced anywhere else in this
# expression; it looks unused and could probably be dropped -- confirm.
pdfium-binaries = fetchgit {
url = "https://github.com/bblanchon/pdfium-binaries.git";
rev = "chromium/6124";
sha256 = "sha256-2GfuqI95RLLhSC13Qc97wK/XrAqPxnDNfiFD2hNK4+A=";
};

# git and the locally defined ctypesgen are made available at build time
# (presumably the build invokes ctypesgen to generate the bindings -- confirm).
nativeBuildInputs = [ git ctypesgen ];

# NOTE(review): the homepage is an example.com placeholder, and the license
# should be confirmed against upstream (the bundled PDFium binary has its own
# license as well).
meta = with lib; {
description = "Python bindings for the PDFium library";
homepage = "https://example.com/pypdfium2";
license = licenses.mit;
};
};

# python-doctr: docTR (Document Text Recognition), an OCR library. This is the
# framework-agnostic base; the PyTorch flavour in this file is derived from it.
python-doctr = buildPythonPackage rec {
  pname = "python-doctr";
  version = "0.7.0";

  src = fetchPypi {
    inherit pname version;
    sha256 = "sha256-4F7yC8WPxiyA0vOWjtOADLFXf8k1OkZTw6eyw+D2SFU=";
  };

  # Build-time dependencies
  nativeBuildInputs = [ python3Packages.pip ];

  # Run-time dependencies.
  propagatedBuildInputs = [
    python3Packages.opencv4
    python3Packages.setuptools
    python3Packages.huggingface-hub
    python3Packages.unidecode
    python3Packages.rapidfuzz
    python3Packages.langdetect
    python3Packages.shapely
    python3Packages.pyclipper
    python3Packages.scipy
    python3Packages.h5py
    # Listed explicitly rather than left to arrive through other packages'
    # propagated inputs, so that changing those packages cannot silently
    # break this one. NOTE(review): taken from upstream's declared
    # requirements as recalled -- confirm against the 0.7.0 pyproject.toml.
    python3Packages.numpy
    python3Packages.matplotlib
    python3Packages.weasyprint
    python3Packages.pillow
    python3Packages.defusedxml
    python3Packages.tqdm
    mplcursors
    pypdfium2
  ];

  # The check phase is skipped.
  # NOTE(review): the reason is not recorded; presumably the tests need network
  # access to download models -- confirm and say so here.
  doCheck = false;

  meta = with lib; {
    description = "Library for OCR-related tasks (document text recognition) powered by deep learning";
    homepage = "https://github.com/mindee/doctr";
    # Upstream is distributed under the Apache License 2.0.
    license = licenses.asl20;
  };
};

# PyTorch flavour of python-doctr: the same derivation with torch and
# torchvision appended to its propagated run-time dependencies.
python-doctr-pytorch =
  let
    torchDeps = with python3Packages; [ torch torchvision ];
    withTorch = python-doctr.overridePythonAttrs (prev: {
      propagatedBuildInputs = prev.propagatedBuildInputs ++ torchDeps;
    });
  in
  python3Packages.toPythonModule withTorch;

in
# This file is bound with callPackage as a package attribute in
# python-packages.nix, so it has to evaluate to a derivation. A plain attribute
# set of the form { packages; defaultPackage; } is not one, and nothing would be
# built from it. The derivation is therefore returned directly, and the two
# attributes the previous result exposed stay reachable on it via passthru
# (passthru entries do not affect the build itself).
python-doctr.overridePythonAttrs (prev: {
  passthru = (prev.passthru or { }) // {
    packages = {
      inherit python-doctr python-doctr-pytorch;
    };

    defaultPackage.x86_64-linux = python-doctr;
  };
})
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
index 8fa4a70..1df4ec6 100644
--- a/setupsrc/pypdfium2_setup/packaging_base.py
+++ b/setupsrc/pypdfium2_setup/packaging_base.py
@@ -115,6 +115,8 @@ LibnameForSystem = {
BinaryPlatforms = list(ReleaseNames.keys())
BinarySystems = list(LibnameForSystem.keys())

+NixHardcodedVersion = 6124
+

class PdfiumVer:

@@ -124,40 +126,20 @@ class PdfiumVer:
@staticmethod
@functools.lru_cache(maxsize=1)
def get_latest():
- git_ls = run_cmd(["git", "ls-remote", f"{ReleaseRepo}.git"], cwd=None, capture=True)
- tag = git_ls.split("\t")[-1]
- return int( tag.split("/")[-1] )
+ return NixHardcodedVersion

@classmethod
def to_full(cls, v_short):
-
- # FIXME The ls-remote call is fairly expensive. While cached in memory for a process lifetime, it can cause a significant slowdown for consecutive process runs.
- # There may be multiple ways to improve this, like adding some disk cache to ensure it would only be called once for a whole session, or maybe adding a second strategy that would parse the pdfium-binaries VERSION file, and use the chromium refs only for sourcebuild.
-
- v_short = int(v_short)
- rc = cls._refs_cache
-
- if rc["lines"] is None:
- print(f"Fetching chromium refs ...", file=sys.stderr)
- ChromiumURL = "https://chromium.googlesource.com/chromium/src"
- rc["lines"] = run_cmd(["git", "ls-remote", "--sort", "-version:refname", "--tags", ChromiumURL, '*.*.*.0'], cwd=None, capture=True).split("\n")
-
- if rc["cursor"] is None or rc["cursor"] > v_short:
- for i, line in enumerate(rc["lines"]):
- ref = line.split("\t")[-1].rsplit("/", maxsplit=1)[-1]
- full_ver = cls.scheme(*[int(v) for v in ref.split(".")])
- rc["dict"][full_ver.build] = full_ver
- if full_ver.build == v_short:
- rc["cursor"] = full_ver.build
- rc["lines"] = rc["lines"][i+1:]
- break
-
- full_ver = rc["dict"][v_short]
- print(f"Resolved {v_short} -> {full_ver}", file=sys.stderr)
-
+        # The full version is the Chromium tag whose build number is NixHardcodedVersion; find it with
+        # git ls-remote --sort -version:refname --tags https://chromium.googlesource.com/chromium/src '*.*.*.0' | awk -F '/' '{print $NF}' | grep $NixHardcodedVersion
+        # Tags read MAJOR.MINOR.BUILD.PATCH, so it is the build field (not minor) that should match.
+        # cls.scheme is reused instead of declaring a second tuple type: the replaced code built its
+        # result with cls.scheme(*parts) and looked it up by .build, which must therefore be 6124.
+
+        # Pinned stand-in for the removed ls-remote lookup: tag 121.0.<NixHardcodedVersion>.0.
+        full_ver = cls.scheme(121, 0, NixHardcodedVersion, 0)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, cls.scheme() already provides that definition. In general, redefining a type can cause confusion if anything performs type checks on it.

return full_ver

-
def read_json(fp):
with open(fp, "r") as buf:
return json.load(buf)
diff --git a/setupsrc/pypdfium2_setup/update_pdfium.py b/setupsrc/pypdfium2_setup/update_pdfium.py
index f4d1b3f..2fdb3de 100755
--- a/setupsrc/pypdfium2_setup/update_pdfium.py
+++ b/setupsrc/pypdfium2_setup/update_pdfium.py
@@ -36,17 +36,8 @@ def _get_package(pl_name, version, robust, use_v8):
fn = prefix + f"{ReleaseNames[pl_name]}.tgz"
fu = f"{ReleaseURL}{version}/{fn}"
fp = pl_dir / fn
- print(f"'{fu}' -> '{fp}'")
-
- try:
- url_request.urlretrieve(fu, fp)
- except Exception:
- if robust:
- traceback.print_exc()
- return None, None
- else:
- raise

+ print("OVERRIDE - using nix supplied package instead of downloading")
return pl_name, fp


@@ -69,7 +60,8 @@ def extract(archives, version, flags):

for pl_name, arc_path in archives.items():

- with tarfile.open(arc_path) as tar:
+ arc_path_override = os.path.basename(arc_path)
+ with tarfile.open(arc_path_override) as tar:
pl_dir = DataDir/pl_name
system = plat_to_system(pl_name)
libname = LibnameForSystem[system]
@@ -77,9 +69,6 @@ def extract(archives, version, flags):
tar_extract_file(tar, f"{tar_libdir}/{libname}", pl_dir/libname)
write_pdfium_info(pl_dir, version, origin="pdfium-binaries", flags=flags)

- arc_path.unlink()
-
-
BinaryPlatforms = list(ReleaseNames.keys())

def main(platforms, version=None, robust=False, max_workers=None, use_v8=False):
2 changes: 2 additions & 0 deletions pkgs/top-level/python-packages.nix
Original file line number Diff line number Diff line change
Expand Up @@ -10171,6 +10171,8 @@ self: super: with self; {

python-didl-lite = callPackage ../development/python-modules/python-didl-lite { };

python-doctr = callPackage ../development/python-modules/python-doctr { };

python-docx = callPackage ../development/python-modules/python-docx { };

python-doi = callPackage ../development/python-modules/python-doi { };
Expand Down