diff --git a/pkgs/development/python-modules/python-doctr/default.nix b/pkgs/development/python-modules/python-doctr/default.nix
new file mode 100644
index 0000000000000..6e034b404ec7c
--- /dev/null
+++ b/pkgs/development/python-modules/python-doctr/default.nix
@@ -0,0 +1,176 @@
+{ lib, buildPythonPackage, fetchPypi, fetchFromGitHub, fetchurl, fetchgit, git, glibc, python3Packages }:
+
+let
+  mplcursors = buildPythonPackage rec {
+    pname = "mplcursors";
+    version = "0.3";
+
+    src = fetchPypi {
+      inherit pname version;
+      sha256 = "sha256-DjLBxhP4g6Q21TrWWMA3er2ErAI12UDBNeEclKG679A";
+    };
+
+    # Build-time dependencies
+    nativeBuildInputs = [ python3Packages.setuptools_scm ];
+
+    # Run-time dependencies (NOTE(review): pytest and weasyprint look like test/doc tooling -- confirm)
+    propagatedBuildInputs = [
+      python3Packages.matplotlib
+      python3Packages.pytest
+      python3Packages.weasyprint
+    ] ++ lib.optional (python3Packages.pythonOlder "3.8") python3Packages.importlib-metadata;
+
+    meta = with lib; {
+      description = "Interactive cursors for Matplotlib";
+      homepage = "https://example.com/mplcursors";
+      license = licenses.mit;
+    };
+  };
+
+  ctypesgen = buildPythonPackage rec {
+    pname = "ctypesgen";
+    version = "pypdfium2";
+
+    src = fetchFromGitHub {
+      owner = "pypdfium2-team";
+      repo = "ctypesgen";
+      rev = "pypdfium2";
+      sha256 = "sha256-klc6mouJ8w/xIgx8xmDXrui5Ebyicg++KIgr+b5ozbk=";
+    };
+
+    # Specify native build inputs
+    nativeBuildInputs = with python3Packages; [
+      setuptools
+      wheel
+      setuptools_scm
+      tomli
+    ];
+
+    buildInputs = [ glibc ];
+
+    # Custom patching steps
+    postPatch = ''
+      export SETUPTOOLS_SCM_PRETEND_VERSION=1.0.0 # fake version
+      mkdir -p dist
+    '';
+
+    # Disable checks if necessary
+    doCheck = false;
+
+    # Run-time dependencies
+    propagatedBuildInputs = [ python3Packages.wheel python3Packages.toml ];
+
+    meta = with lib; {
+      description = "Python bindings generator for C libraries";
+      homepage = "https://github.com/pypdfium2-team/ctypesgen";
+      license = licenses.mit;
+    };
+  };
+
+  pypdfium2 = buildPythonPackage rec {
+    pname = "pypdfium2";
+    version = "4.24.0";
+
+    src = fetchPypi {
+      inherit pname version;
+      sha256 = "sha256-YnBsBrxb45qnolMa+AJCBCm2xMR0mO69JSGvfpiNCEg=";
+    };
+
+    # Additional source and binary fetching
+    headers = fetchurl {
+      url = "https://pdfium.googlesource.com/pdfium/+archive/7233e99fcaeb18adbf048be2df0b1cca355abc70/public.tar.gz";
+      sha256 = "sha256-920OK/8UXrwwlf+FBrIKdTl3Q35W1li/BEpGknbtRlU=";
+    };
+
+    binaries = fetchurl {
+      url = "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F6124/pdfium-linux-x64.tgz";
+      sha256 = "sha256-nFIwGgpwFV31rgu6ZFZtrcAAEltBNPgoVy5hR7evbA8=";
+    };
+
+    # Stop the setup code from calling git ls-remote / urlretrieve; it uses the files staged in postPatch instead
+    patches = [ ./pypdfium2-get-binaries.patch ];
+
+    # Place headers and binary downloads in the expected locations
+    postPatch = ''
+      mkdir -p data/bindings/headers
+      tar -xzf ${headers} -C data/bindings/headers
+      mkdir -p data/linux_x64
+      cp ${binaries} data/linux_x64/pdfium-linux-x64.tgz
+      cp ${binaries} pdfium-linux-x64.tgz
+    '';
+
+    # Fetching pdfium binaries (NOTE(review): not referenced anywhere else in this file -- confirm it is needed)
+    pdfium-binaries = fetchgit {
+      url = "https://github.com/bblanchon/pdfium-binaries.git";
+      rev = "chromium/6124";
+      sha256 = "sha256-2GfuqI95RLLhSC13Qc97wK/XrAqPxnDNfiFD2hNK4+A=";
+    };
+
+    # Native build inputs
+    nativeBuildInputs = [ git ctypesgen ];
+
+    meta = with lib; {
+      description = "Python bindings for the PDFium library";
+      homepage = "https://example.com/pypdfium2";
+      license = licenses.mit;
+    };
+  };
+
+  python-doctr = buildPythonPackage rec {
+    pname = "python-doctr";
+    version = "0.7.0";
+
+    src = fetchPypi {
+      inherit pname version;
+      sha256 = "sha256-4F7yC8WPxiyA0vOWjtOADLFXf8k1OkZTw6eyw+D2SFU=";
+    };
+
+    # Build-time dependencies
+    nativeBuildInputs = [ python3Packages.pip ];
+
+    # Run-time dependencies
+    propagatedBuildInputs = [
+      python3Packages.opencv4
+      python3Packages.setuptools
+      python3Packages.huggingface-hub
+      python3Packages.unidecode
+      python3Packages.rapidfuzz
+      python3Packages.langdetect
+      python3Packages.shapely
+      python3Packages.pyclipper
+      python3Packages.scipy
+      python3Packages.h5py
+      mplcursors
+      pypdfium2
+    ];
+
+    # Disable checks if necessary
+    doCheck = false;
+
+    meta = with lib; {
+      description = "A powerful tool for Python documentation";
+      homepage = "https://example.com/python-doctr";
+      license = licenses.mit;
+    };
+  };
+
+  # Override for python-doctr with additional dependencies for pyTorch
+  python-doctr-pytorch = python3Packages.toPythonModule (python-doctr.overridePythonAttrs (oldAttrs: {
+    propagatedBuildInputs = oldAttrs.propagatedBuildInputs ++ [
+      python3Packages.torch
+      python3Packages.torchvision
+    ];
+  }));
+
+in
+# NOTE(review): this evaluates to a flake-style attribute set rather than a
+# single derivation, while python-packages.nix wires this file up through
+# callPackage -- confirm this shape is intended.
+{
+  packages = {
+    python-doctr = python-doctr;
+    python-doctr-pytorch = python-doctr-pytorch;
+  };
+
+  defaultPackage.x86_64-linux = python-doctr;
+}
diff --git a/pkgs/development/python-modules/python-doctr/pypdfium2-get-binaries.patch b/pkgs/development/python-modules/python-doctr/pypdfium2-get-binaries.patch
new file mode 100644
index 0000000000000..e03a207f63c5d
--- /dev/null
+++ b/pkgs/development/python-modules/python-doctr/pypdfium2-get-binaries.patch
@@ -0,0 +1,106 @@
+diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
+index 8fa4a70..1df4ec6 100644
+--- a/setupsrc/pypdfium2_setup/packaging_base.py
++++ b/setupsrc/pypdfium2_setup/packaging_base.py
+@@ -115,6 +115,8 @@ LibnameForSystem = {
+ BinaryPlatforms = list(ReleaseNames.keys())
+ BinarySystems = list(LibnameForSystem.keys())
+ 
++NixHardcodedVersion = 6124
++
+ 
+ class PdfiumVer:
+ 
+@@ -124,40 +126,20 @@ class PdfiumVer:
+     @staticmethod
+     @functools.lru_cache(maxsize=1)
+     def get_latest():
+-        git_ls = run_cmd(["git", "ls-remote", f"{ReleaseRepo}.git"], cwd=None, capture=True)
+-        tag = git_ls.split("\t")[-1]
+-        return int( tag.split("/")[-1] )
++        return NixHardcodedVersion
+ 
+     @classmethod
+     def to_full(cls, v_short):
+-
+-        # FIXME The ls-remote call is fairly expensive. While cached in memory for a process lifetime, it can cause a significant slowdown for consecutive process runs.
+-        # There may be multiple ways to improve this, like adding some disk cache to ensure it would only be called once for a whole session, or maybe adding a second strategy that would parse the pdfium-binaries VERSION file, and use the chromium refs only for sourcebuild.
+-
+-        v_short = int(v_short)
+-        rc = cls._refs_cache
+-
+-        if rc["lines"] is None:
+-            print(f"Fetching chromium refs ...", file=sys.stderr)
+-            ChromiumURL = "https://chromium.googlesource.com/chromium/src"
+-            rc["lines"] = run_cmd(["git", "ls-remote", "--sort", "-version:refname", "--tags", ChromiumURL, '*.*.*.0'], cwd=None, capture=True).split("\n")
+-
+-        if rc["cursor"] is None or rc["cursor"] > v_short:
+-            for i, line in enumerate(rc["lines"]):
+-                ref = line.split("\t")[-1].rsplit("/", maxsplit=1)[-1]
+-                full_ver = cls.scheme(*[int(v) for v in ref.split(".")])
+-                rc["dict"][full_ver.build] = full_ver
+-                if full_ver.build == v_short:
+-                    rc["cursor"] = full_ver.build
+-                    rc["lines"] = rc["lines"][i+1:]
+-                    break
+-
+-        full_ver = rc["dict"][v_short]
+-        print(f"Resolved {v_short} -> {full_ver}", file=sys.stderr)
+-
++        # The full version is pinned here instead of being resolved over the network.
++        # It can be found using
++        # git ls-remote --sort -version:refname --tags https://chromium.googlesource.com/chromium/src '*.*.*.0' | awk -F '/' '{print $NF}' | grep $NixHardcodedVersion
++        # where the build (third) component should match NixHardcodedVersion,
++        # e.g. 121.0.6124.0 for 6124.
++
++        # Use the class's own scheme, fed in tag order as the replaced code did for each ref
++        full_ver = cls.scheme(121, 0, NixHardcodedVersion, 0)
+         return full_ver
+ 
+-
+ def read_json(fp):
+     with open(fp, "r") as buf:
+         return json.load(buf)
+diff --git a/setupsrc/pypdfium2_setup/update_pdfium.py b/setupsrc/pypdfium2_setup/update_pdfium.py
+index f4d1b3f..2fdb3de 100755
+--- a/setupsrc/pypdfium2_setup/update_pdfium.py
++++ b/setupsrc/pypdfium2_setup/update_pdfium.py
+@@ -36,17 +36,8 @@ def _get_package(pl_name, version, robust, use_v8):
+     fn = prefix + f"{ReleaseNames[pl_name]}.tgz"
+     fu = f"{ReleaseURL}{version}/{fn}"
+     fp = pl_dir / fn
+-    print(f"'{fu}' -> '{fp}'")
+-
+-    try:
+-        url_request.urlretrieve(fu, fp)
+-    except Exception:
+-        if robust:
+-            traceback.print_exc()
+-            return None, None
+-        else:
+-            raise
+ 
++    print("OVERRIDE - using nix supplied package instead of downloading")
+     return pl_name, fp
+ 
+ 
+@@ -69,7 +60,8 @@ def extract(archives, version, flags):
+ 
+     for pl_name, arc_path in archives.items():
+ 
+-        with tarfile.open(arc_path) as tar:
++        arc_path_override = os.path.basename(arc_path)
++        with tarfile.open(arc_path_override) as tar:
+             pl_dir = DataDir/pl_name
+             system = plat_to_system(pl_name)
+             libname = LibnameForSystem[system]
+@@ -77,9 +69,6 @@ def extract(archives, version, flags):
+             tar_extract_file(tar, f"{tar_libdir}/{libname}", pl_dir/libname)
+             write_pdfium_info(pl_dir, version, origin="pdfium-binaries", flags=flags)
+ 
+-        arc_path.unlink()
+-
+-
+ BinaryPlatforms = list(ReleaseNames.keys())
+ 
+ def main(platforms, version=None, robust=False, max_workers=None, use_v8=False):
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index b171e8e9ba753..88e71ba17d979 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -10171,6 +10171,8 @@ self: super: with self; {
 
   python-didl-lite = callPackage ../development/python-modules/python-didl-lite { };
 
+  python-doctr = callPackage ../development/python-modules/python-doctr { };
+
   python-docx = callPackage ../development/python-modules/python-docx { };
 
   python-doi = callPackage ../development/python-modules/python-doi { };