diff --git a/gimie/io.py b/gimie/io.py
new file mode 100644
index 00000000..53ea468d
--- /dev/null
+++ b/gimie/io.py
@@ -0,0 +1,122 @@
+"""Standard input interfaces to local or remote resources for gimie."""
+
+import io
+import os
+from pathlib import Path
+import requests
+from typing import Iterable, Optional, Union
+
+
+class Resource:
+    """Abstract class for buffered read-only access to local or remote resources via
+    a file-like interface."""
+
+    def open(self) -> io.BufferedReader:
+        """Return a buffered binary stream over the resource's content."""
+        raise NotImplementedError
+
+
+class LocalResource(Resource):
+    """Provides buffered read-only access to local data.
+
+    Parameters
+    ----------
+    path: the path of the local file backing the resource.
+
+    Examples
+    --------
+    >>> from gimie.io import LocalResource
+    >>> resource = LocalResource("README.md")
+    """
+
+    def __init__(self, path: Union[str, os.PathLike]):
+        self.path = Path(path)
+
+    def open(self, mode="r") -> io.BufferedReader:
+        """Open the file as a buffered binary stream.
+
+        The mode is forwarded to io.FileIO. A mode that does not allow reading
+        is rejected before the file is touched: FileIO would otherwise create
+        or truncate the file and only then fail inside BufferedReader.
+        """
+        if "r" not in mode and "+" not in mode:
+            raise io.UnsupportedOperation("File or stream is not readable.")
+        return io.BufferedReader(io.FileIO(self.path, mode))
+
+    @property
+    def name(self) -> str:
+        """The file name, i.e. the last component of the path."""
+        return self.path.name
+
+
+class RemoteResource(Resource):
+    """Provides buffered read-only access to remote data.
+
+    Parameters
+    ----------
+    name: the name of the resource, typically the filename.
+    url: the URL where the resource can be downloaded from.
+    headers: optional headers to pass to the request.
+
+    Examples
+    --------
+    >>> from gimie.io import RemoteResource
+    >>> url = "https://raw.githubusercontent.com/SDSC-ORD/gimie/main/README.md"
+    >>> resource = RemoteResource("README.md", url)
+    """
+
+    def __init__(self, name: str, url: str, headers: Optional[dict] = None):
+        self.name = name
+        self.url = url
+        self.headers = headers or {}
+
+    def open(self) -> io.BufferedReader:
+        """Request the URL and expose the response body as a buffered stream.
+
+        Raises requests.HTTPError when the server answers with an error status.
+        """
+        resp = requests.get(self.url, headers=self.headers, stream=True)
+        # Fail loudly instead of streaming an HTTP error page as file content.
+        resp.raise_for_status()
+        return iterable_to_stream(resp.iter_content(chunk_size=128))
+
+
+def iterable_to_stream(
+    iterable: Iterable[bytes], buffer_size=io.DEFAULT_BUFFER_SIZE
+) -> io.BufferedReader:
+    """
+    Converts an iterable yielding bytestrings to a read-only input stream.
+    Lets you use an iterable (e.g. a generator) that yields bytestrings as a read-only
+    input stream.
+
+    The stream implements Python 3's newer I/O API (available in Python 2's io module).
+    For efficiency, the stream is buffered.
+
+    credits: https://stackoverflow.com/a/20260030/8440675
+    """
+    # next() needs an iterator; iter() lets plain iterables (lists) work too.
+    chunks = iter(iterable)
+
+    class IterStream(io.RawIOBase):
+        def __init__(self):
+            super().__init__()
+            self.leftover = b""
+
+        def readable(self):
+            return True
+
+        def readinto(self, b):
+            size = len(b)  # We're supposed to return at most this much
+            chunk = self.leftover
+            try:
+                # Returning 0 signals EOF, so empty chunks must be skipped
+                # rather than handed to the caller.
+                while not chunk:
+                    chunk = next(chunks)
+            except StopIteration:
+                return 0  # indicate EOF
+            output, self.leftover = chunk[:size], chunk[size:]
+            b[: len(output)] = output
+            return len(output)
+
+    return io.BufferedReader(IterStream(), buffer_size=buffer_size)
diff --git a/gimie/sources/abstract.py b/gimie/sources/abstract.py
index 97722b25..6dacb98b 100644
--- a/gimie/sources/abstract.py
+++ b/gimie/sources/abstract.py
@@ -16,11 +16,12 @@
 # limitations under the License.
"""Abstract classes for gimie objects.""" from abc import ABC, abstractmethod -from typing import Optional +from typing import List, Optional from rdflib import Graph from urllib.parse import urlparse +from gimie.io import Resource class Extractor(ABC): @@ -51,6 +52,10 @@ def to_graph(self) -> Graph: """Generate an RDF graph from the instance""" return Graph() + def list_files(self) -> List[Resource]: + """List all files in the repository HEAD.""" + ... + def serialize(self, format: str = "ttl", **kwargs) -> str: """Serialize the RDF graph representing the instance.""" return self.to_graph().serialize(format=format, **kwargs) # type: ignore diff --git a/gimie/sources/git.py b/gimie/sources/git.py index a2c341aa..51906c19 100644 --- a/gimie/sources/git.py +++ b/gimie/sources/git.py @@ -26,10 +26,10 @@ import pydriller from rdflib import Graph -from gimie.models import Person, PersonSchema +from gimie.io import LocalResource from gimie.graph.namespaces import SDO +from gimie.models import Person, PersonSchema from gimie.sources.abstract import Extractor -from gimie.utils import generate_uri @dataclass @@ -73,6 +73,9 @@ def extract(self): self.date_created = self._get_creation_date() self.date_modified = self._get_modification_date() + def list_files(self) -> List[LocalResource]: + raise NotImplementedError + def to_graph(self) -> Graph: """Generate an RDF graph from the instance""" jd = GitExtractorSchema().dumps(self) diff --git a/gimie/sources/github.py b/gimie/sources/github.py index 4aaa3089..8e9bc2a1 100644 --- a/gimie/sources/github.py +++ b/gimie/sources/github.py @@ -19,6 +19,7 @@ from dataclasses import dataclass from datetime import datetime from dateutil.parser import isoparse +from functools import cached_property import os import requests from typing import Any, Dict, List, Optional, Set, Union @@ -37,6 +38,7 @@ PersonSchema, ) from gimie.graph.namespaces import SDO +from gimie.io import RemoteResource from gimie.sources.common.license import 
get_spdx_url from gimie.sources.common.queries import ( send_rest_query, @@ -121,9 +123,12 @@ def to_graph(self) -> Graph: g.bind("schema", SDO) return g + def list_files(self) -> List[RemoteResource]: + raise NotImplementedError + def extract(self): """Extract metadata from target GitHub repository.""" - data = self._fetch_repo_data() + data = self._repo_data self.author = self._get_author(data["owner"]) self.contributors = self._fetch_contributors() self.description = data["description"] @@ -143,8 +148,9 @@ def extract(self): f"{self.url}/archive/refs/tags/{self.version}.tar.gz" ) - def _fetch_repo_data(self) -> Dict[str, Any]: - """Fetch repository metadata from GraphQL endpoint.""" + @cached_property + def _repo_data(self) -> Dict[str, Any]: + """Repository metadata fetched from GraphQL endpoint.""" owner, name = self.path.split("/") data = {"owner": owner, "name": name} repo_query = """ diff --git a/gimie/sources/gitlab.py b/gimie/sources/gitlab.py index cc6736ad..a7d4fbc5 100644 --- a/gimie/sources/gitlab.py +++ b/gimie/sources/gitlab.py @@ -4,6 +4,7 @@ import requests from datetime import datetime from dateutil.parser import isoparse +from functools import cached_property from typing import Any, Dict, List, Optional, Union from urllib.parse import urlparse @@ -12,14 +13,15 @@ from calamus.schema import JsonLDSchema from rdflib import Graph -from gimie.sources.abstract import Extractor +from gimie.graph.namespaces import SDO +from gimie.io import RemoteResource from gimie.models import ( Organization, OrganizationSchema, Person, PersonSchema, ) -from gimie.graph.namespaces import SDO +from gimie.sources.abstract import Extractor from gimie.sources.common.queries import send_graphql_query, send_rest_query load_dotenv() @@ -62,11 +64,14 @@ def to_graph(self) -> Graph: g.bind("schema", SDO) return g + def list_files(self) -> List[RemoteResource]: + raise NotImplementedError + def extract(self): """Extract metadata from target Gitlab repository.""" # fetch 
metadata - data = self._fetch_repo_data(self.path) + data = self._repo_data # Each Gitlab project has a unique identifier (integer) self.identifier = urlparse(data["id"]).path.split("/")[2] @@ -145,9 +150,10 @@ def _safe_extract_contributors( uniq_contrib = list({c["id"]: c for c in contributors}.values()) return [self._get_user(contrib) for contrib in uniq_contrib] - def _fetch_repo_data(self, path: str) -> Dict[str, Any]: + @cached_property + def _repo_data(self) -> Dict[str, Any]: """Fetch repository metadata from GraphQL endpoint.""" - data = {"path": path} + data = {"path": self.path} project_query = """ query project_query($path: ID!) { project(fullPath: $path) { diff --git a/poetry.lock b/poetry.lock index 6717801e..1a234e92 100644 --- a/poetry.lock +++ b/poetry.lock @@ -486,13 +486,13 @@ lxml = ["lxml"] [[package]] name = "identify" -version = "2.5.28" +version = "2.5.29" description = "File identification library for Python" optional = false python-versions = ">=3.8" files = [ - {file = "identify-2.5.28-py2.py3-none-any.whl", hash = "sha256:87816de144bf46d161bd5b3e8f5596b16cade3b80be537087334b26bc5c177f3"}, - {file = "identify-2.5.28.tar.gz", hash = "sha256:94bb59643083ebd60dc996d043497479ee554381fbc5307763915cda49b0e78f"}, + {file = "identify-2.5.29-py2.py3-none-any.whl", hash = "sha256:24437fbf6f4d3fe6efd0eb9d67e24dd9106db99af5ceb27996a5f7895f24bf1b"}, + {file = "identify-2.5.29.tar.gz", hash = "sha256:d43d52b86b15918c137e3a74fff5224f60385cd0e9c38e99d07c257f02f151a5"}, ] [package.extras] @@ -1328,13 +1328,13 @@ files = [ [[package]] name = "smmap" -version = "5.0.0" +version = "5.0.1" description = "A pure Python implementation of a sliding window memory map manager" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "smmap-5.0.0-py3-none-any.whl", hash = "sha256:2aba19d6a040e78d8b09de5c57e96207b09ed71d8e55ce0959eeee6c8e190d94"}, - {file = "smmap-5.0.0.tar.gz", hash = 
"sha256:c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936"}, + {file = "smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da"}, + {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"}, ] [[package]] @@ -1589,35 +1589,35 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6. [[package]] name = "types-pytz" -version = "2023.3.0.1" +version = "2023.3.1.1" description = "Typing stubs for pytz" optional = false python-versions = "*" files = [ - {file = "types-pytz-2023.3.0.1.tar.gz", hash = "sha256:1a7b8d4aac70981cfa24478a41eadfcd96a087c986d6f150d77e3ceb3c2bdfab"}, - {file = "types_pytz-2023.3.0.1-py3-none-any.whl", hash = "sha256:65152e872137926bb67a8fe6cc9cfd794365df86650c5d5fdc7b167b0f38892e"}, + {file = "types-pytz-2023.3.1.1.tar.gz", hash = "sha256:cc23d0192cd49c8f6bba44ee0c81e4586a8f30204970fc0894d209a6b08dab9a"}, + {file = "types_pytz-2023.3.1.1-py3-none-any.whl", hash = "sha256:1999a123a3dc0e39a2ef6d19f3f8584211de9e6a77fe7a0259f04a524e90a5cf"}, ] [[package]] name = "typing-extensions" -version = "4.7.1" -description = "Backported and Experimental Type Hints for Python 3.7+" +version = "4.8.0" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, - {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, + {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"}, + {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, ] [[package]] name = "urllib3" -version = 
"2.0.4" +version = "2.0.5" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.7" files = [ - {file = "urllib3-2.0.4-py3-none-any.whl", hash = "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4"}, - {file = "urllib3-2.0.4.tar.gz", hash = "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11"}, + {file = "urllib3-2.0.5-py3-none-any.whl", hash = "sha256:ef16afa8ba34a1f989db38e1dbbe0c302e4289a47856990d0682e374563ce35e"}, + {file = "urllib3-2.0.5.tar.gz", hash = "sha256:13abf37382ea2ce6fb744d4dad67838eec857c9f4f57009891805e0b5e123594"}, ] [package.extras] @@ -1670,17 +1670,17 @@ files = [ [[package]] name = "zipp" -version = "3.16.2" +version = "3.17.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.8" files = [ - {file = "zipp-3.16.2-py3-none-any.whl", hash = "sha256:679e51dd4403591b2d6838a48de3d283f3d188412a9782faadf845f298736ba0"}, - {file = "zipp-3.16.2.tar.gz", hash = "sha256:ebc15946aa78bd63458992fc81ec3b6f7b1e92d51c35e6de1c3804e73b799147"}, + {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, + {file = "zipp-3.17.0.tar.gz", hash = "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [metadata] diff --git a/pyproject.toml b/pyproject.toml index 
b6212ca6..8a172172 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Intended Audience :: Science/Research", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", @@ -110,12 +111,12 @@ footer = "" conventional_commits = true filter_commits = true commit_parsers = [ - {message = "^feat", group = "Features"}, - {message = "^fix", group = "Bug Fixes"}, - {message = "^doc", group = "Documentation"}, + { message = "^feat", group = "Features" }, + { message = "^fix", group = "Bug Fixes" }, + { message = "^doc", group = "Documentation" }, ] commit_preprocessors = [ - {pattern = 'Merged PR #[0-9]: (.*)', replace = "$1"}, - {pattern = " +", replace = " "}, + { pattern = 'Merged PR #[0-9]: (.*)', replace = "$1" }, + { pattern = " +", replace = " " }, ]