Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File-like interface to remote resources #70

Merged
merged 8 commits into from
Sep 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions gimie/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""Standard input interfaces to local or remote resources for gimie."""

import io
import os
from pathlib import Path
import requests
from typing import Optional, Union


class Resource:
    """Base interface for buffered, read-only, file-like access to a resource.

    The resource may be local or remote; subclasses provide the actual
    access logic by overriding :meth:`open`.
    """

    def open(self) -> io.BufferedReader:
        """Return a buffered reader over the content of the resource.

        The base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError()


class LocalResource(Resource):
    """Provide buffered read-only access to local data.

    Parameters
    ----------
    path: the location of the resource on the local filesystem.

    Examples
    --------
    >>> from gimie.io import LocalResource
    >>> resource = LocalResource("README.md")
    """

    def __init__(self, path: Union[str, os.PathLike]):
        self.path = Path(path)

    def open(self, mode="r") -> io.BufferedReader:
        """Open the file and return a buffered reader over its raw bytes.

        Parameters
        ----------
        mode: mode string forwarded to ``io.FileIO``; it must be readable
            (contain ``"r"`` or ``"+"``).

        Raises
        ------
        io.UnsupportedOperation: if ``mode`` does not allow reading.
        """
        # io.FileIO would create/truncate the file for write-only modes before
        # io.BufferedReader gets the chance to reject the unreadable stream.
        # Raise the same exception type up front so the file is left untouched.
        if "r" not in mode and "+" not in mode:
            raise io.UnsupportedOperation("File or stream is not readable.")
        return io.BufferedReader(io.FileIO(self.path, mode))

    @property
    def name(self) -> str:
        """The final component of the resource path (the filename)."""
        return self.path.name


class RemoteResource(Resource):
    """Provide buffered read-only access to remote data.

    Parameters
    ----------
    name: the name of the resource, typically the filename.
    url: the URL where the resource can be downloaded from.
    headers: optional headers to pass to the request.

    Examples
    --------
    >>> from gimie.io import RemoteResource
    >>> url = "https://raw.githubusercontent.com/SDSC-ORD/gimie/main/README.md"
    >>> resource = RemoteResource("README.md", url)
    """

    def __init__(self, name: str, url: str, headers: Optional[dict] = None):
        self.name = name
        self.url = url
        self.headers = headers or {}

    def open(self) -> io.BufferedReader:
        """Send a streaming GET request and return a buffered reader over the body.

        Raises
        ------
        requests.HTTPError: if the server answers with a 4xx/5xx status code.
        """
        resp = requests.get(self.url, headers=self.headers, stream=True)
        # Fail loudly rather than streaming an HTTP error page as if it were
        # the content of the resource.
        resp.raise_for_status()
        return iterable_to_stream(resp.iter_content(chunk_size=128))


def iterable_to_stream(
    iterable, buffer_size=io.DEFAULT_BUFFER_SIZE
) -> io.BufferedReader:
    """Convert an iterable yielding bytestrings to a read-only input stream.

    Lets you use an iterable (e.g. a generator or a list) that yields
    bytestrings as a read-only input stream. For efficiency, the stream is
    buffered.

    Parameters
    ----------
    iterable: any iterable (or iterator) yielding bytestrings.
    buffer_size: size of the buffer used by the returned reader.

    Returns
    -------
    An ``io.BufferedReader`` serving the concatenation of all chunks.

    credits: https://stackoverflow.com/a/20260030/8440675
    """
    # Accept true iterables (lists, tuples, ...) and not only iterators:
    # next() is only defined on iterators.
    chunks = iter(iterable)

    class IterStream(io.RawIOBase):
        """Raw stream pulling its bytes from the enclosing ``chunks`` iterator."""

        def __init__(self):
            super().__init__()
            # Bytes of the last chunk that did not fit in the caller's buffer.
            self.leftover = b""

        def readable(self):
            return True

        def readinto(self, b):
            limit = len(b)  # We're supposed to return at most this much
            chunk = self.leftover
            # Skip empty chunks: returning 0 from readinto() signals EOF, so an
            # empty chunk in the middle of the data must not be reported.
            while not chunk:
                try:
                    chunk = next(chunks)
                except StopIteration:
                    return 0  # indicate EOF
            output, self.leftover = chunk[:limit], chunk[limit:]
            b[: len(output)] = output
            return len(output)

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)
7 changes: 6 additions & 1 deletion gimie/sources/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
# limitations under the License.
"""Abstract classes for gimie objects."""
from abc import ABC, abstractmethod
from typing import Optional
from typing import List, Optional


from rdflib import Graph
from urllib.parse import urlparse
from gimie.io import Resource


class Extractor(ABC):
Expand Down Expand Up @@ -51,6 +52,10 @@ def to_graph(self) -> Graph:
"""Generate an RDF graph from the instance"""
return Graph()

def list_files(self) -> List[Resource]:
"""List all files in the repository HEAD."""
...

def serialize(self, format: str = "ttl", **kwargs) -> str:
"""Serialize the RDF graph representing the instance."""
return self.to_graph().serialize(format=format, **kwargs) # type: ignore
Expand Down
7 changes: 5 additions & 2 deletions gimie/sources/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@
import pydriller
from rdflib import Graph

from gimie.models import Person, PersonSchema
from gimie.io import LocalResource
from gimie.graph.namespaces import SDO
from gimie.models import Person, PersonSchema
from gimie.sources.abstract import Extractor
from gimie.utils import generate_uri


@dataclass
Expand Down Expand Up @@ -73,6 +73,9 @@ def extract(self):
self.date_created = self._get_creation_date()
self.date_modified = self._get_modification_date()

def list_files(self) -> List[LocalResource]:
raise NotImplementedError

def to_graph(self) -> Graph:
"""Generate an RDF graph from the instance"""
jd = GitExtractorSchema().dumps(self)
Expand Down
12 changes: 9 additions & 3 deletions gimie/sources/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from dataclasses import dataclass
from datetime import datetime
from dateutil.parser import isoparse
from functools import cached_property
import os
import requests
from typing import Any, Dict, List, Optional, Set, Union
Expand All @@ -37,6 +38,7 @@
PersonSchema,
)
from gimie.graph.namespaces import SDO
from gimie.io import RemoteResource
from gimie.sources.common.license import get_spdx_url
from gimie.sources.common.queries import (
send_rest_query,
Expand Down Expand Up @@ -121,9 +123,12 @@ def to_graph(self) -> Graph:
g.bind("schema", SDO)
return g

def list_files(self) -> List[RemoteResource]:
raise NotImplementedError

def extract(self):
"""Extract metadata from target GitHub repository."""
data = self._fetch_repo_data()
data = self._repo_data
self.author = self._get_author(data["owner"])
self.contributors = self._fetch_contributors()
self.description = data["description"]
Expand All @@ -143,8 +148,9 @@ def extract(self):
f"{self.url}/archive/refs/tags/{self.version}.tar.gz"
)

def _fetch_repo_data(self) -> Dict[str, Any]:
"""Fetch repository metadata from GraphQL endpoint."""
@cached_property
def _repo_data(self) -> Dict[str, Any]:
"""Repository metadata fetched from GraphQL endpoint."""
owner, name = self.path.split("/")
data = {"owner": owner, "name": name}
repo_query = """
Expand Down
16 changes: 11 additions & 5 deletions gimie/sources/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import requests
from datetime import datetime
from dateutil.parser import isoparse
from functools import cached_property
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urlparse

Expand All @@ -12,14 +13,15 @@
from calamus.schema import JsonLDSchema
from rdflib import Graph

from gimie.sources.abstract import Extractor
from gimie.graph.namespaces import SDO
from gimie.io import RemoteResource
from gimie.models import (
Organization,
OrganizationSchema,
Person,
PersonSchema,
)
from gimie.graph.namespaces import SDO
from gimie.sources.abstract import Extractor
from gimie.sources.common.queries import send_graphql_query, send_rest_query

load_dotenv()
Expand Down Expand Up @@ -62,11 +64,14 @@ def to_graph(self) -> Graph:
g.bind("schema", SDO)
return g

def list_files(self) -> List[RemoteResource]:
raise NotImplementedError

def extract(self):
"""Extract metadata from target Gitlab repository."""

# fetch metadata
data = self._fetch_repo_data(self.path)
data = self._repo_data

# Each Gitlab project has a unique identifier (integer)
self.identifier = urlparse(data["id"]).path.split("/")[2]
Expand Down Expand Up @@ -145,9 +150,10 @@ def _safe_extract_contributors(
uniq_contrib = list({c["id"]: c for c in contributors}.values())
return [self._get_user(contrib) for contrib in uniq_contrib]

def _fetch_repo_data(self, path: str) -> Dict[str, Any]:
@cached_property
def _repo_data(self) -> Dict[str, Any]:
"""Fetch repository metadata from GraphQL endpoint."""
data = {"path": path}
data = {"path": self.path}
project_query = """
query project_query($path: ID!) {
project(fullPath: $path) {
Expand Down
44 changes: 22 additions & 22 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Intended Audience :: Science/Research",
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
Expand Down Expand Up @@ -110,12 +111,12 @@ footer = "<!--generated by git-cliff -->"
conventional_commits = true
filter_commits = true
commit_parsers = [
{message = "^feat", group = "Features"},
{message = "^fix", group = "Bug Fixes"},
{message = "^doc", group = "Documentation"},
{ message = "^feat", group = "Features" },
{ message = "^fix", group = "Bug Fixes" },
{ message = "^doc", group = "Documentation" },
]

commit_preprocessors = [
{pattern = 'Merged PR #[0-9]: (.*)', replace = "$1"},
{pattern = " +", replace = " "},
{ pattern = 'Merged PR #[0-9]: (.*)', replace = "$1" },
{ pattern = " +", replace = " " },
]