Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: license matcher for git extractor #78

Merged
merged 16 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions gimie/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ class Resource:
"""Abstract class for buffered read-only access to local or remote resources via
a file-like interface."""

name: str
path: Union[str, os.PathLike]

def open(self) -> io.BufferedReader:
raise NotImplementedError

Expand Down
13 changes: 13 additions & 0 deletions gimie/sources/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from rdflib import Graph
from urllib.parse import urlparse
from gimie.sources.common.license import _get_license_url, is_license_path
from gimie.io import Resource


Expand Down Expand Up @@ -83,3 +84,15 @@ def base(self) -> str:
url = urlparse(self.url)
return f"{url.scheme}://{url.netloc}"
return self.base_url

def _get_licenses(self) -> list[str]:
"""Extract SPDX license URLs from the license files of a repository"""
license_files = filter(
lambda p: is_license_path(p.name), self.list_files()
)
license_urls = []
for file in license_files:
license_url = _get_license_url(file)
if license_url:
license_urls.append(license_url)
return license_urls
68 changes: 41 additions & 27 deletions gimie/sources/common/license.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,50 @@
import os
import re
from spdx_license_list import LICENSES
from spdx_license_list import LICENSES, License
from scancode.api import get_licenses
from typing import List
from typing import Iterable, List, Optional
from gimie.io import Resource, iterable_to_stream, RemoteResource
import tempfile


def _get_licenses(temp_file_path: str) -> str:
"""Takes a file with a license text in it, and matches this using the scancode API to a license to get some possible
license matches. The highest match is then returned as a spdx license ID"""
license_detections = get_licenses(temp_file_path, include_text=True)[
def _get_license_url(license_file: Resource) -> Optional[str]:
"""Takes a file-like resource containing a license text, and matches this
using the scancode API to get possible license matches. The best match is
then returned as an SPDX license URL, or None if no SPDX license matches."""
temp_file = tempfile.NamedTemporaryFile(delete=False)
temp_file.write(license_file.open().read())
temp_file.close()

license_detections = get_licenses(temp_file.name, include_text=True)[
"license_detections"
]
license_id = get_license_with_highest_coverage(license_detections)
spdx_license_id = get_spdx_license_id(LICENSES, license_id)
license_id = get_license_with_highest_coverage(license_detections) # type: ignore
spdx_license_id = get_spdx_license_id(LICENSES.keys(), license_id)
os.unlink(temp_file.name)
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
if spdx_license_id:
return f"https://spdx.org/licenses/{str(spdx_license_id)}.html"

return None


return spdx_license_id
def get_spdx_license_id(
ref_licenses: Iterable[str], license_id: Optional[str]
) -> Optional[str]:
"""Given a scancode API license ID also known as a license detection, returns the correctly capitalized
spdx id corresponding to it.

Parameters
----------
ref_licenses: Iterable[str]
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
An iterable of (reference) SPDX license ids.
license_id: Optional[str]
A license id to match with SPDX licenses.
"""

def get_spdx_license_id(license_dict: dict, license_id: str) -> str:
"""Given a scamcode API license ID also known as a license detection, returns the correctly capitalized
spdx id corresponding to it"""
if not license_id:
return None # Return None if no license id was provided
lower_ref_licenses = {ref.lower(): ref for ref in ref_licenses}

for key, value in license_dict.items():
if license_id:
if key.lower() == license_id.lower():
return value.id
else:
return None
if license_id in lower_ref_licenses:
return lower_ref_licenses[license_id]

return None

Expand All @@ -43,7 +59,9 @@ def is_license_path(filename: str) -> bool:
return False


def get_license_with_highest_coverage(license_detections: List[dict]) -> str:
def get_license_with_highest_coverage(
license_detections: List[dict],
) -> Optional[str]:
"""Filters a list of "license detections" (the output of scancode.api.get_licenses)
to return the one with the highest match percentage.
This is used to select among multiple license matches from a single file."""
Expand All @@ -54,12 +72,8 @@ def get_license_with_highest_coverage(license_detections: List[dict]) -> str:

matches = detection["matches"] if "matches" in detection else []
for match in matches:
match_coverage = match["score"] if "score" in match else 0
match_coverage = match.get("score", 0)
if match_coverage > highest_coverage:
highest_coverage = match_coverage
highest_license = (
match["license_expression"]
if "license_expression" in match
else None
)
highest_license = match.get("license_expression", None)
return highest_license
20 changes: 19 additions & 1 deletion gimie/sources/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,13 @@

from gimie.io import LocalResource
from gimie.graph.namespaces import SDO
from gimie.sources.common.license import (
_get_license_url,
is_license_path,
)
from gimie.models import Person, PersonSchema
from gimie.sources.abstract import Extractor
from pathlib import Path


@dataclass
Expand Down Expand Up @@ -62,6 +67,7 @@ class GitExtractor(Extractor):
contributors: Optional[List[Person]] = None
date_created: Optional[datetime] = None
date_modified: Optional[datetime] = None
license: Optional[List[str]] = None

def extract(self):
if self.local_path is None:
Expand All @@ -72,9 +78,20 @@ def extract(self):
self.contributors = self._get_contributors()
self.date_created = self._get_creation_date()
self.date_modified = self._get_modification_date()
self.license = self._get_licenses()

def list_files(self) -> List[LocalResource]:
raise NotImplementedError
file_list = []

if self.local_path is None:
return file_list

for p in Path(self.local_path).rglob("*"):
if (p.parts[0] == ".git") or not p.is_file():
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
continue
file_list.append(LocalResource(p))

return file_list

def to_graph(self) -> Graph:
"""Generate an RDF graph from the instance"""
Expand Down Expand Up @@ -138,6 +155,7 @@ class GitExtractorSchema(JsonLDSchema):
contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)
date_created = fields.Date(SDO.dateCreated)
date_modified = fields.Date(SDO.dateModified)
license = fields.List(SDO.license, fields.IRI)

class Meta:
rdf_type = SDO.SoftwareSourceCode
Expand Down
23 changes: 3 additions & 20 deletions gimie/sources/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from gimie.sources.common.license import (
get_license_with_highest_coverage,
is_license_path,
_get_licenses,
_get_license_url,
)
from gimie.sources.common.queries import (
send_rest_query,
Expand Down Expand Up @@ -156,8 +156,8 @@ def extract(self):
self.date_published = isoparse(
data["latestRelease"]["publishedAt"]
)
if self._get_license() != "https://spdx.org/licenses/None.html":
self.license = self._get_license()

self.license = self._get_licenses()
if data["primaryLanguage"] is not None:
self.prog_langs = [data["primaryLanguage"]["name"]]
self.keywords = self._get_keywords(*data["repositoryTopics"]["nodes"])
Expand Down Expand Up @@ -322,23 +322,6 @@ def _get_user(self, node: Dict[str, Any]) -> Person:
affiliations=orgs,
)

def _get_license(self) -> list[str]:
"""Extract a SPDX License URL from a GitHub Repository"""
license_files_iterator = filter(
lambda p: is_license_path(p.name), self.list_files()
)
license_files = list(license_files_iterator)
license_ids = []
for file in license_files:
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
temp_file.write(file.open().read())
license_id = _get_licenses(temp_file.name)
if license_id:
license_ids.append(
f"https://spdx.org/licenses/{str(license_id)}.html"
)
return license_ids


class GithubExtractorSchema(JsonLDSchema):
"""This defines the schema used for json-ld serialization."""
Expand Down
21 changes: 2 additions & 19 deletions gimie/sources/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from gimie.sources.common.license import (
get_license_with_highest_coverage,
is_license_path,
_get_licenses,
_get_license_url,
)
from gimie.sources.common.queries import send_graphql_query, send_rest_query

Expand Down Expand Up @@ -101,7 +101,7 @@ def extract(self):
self.date_published = isoparse(
data["releases"]["edges"][0]["node"]["releasedAt"]
)
self.license = self._get_license()
self.license = self._get_licenses()
self.keywords = data["topics"]

# Get contributors as the project members that are not owners and those that have written merge requests
Expand Down Expand Up @@ -286,23 +286,6 @@ def _get_user(self, node: Dict[str, Any]) -> Person:
email=node.get("publicEmail"),
)

def _get_license(self) -> list[str]:
"""Extract a SPDX License URL from a GitLab Repository"""
license_files_iterator = filter(
lambda p: is_license_path(p.name), self.list_files()
)
license_files = list(license_files_iterator)
license_ids = []
for file in license_files:
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
temp_file.write(file.open().read())
license_id = _get_licenses(temp_file.name)
if license_id:
license_ids.append(
f"https://spdx.org/licenses/{str(license_id)}.html"
)
return license_ids

def _user_from_rest(self, username: str) -> Person:
"""Given a username, use the REST API to retrieve the Person object."""

Expand Down
Loading