Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: license matcher for git extractor #78

Merged
merged 16 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion gimie/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ class Resource:
"""Abstract class for buffered read-only access to local or remote resources via
a file-like interface."""

name: str

def open(self) -> io.BufferedReader:
    """Return a buffered reader over the resource's content.

    The base class only declares the interface and always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError

Expand All @@ -31,7 +33,7 @@ class LocalResource(Resource):
"""

def __init__(self, path: Union[str, os.PathLike]):
    # Keep the location as a pathlib.Path, whatever form it was passed in.
    # NOTE(review): the two assignments below store the same value; the
    # second one only adds the ``Path`` annotation.
    self.path = Path(path)
    self.path: Path = Path(path)

def open(self, mode: str = "r") -> io.BufferedReader:
    """Open the file at ``self.path`` and wrap it in a buffered reader.

    ``mode`` is passed unchanged to ``io.FileIO`` and defaults to ``"r"``.
    """
    return io.BufferedReader(io.FileIO(self.path, mode))
Expand Down
13 changes: 13 additions & 0 deletions gimie/sources/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from rdflib import Graph
from urllib.parse import urlparse
from gimie.sources.common.license import get_license_url, is_license_path
from gimie.io import Resource


Expand Down Expand Up @@ -83,3 +84,15 @@ def base(self) -> str:
url = urlparse(self.url)
return f"{url.scheme}://{url.netloc}"
return self.base_url

def _get_licenses(self) -> List[str]:
    """Collect an SPDX license URL for each license-like file in the repository.

    Every resource returned by ``list_files()`` whose name is accepted by
    ``is_license_path`` is passed to ``get_license_url``; falsy results
    (no license identified) are left out.
    """
    found = []
    for candidate in self.list_files():
        if not is_license_path(candidate.name):
            continue
        url = get_license_url(candidate)
        if url:
            found.append(url)
    return found
118 changes: 88 additions & 30 deletions gimie/sources/common/license.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,87 @@
import os
import re
from spdx_license_list import LICENSES
from scancode.api import get_licenses
from typing import List
from gimie.io import Resource, iterable_to_stream, RemoteResource
from typing import Iterable, List, Optional
from gimie.io import Resource
import tempfile

SPDX_IDS = list(LICENSES.keys())

def _get_licenses(temp_file_path: str) -> Optional[str]:
    """Scan the license text stored at ``temp_file_path`` and return its SPDX id.

    The file is matched with the scancode API; the detection with the highest
    score is mapped to its correctly capitalized SPDX license id.
    Returns None when nothing is detected or the detection has no SPDX match.
    """
    license_detections = get_licenses(temp_file_path, include_text=True)[
        "license_detections"
    ]
    license_id = get_license_with_highest_coverage(license_detections)  # type: ignore
    if license_id is None:
        return None
    return get_spdx_license_id(license_id)


def get_license_url(license_file: Resource) -> Optional[str]:
    """Takes a file-like resource containing a license text, and matches its content
    using the scancode API to get possible license matches. The best match is
    then returned as a spdx license URL, or None when no license is identified.

    Examples
    --------
    >>> from gimie.io import LocalResource
    >>> get_license_url(LocalResource('LICENSE'))
    'https://spdx.org/licenses/Apache-2.0.html'
    """
    # scancode works on file paths, so the content is copied to a named
    # temporary file. delete=False lets the file be closed before scanning.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            with license_file.open() as stream:
                temp_file.write(stream.read())
        finally:
            # Closing flushes the content to disk before it is scanned.
            temp_file.close()
        spdx_license_id = _get_licenses(temp_file.name)
    finally:
        # Always clean up, including when no license is found or scanning fails.
        os.remove(temp_file.name)

    if spdx_license_id:
        return f"https://spdx.org/licenses/{spdx_license_id}.html"
    return None


def get_spdx_license_id(
    license_id: str,
    spdx_ids: Iterable[str] = SPDX_IDS,
) -> Optional[str]:
    """Given a scancode API license ID also known as a license detection, returns the correctly capitalized
    spdx id corresponding to it.

    The comparison is case-insensitive. None is returned when ``license_id``
    is empty or None, or when no SPDX id matches.

    Parameters
    ----------
    license_id:
        A license id to match with SPDX licenses.
    spdx_ids:
        An iterable of (reference) SPDX license ids.

    Examples
    --------
    >>> get_spdx_license_id('apache-2.0')
    'Apache-2.0'
    >>> get_spdx_license_id('gpl-3.0')
    'GPL-3.0'
    """
    # A missing detection yields no id instead of failing on ``.lower()``.
    if not license_id:
        return None

    lower_spdx_ids = {spdx.lower(): spdx for spdx in spdx_ids}

    return lower_spdx_ids.get(license_id.lower(), None)


def is_license_path(filename: str) -> bool:
"""Given an input filename, returns a boolean indicating whether the filename path looks like a license."""
"""Given an input filename, returns a boolean indicating whether the filename path looks like a license.

Parameters
----------
filename:
A filename to check.

Examples
--------
>>> is_license_path('LICENSE.txt')
True
>>> is_license_path('LICENSE-APACHE')
True
>>> is_license_path('README.md')
False
"""
if filename.startswith("."):
return False
pattern = r".*(license(s)?.*|lizenz|reus(e|ing).*|copy(ing)?.*)(\.(txt|md|rst))?$"
Expand All @@ -43,23 +90,34 @@ def is_license_path(filename: str) -> bool:
return False


def get_license_with_highest_coverage(
    license_detections: List[dict],
) -> Optional[str]:
    """Filters a list of "license detections" (the output of scancode.api.get_licenses)
    to return the one with the highest match percentage.
    This is used to select among multiple license matches from a single file.

    A match without a "score" counts as 0, and only scores strictly greater
    than the current best (starting at 0.0) are selected, so the first of
    several equally scored matches wins. None is returned when no match
    scores above 0.

    Parameters
    ----------
    license_detections:
        A list of license detections, as returned by scancode.api.get_licenses.

    Examples
    --------
    >>> from scancode.api import get_licenses
    >>> license_detections = get_licenses('LICENSE')['license_detections']
    >>> get_license_with_highest_coverage(license_detections)
    'apache-2.0'
    """
    highest_coverage = 0.0
    highest_license = None

    for detection in license_detections:
        for match in detection.get("matches", []):
            match_coverage = match.get("score", 0)
            if match_coverage > highest_coverage:
                highest_coverage = match_coverage
                highest_license = match.get("license_expression", None)
    return highest_license
18 changes: 16 additions & 2 deletions gimie/sources/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"""Extractor which uses a locally available (usually cloned) repository."""
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Union
from typing import List, Optional
import uuid

from calamus import fields
Expand All @@ -30,6 +30,7 @@
from gimie.graph.namespaces import SDO
from gimie.models import Person, PersonSchema
from gimie.sources.abstract import Extractor
from pathlib import Path


@dataclass
Expand Down Expand Up @@ -62,6 +63,7 @@ class GitExtractor(Extractor):
contributors: Optional[List[Person]] = None
date_created: Optional[datetime] = None
date_modified: Optional[datetime] = None
license: Optional[List[str]] = None

def extract(self):
if self.local_path is None:
Expand All @@ -72,9 +74,20 @@ def extract(self):
self.contributors = self._get_contributors()
self.date_created = self._get_creation_date()
self.date_modified = self._get_modification_date()
self.license = self._get_licenses()

def list_files(self) -> List[LocalResource]:
    """List every regular file of the local repository as a ``LocalResource``.

    The repository at ``self.local_path`` is walked recursively; directories
    and anything under the top-level ``.git`` directory are skipped.
    Returns an empty list when ``self.local_path`` is None.
    """
    file_list: List[LocalResource] = []

    if self.local_path is None:
        return file_list

    root = Path(self.local_path)
    for path in root.rglob("*"):
        # rglob yields paths prefixed with the root, so ".git" must be looked
        # up relative to it; otherwise it only matches when the root is ".".
        if path.relative_to(root).parts[0] == ".git" or not path.is_file():
            continue
        file_list.append(LocalResource(path))

    return file_list

def to_graph(self) -> Graph:
"""Generate an RDF graph from the instance"""
Expand Down Expand Up @@ -138,6 +151,7 @@ class GitExtractorSchema(JsonLDSchema):
contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)
date_created = fields.Date(SDO.dateCreated)
date_modified = fields.Date(SDO.dateModified)
license = fields.List(SDO.license, fields.IRI)

class Meta:
rdf_type = SDO.SoftwareSourceCode
Expand Down
26 changes: 2 additions & 24 deletions gimie/sources/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,6 @@
from gimie.graph.namespaces import SDO

from gimie.io import RemoteResource
from gimie.sources.common.license import (
get_license_with_highest_coverage,
is_license_path,
_get_licenses,
)
from gimie.sources.common.queries import (
send_rest_query,
send_graphql_query,
Expand Down Expand Up @@ -156,8 +151,8 @@ def extract(self):
self.date_published = isoparse(
data["latestRelease"]["publishedAt"]
)
if self._get_license() != "https://spdx.org/licenses/None.html":
self.license = self._get_license()

self.license = self._get_licenses()
if data["primaryLanguage"] is not None:
self.prog_langs = [data["primaryLanguage"]["name"]]
self.keywords = self._get_keywords(*data["repositoryTopics"]["nodes"])
Expand Down Expand Up @@ -322,23 +317,6 @@ def _get_user(self, node: Dict[str, Any]) -> Person:
affiliations=orgs,
)

def _get_license(self) -> list[str]:
    """Extract SPDX License URLs from the license files of a GitHub Repository.

    Each file whose name is accepted by ``is_license_path`` is copied to a
    temporary file and scanned with ``_get_licenses``; every identified
    license id is returned as an ``https://spdx.org/licenses/<id>.html`` URL.
    """
    import os  # local import: only needed to remove the temporary file

    license_ids = []
    for file in filter(lambda p: is_license_path(p.name), self.list_files()):
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(file.open().read())
        # The file is closed (and flushed) here, so the scan sees all of it.
        try:
            license_id = _get_licenses(temp_file.name)
        finally:
            # delete=False leaves cleanup to us.
            os.remove(temp_file.name)
        if license_id:
            license_ids.append(
                f"https://spdx.org/licenses/{str(license_id)}.html"
            )
    return license_ids


class GithubExtractorSchema(JsonLDSchema):
"""This defines the schema used for json-ld serialization."""
Expand Down
26 changes: 2 additions & 24 deletions gimie/sources/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,6 @@
PersonSchema,
)
from gimie.sources.abstract import Extractor
from gimie.sources.common.license import (
get_license_with_highest_coverage,
is_license_path,
_get_licenses,
)
from gimie.sources.common.queries import send_graphql_query, send_rest_query

load_dotenv()
Expand Down Expand Up @@ -101,7 +96,7 @@ def extract(self):
self.date_published = isoparse(
data["releases"]["edges"][0]["node"]["releasedAt"]
)
self.license = self._get_license()
self.license = self._get_licenses()
self.keywords = data["topics"]

# Get contributors as the project members that are not owners and those that have written merge requests
Expand Down Expand Up @@ -149,7 +144,7 @@ def _safe_extract_author(

def _safe_extract_contributors(
self, repo: dict[str, Any]
) -> list[Person] | None:
) -> List[Person] | None:
members = [
user["node"]["user"]
for user in repo["projectMembers"]["edges"]
Expand Down Expand Up @@ -286,23 +281,6 @@ def _get_user(self, node: Dict[str, Any]) -> Person:
email=node.get("publicEmail"),
)

def _get_license(self) -> list[str]:
    """Extract SPDX License URLs from the license files of a GitLab Repository.

    Each file whose name is accepted by ``is_license_path`` is copied to a
    temporary file and scanned with ``_get_licenses``; every identified
    license id is returned as an ``https://spdx.org/licenses/<id>.html`` URL.
    """
    import os  # local import: only needed to remove the temporary file

    license_ids = []
    for file in filter(lambda p: is_license_path(p.name), self.list_files()):
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(file.open().read())
        # The file is closed (and flushed) here, so the scan sees all of it.
        try:
            license_id = _get_licenses(temp_file.name)
        finally:
            # delete=False leaves cleanup to us.
            os.remove(temp_file.name)
        if license_id:
            license_ids.append(
                f"https://spdx.org/licenses/{str(license_id)}.html"
            )
    return license_ids

def _user_from_rest(self, username: str) -> Person:
"""Given a username, use the REST API to retrieve the Person object."""

Expand Down