From 86f498769baf6e6d8f1a1b53b8080347e5043bd7 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 11:24:11 +0200 Subject: [PATCH 01/16] refactor(license): pythonic functions --- gimie/sources/common/license.py | 58 ++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/gimie/sources/common/license.py b/gimie/sources/common/license.py index 1113360c..290b9040 100644 --- a/gimie/sources/common/license.py +++ b/gimie/sources/common/license.py @@ -1,34 +1,42 @@ import re -from spdx_license_list import LICENSES +from spdx_license_list import LICENSES, License from scancode.api import get_licenses -from typing import List +from typing import Iterable, List, Optional from gimie.io import Resource, iterable_to_stream, RemoteResource -def _get_licenses(temp_file_path: str) -> str: - """Takes a file with a license text in it, and matches this using the scancode API to a license to get some possible - license matches. The highest match is then returned as a spdx license ID""" +def _get_license_url(temp_file_path: str) -> str: + """Takes the path of a text file containing a license text, and matches this + using the scancode API to get possible license matches. The best match is + then returned as a spdx license URL""" license_detections = get_licenses(temp_file_path, include_text=True)[ "license_detections" ] - license_id = get_license_with_highest_coverage(license_detections) - spdx_license_id = get_spdx_license_id(LICENSES, license_id) + license_id = get_license_with_highest_coverage(license_detections) # type: ignore + spdx_license_id = get_spdx_license_id(LICENSES.keys(), license_id) + spdx_license_url = f"https://spdx.org/licenses/{str(spdx_license_id)}.html" - return spdx_license_id + return spdx_license_url -def get_spdx_license_id(license_dict: dict, license_id: str) -> str: - """Given a scamcode API license ID also known as a license detection, returns the correctly capitalized - spdx id corresponding to it""" - if not license_id: - return None # Return None if the dictionary is empty +def get_spdx_license_id( + ref_licenses: Iterable[str], license_id: Optional[str] +) -> Optional[str]: + """Given a scancode API license ID also known as a license detection, returns the correctly capitalized + spdx id corresponding to it. - for key, value in license_dict.items(): - if license_id: - if key.lower() == license_id.lower(): - return value.id - else: - return None + Parameters + ---------- + ref_licenses: Iterable[str] + An iterable of (reference) SPDX license ids. + license_id: Optional[str] + A license id to match with SPDX licenses. + """ + + lower_ref_licenses = {ref.lower(): ref for ref in ref_licenses} + + if license_id in lower_ref_licenses: + return lower_ref_licenses[license_id] return None @@ -43,7 +51,9 @@ def is_license_path(filename: str) -> bool: return False -def get_license_with_highest_coverage(license_detections: List[dict]) -> str: +def get_license_with_highest_coverage( + license_detections: List[dict], +) -> Optional[str]: """Filters a list of "license detections" (the output of scancode.api.get_licenses) to return the one with the highest match percentage. This is used to select among multiple license matches from a single file.""" @@ -54,12 +64,8 @@ def get_license_with_highest_coverage(license_detections: List[dict]) -> str: matches = detection["matches"] if "matches" in detection else [] for match in matches: - match_coverage = match["score"] if "score" in match else 0 + match_coverage = match.get("score", 0) if match_coverage > highest_coverage: highest_coverage = match_coverage - highest_license = ( - match["license_expression"] - if "license_expression" in match - else None - ) + highest_license = match.get("license_expression", None) return highest_license From 9a6327818fac64b0ddfacce65d8b803ccd660d6b Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 11:24:57 +0200 Subject: [PATCH 02/16] refactor(github,gitlab): simplify calls to _get_license --- gimie/sources/github.py | 14 ++++++-------- gimie/sources/gitlab.py | 16 +++++++--------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/gimie/sources/github.py b/gimie/sources/github.py index ebdc6cb8..4f6973d6 100644 --- a/gimie/sources/github.py +++ b/gimie/sources/github.py @@ -43,7 +43,7 @@ from gimie.sources.common.license import ( get_license_with_highest_coverage, is_license_path, - _get_licenses, + _get_license_url, ) from gimie.sources.common.queries import ( send_rest_query, @@ -328,16 +328,14 @@ def _get_license(self) -> list[str]: lambda p: is_license_path(p.name), self.list_files() ) license_files = list(license_files_iterator) - license_ids = [] + license_urls = [] for file in license_files: with tempfile.NamedTemporaryFile(delete=False) as temp_file: temp_file.write(file.open().read()) - license_id = _get_licenses(temp_file.name) - if license_id: - license_ids.append( - f"https://spdx.org/licenses/{str(license_id)}.html" - ) - return license_ids + license_url = _get_license_url(temp_file.name) + if license_url: + license_urls.append(license_url) + return license_urls class GithubExtractorSchema(JsonLDSchema): diff --git a/gimie/sources/gitlab.py b/gimie/sources/gitlab.py index f4cb0cf8..4a44663a 100644 --- a/gimie/sources/gitlab.py +++ b/gimie/sources/gitlab.py @@ -24,7 +24,7 @@ from gimie.sources.common.license import ( get_license_with_highest_coverage, is_license_path, - _get_licenses, + _get_license_url, ) from gimie.sources.common.queries import send_graphql_query, send_rest_query @@ -286,22 +286,20 @@ def _get_user(self, node: Dict[str, Any]) -> Person: email=node.get("publicEmail"), ) - def _get_license(self) -> list[str]: + def _get_licenses(self) -> list[str]: """Extract a SPDX License URL from a GitLab Repository""" license_files_iterator = filter( lambda p: is_license_path(p.name), self.list_files() ) license_files = list(license_files_iterator) - license_ids = [] + license_urls = [] for file in license_files: with tempfile.NamedTemporaryFile(delete=False) as temp_file: temp_file.write(file.open().read()) - license_id = _get_licenses(temp_file.name) - if license_id: - license_ids.append( - f"https://spdx.org/licenses/{str(license_id)}.html" - ) - return license_ids + license_url = _get_license_url(temp_file.name) + if license_urls: + license_urls.append(license_url) + return license_urls def _user_from_rest(self, username: str) -> Person: """Given a username, use the REST API to retrieve the Person object.""" From f0f8aa47f40be0b2056b60fbb652f2cafd1aa83f Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 11:33:21 +0200 Subject: [PATCH 03/16] refactor(github,gitlab): update license calling code --- gimie/sources/github.py | 6 +++--- gimie/sources/gitlab.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gimie/sources/github.py b/gimie/sources/github.py index 4f6973d6..9ec46923 100644 --- a/gimie/sources/github.py +++ b/gimie/sources/github.py @@ -156,8 +156,8 @@ def extract(self): self.date_published = isoparse( data["latestRelease"]["publishedAt"] ) - if self._get_license() != "https://spdx.org/licenses/None.html": - self.license = self._get_license() + + self.license = self._get_licenses() if data["primaryLanguage"] is not None: self.prog_langs = [data["primaryLanguage"]["name"]] self.keywords = self._get_keywords(*data["repositoryTopics"]["nodes"]) @@ -322,7 +322,7 @@ def _get_user(self, node: Dict[str, Any]) -> Person: affiliations=orgs, ) - def _get_license(self) -> list[str]: + def _get_licenses(self) -> list[str]: """Extract a SPDX License URL from a GitHub Repository""" license_files_iterator = filter( lambda p: is_license_path(p.name), self.list_files() diff --git a/gimie/sources/gitlab.py b/gimie/sources/gitlab.py index 4a44663a..7867978c 100644 --- a/gimie/sources/gitlab.py +++ b/gimie/sources/gitlab.py @@ -101,7 +101,7 @@ def extract(self): self.date_published = isoparse( data["releases"]["edges"][0]["node"]["releasedAt"] ) - self.license = self._get_license() + self.license = self._get_licenses() self.keywords = data["topics"] # Get contributors as the project members that are not owners and those that have written merge requests From c8f070895c05897f11d8fa1f8c387ff6de9ab3d7 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 14:38:25 +0200 Subject: [PATCH 04/16] fix(io): add missing fields in Resource --- gimie/io.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gimie/io.py b/gimie/io.py index 53ea468d..abf6aaba 100644 --- a/gimie/io.py +++ b/gimie/io.py @@ -11,6 +11,9 @@ class Resource: """Abstract class for buffered read-only access to local or remote resources via a file-like interface.""" + name: str + path: Union[str, os.PathLike] + def open(self) -> io.BufferedReader: raise NotImplementedError From eaccdfe28d1530017765ea8452989562e7f4f863 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 14:39:28 +0200 Subject: [PATCH 05/16] refactor: mv _get_licenses to abstracte extractor --- gimie/sources/abstract.py | 14 ++++++++++++++ gimie/sources/github.py | 15 --------------- gimie/sources/gitlab.py | 15 --------------- 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/gimie/sources/abstract.py b/gimie/sources/abstract.py index 6dacb98b..b43ce6dd 100644 --- a/gimie/sources/abstract.py +++ b/gimie/sources/abstract.py @@ -21,6 +21,7 @@ from rdflib import Graph from urllib.parse import urlparse +from gimie.sources.common.license import _get_license_url, is_license_path from gimie.io import Resource @@ -83,3 +84,16 @@ def base(self) -> str: url = urlparse(self.url) return f"{url.scheme}://{url.netloc}" return self.base_url + + def _get_licenses(self) -> list[str]: + """Extract a SPDX License URL from a GitHub Repository""" + license_files_iterator = filter( + lambda p: is_license_path(p.name), self.list_files() + ) + license_files = list(license_files_iterator) + license_urls = [] + for file in license_files: + license_url = _get_license_url(file) + if license_url: + license_urls.append(license_url) + return license_urls diff --git a/gimie/sources/github.py b/gimie/sources/github.py index 9ec46923..113cdfc9 100644 --- a/gimie/sources/github.py +++ b/gimie/sources/github.py @@ -322,21 +322,6 @@ def _get_user(self, node: Dict[str, Any]) -> Person: affiliations=orgs, ) - def _get_licenses(self) -> list[str]: - """Extract a SPDX License URL from a GitHub Repository""" - license_files_iterator = filter( - lambda p: is_license_path(p.name), self.list_files() - ) - license_files = list(license_files_iterator) - license_urls = [] - for file in license_files: - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(file.open().read()) - license_url = _get_license_url(temp_file.name) - if license_url: - license_urls.append(license_url) - return license_urls - class GithubExtractorSchema(JsonLDSchema): """This defines the schema used for json-ld serialization.""" diff --git a/gimie/sources/gitlab.py b/gimie/sources/gitlab.py index 7867978c..a3abc5b7 100644 --- a/gimie/sources/gitlab.py +++ b/gimie/sources/gitlab.py @@ -286,21 +286,6 @@ def _get_user(self, node: Dict[str, Any]) -> Person: email=node.get("publicEmail"), ) - def _get_licenses(self) -> list[str]: - """Extract a SPDX License URL from a GitLab Repository""" - license_files_iterator = filter( - lambda p: is_license_path(p.name), self.list_files() - ) - license_files = list(license_files_iterator) - license_urls = [] - for file in license_files: - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(file.open().read()) - license_url = _get_license_url(temp_file.name) - if license_urls: - license_urls.append(license_url) - return license_urls - def _user_from_rest(self, username: str) -> Person: """Given a username, use the REST API to retrieve the Person object.""" From 6ad9f158661962461598deb8e946f3cddbed4e36 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 14:42:37 +0200 Subject: [PATCH 06/16] refactor(license): mv license file handling out of extractors --- gimie/sources/common/license.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gimie/sources/common/license.py b/gimie/sources/common/license.py index 290b9040..11a16ed3 100644 --- a/gimie/sources/common/license.py +++ b/gimie/sources/common/license.py @@ -3,15 +3,18 @@ from scancode.api import get_licenses from typing import Iterable, List, Optional from gimie.io import Resource, iterable_to_stream, RemoteResource +import tempfile -def _get_license_url(temp_file_path: str) -> str: +def _get_license_url(license_file: Resource) -> str: """Takes the path of a text file containing a license text, and matches this using the scancode API to get possible license matches. The best match is then returned as a spdx license URL""" - license_detections = get_licenses(temp_file_path, include_text=True)[ - "license_detections" - ] + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file.write(license_file.open().read()) + license_detections = get_licenses(temp_file.name, include_text=True)[ + "license_detections" + ] license_id = get_license_with_highest_coverage(license_detections) # type: ignore spdx_license_id = get_spdx_license_id(LICENSES.keys(), license_id) spdx_license_url = f"https://spdx.org/licenses/{str(spdx_license_id)}.html" From 4190e1be34c40b6797c1ac4aba2033880293b879 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 14:43:48 +0200 Subject: [PATCH 07/16] feat(git): support license property+list_files --- gimie/sources/git.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/gimie/sources/git.py b/gimie/sources/git.py index 51906c19..f17bbc67 100644 --- a/gimie/sources/git.py +++ b/gimie/sources/git.py @@ -28,8 +28,13 @@ from gimie.io import LocalResource from gimie.graph.namespaces import SDO +from gimie.sources.common.license import ( + _get_license_url, + is_license_path, +) from gimie.models import Person, PersonSchema from gimie.sources.abstract import Extractor +from pathlib import Path @dataclass @@ -62,6 +67,7 @@ class GitExtractor(Extractor): contributors: Optional[List[Person]] = None date_created: Optional[datetime] = None date_modified: Optional[datetime] = None + license: Optional[List[str]] = None def extract(self): if self.local_path is None: @@ -72,9 +78,20 @@ def extract(self): self.contributors = self._get_contributors() self.date_created = self._get_creation_date() self.date_modified = self._get_modification_date() + self.license = self._get_licenses() def list_files(self) -> List[LocalResource]: - raise NotImplementedError + file_list = [] + + if self.local_path is None: + return file_list + + for p in Path(self.local_path).rglob(""): + if ".git" in p.parts or not p.is_file(): + continue + file_list.append(LocalResource(p)) + + return file_list def to_graph(self) -> Graph: """Generate an RDF graph from the instance""" @@ -138,6 +155,7 @@ class GitExtractorSchema(JsonLDSchema): contributors = fields.Nested(SDO.contributor, PersonSchema, many=True) date_created = fields.Date(SDO.dateCreated) date_modified = fields.Date(SDO.dateModified) + license = fields.List(SDO.license, fields.IRI) class Meta: rdf_type = SDO.SoftwareSourceCode From f9e4de1cd952ba2b6bea58e53ec6f105ce2267b0 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 16:08:03 +0200 Subject: [PATCH 08/16] fix(license): match body to type hints --- gimie/sources/abstract.py | 3 +-- gimie/sources/common/license.py | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/gimie/sources/abstract.py b/gimie/sources/abstract.py index b43ce6dd..e2f6f0f4 100644 --- a/gimie/sources/abstract.py +++ b/gimie/sources/abstract.py @@ -87,10 +87,9 @@ def base(self) -> str: def _get_licenses(self) -> list[str]: """Extract a SPDX License URL from a GitHub Repository""" - license_files_iterator = filter( + license_files = filter( lambda p: is_license_path(p.name), self.list_files() ) - license_files = list(license_files_iterator) license_urls = [] for file in license_files: license_url = _get_license_url(file) diff --git a/gimie/sources/common/license.py b/gimie/sources/common/license.py index 11a16ed3..fed1a40a 100644 --- a/gimie/sources/common/license.py +++ b/gimie/sources/common/license.py @@ -1,3 +1,4 @@ +import os import re from spdx_license_list import LICENSES, License from scancode.api import get_licenses @@ -6,20 +7,24 @@ import tempfile -def _get_license_url(license_file: Resource) -> str: +def _get_license_url(license_file: Resource) -> Optional[str]: """Takes the path of a text file containing a license text, and matches this using the scancode API to get possible license matches. The best match is then returned as a spdx license URL""" - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(license_file.open().read()) - license_detections = get_licenses(temp_file.name, include_text=True)[ - "license_detections" - ] + temp_file = tempfile.NamedTemporaryFile(delete=False) + temp_file.write(license_file.open().read()) + temp_file.close() + + license_detections = get_licenses(temp_file.name, include_text=True)[ + "license_detections" + ] license_id = get_license_with_highest_coverage(license_detections) # type: ignore spdx_license_id = get_spdx_license_id(LICENSES.keys(), license_id) - spdx_license_url = f"https://spdx.org/licenses/{str(spdx_license_id)}.html" + os.unlink(temp_file.name) + if spdx_license_id: + return f"https://spdx.org/licenses/{str(spdx_license_id)}.html" - return spdx_license_url + return None def get_spdx_license_id( From 9830db068ea936f6718a4a6a0df7cc1b52bc6265 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Mon, 16 Oct 2023 16:09:09 +0200 Subject: [PATCH 09/16] fix(git): list file glob pattern --- gimie/sources/git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gimie/sources/git.py b/gimie/sources/git.py index f17bbc67..5d7a1163 100644 --- a/gimie/sources/git.py +++ b/gimie/sources/git.py @@ -86,8 +86,8 @@ def list_files(self) -> List[LocalResource]: if self.local_path is None: return file_list - for p in Path(self.local_path).rglob(""): - if ".git" in p.parts or not p.is_file(): + for p in Path(self.local_path).rglob("*"): + if (p.parts[0] == ".git") or not p.is_file(): continue file_list.append(LocalResource(p)) From 38ac09fa5626932017742bc00964a1c3cea8da4d Mon Sep 17 00:00:00 2001 From: cmdoret Date: Tue, 17 Oct 2023 12:08:13 +0200 Subject: [PATCH 10/16] fix(io): rm extraneous field in Resource --- gimie/io.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gimie/io.py b/gimie/io.py index abf6aaba..d41c732f 100644 --- a/gimie/io.py +++ b/gimie/io.py @@ -12,7 +12,6 @@ class Resource: a file-like interface.""" name: str - path: Union[str, os.PathLike] def open(self) -> io.BufferedReader: raise NotImplementedError @@ -34,7 +33,7 @@ class LocalResource(Resource): """ def __init__(self, path: Union[str, os.PathLike]): - self.path = Path(path) + self.path: Path = Path(path) def open(self, mode="r") -> io.BufferedReader: return io.BufferedReader(io.FileIO(self.path, mode)) From 3fcbfac84bd58d5c435112098dd679f4fc87c86c Mon Sep 17 00:00:00 2001 From: cmdoret Date: Tue, 17 Oct 2023 12:16:50 +0200 Subject: [PATCH 11/16] refactor(license): better func names, rm unused impot --- gimie/sources/abstract.py | 6 +++--- gimie/sources/common/license.py | 6 +++--- gimie/sources/git.py | 6 +----- gimie/sources/github.py | 5 ----- gimie/sources/gitlab.py | 7 +------ 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/gimie/sources/abstract.py b/gimie/sources/abstract.py index e2f6f0f4..499e978b 100644 --- a/gimie/sources/abstract.py +++ b/gimie/sources/abstract.py @@ -21,7 +21,7 @@ from rdflib import Graph from urllib.parse import urlparse -from gimie.sources.common.license import _get_license_url, is_license_path +from gimie.sources.common.license import get_license_url, is_license_path from gimie.io import Resource @@ -85,14 +85,14 @@ def base(self) -> str: return f"{url.scheme}://{url.netloc}" return self.base_url - def _get_licenses(self) -> list[str]: + def _get_licenses(self) -> List[str]: """Extract a SPDX License URL from a GitHub Repository""" license_files = filter( lambda p: is_license_path(p.name), self.list_files() ) license_urls = [] for file in license_files: - license_url = _get_license_url(file) + license_url = get_license_url(file) if license_url: license_urls.append(license_url) return license_urls diff --git a/gimie/sources/common/license.py b/gimie/sources/common/license.py index fed1a40a..2360fafd 100644 --- a/gimie/sources/common/license.py +++ b/gimie/sources/common/license.py @@ -1,13 +1,13 @@ import os import re -from spdx_license_list import LICENSES, License +from spdx_license_list import LICENSES from scancode.api import get_licenses from typing import Iterable, List, Optional -from gimie.io import Resource, iterable_to_stream, RemoteResource +from gimie.io import Resource import tempfile -def _get_license_url(license_file: Resource) -> Optional[str]: +def get_license_url(license_file: Resource) -> Optional[str]: """Takes the path of a text file containing a license text, and matches this using the scancode API to get possible license matches. The best match is then returned as a spdx license URL""" diff --git a/gimie/sources/git.py b/gimie/sources/git.py index 5d7a1163..96ba6fc0 100644 --- a/gimie/sources/git.py +++ b/gimie/sources/git.py @@ -17,7 +17,7 @@ """Extractor which uses a locally available (usually cloned) repository.""" from dataclasses import dataclass from datetime import datetime -from typing import List, Optional, Union +from typing import List, Optional import uuid from calamus import fields @@ -28,10 +28,6 @@ from gimie.io import LocalResource from gimie.graph.namespaces import SDO -from gimie.sources.common.license import ( - _get_license_url, - is_license_path, -) from gimie.models import Person, PersonSchema from gimie.sources.abstract import Extractor from pathlib import Path diff --git a/gimie/sources/github.py b/gimie/sources/github.py index 113cdfc9..eb6bb873 100644 --- a/gimie/sources/github.py +++ b/gimie/sources/github.py @@ -40,11 +40,6 @@ from gimie.graph.namespaces import SDO from gimie.io import RemoteResource -from gimie.sources.common.license import ( - get_license_with_highest_coverage, - is_license_path, - _get_license_url, -) from gimie.sources.common.queries import ( send_rest_query, send_graphql_query, diff --git a/gimie/sources/gitlab.py b/gimie/sources/gitlab.py index a3abc5b7..770ce247 100644 --- a/gimie/sources/gitlab.py +++ b/gimie/sources/gitlab.py @@ -21,11 +21,6 @@ PersonSchema, ) from gimie.sources.abstract import Extractor -from gimie.sources.common.license import ( - get_license_with_highest_coverage, - is_license_path, - _get_license_url, -) from gimie.sources.common.queries import send_graphql_query, send_rest_query load_dotenv() @@ -149,7 +144,7 @@ def _safe_extract_author( def _safe_extract_contributors( self, repo: dict[str, Any] - ) -> list[Person] | None: + ) -> List[Person] | None: members = [ user["node"]["user"] for user in repo["projectMembers"]["edges"] From 0fb1efd3fd1ff2fc3dee7f7a4bd3c46cb942fa6e Mon Sep 17 00:00:00 2001 From: cmdoret Date: Tue, 17 Oct 2023 12:23:19 +0200 Subject: [PATCH 12/16] refactor(license): better var names, simpler signatures --- gimie/sources/common/license.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/gimie/sources/common/license.py b/gimie/sources/common/license.py index 2360fafd..d16b610a 100644 --- a/gimie/sources/common/license.py +++ b/gimie/sources/common/license.py @@ -6,6 +6,8 @@ from gimie.io import Resource import tempfile +SPDX_IDS = list(LICENSES.keys()) + def get_license_url(license_file: Resource) -> Optional[str]: """Takes the path of a text file containing a license text, and matches this @@ -19,7 +21,9 @@ def get_license_url(license_file: Resource) -> Optional[str]: "license_detections" ] license_id = get_license_with_highest_coverage(license_detections) # type: ignore - spdx_license_id = get_spdx_license_id(LICENSES.keys(), license_id) + if license_id is None: + return None + spdx_license_id = get_spdx_license_id(license_id) os.unlink(temp_file.name) if spdx_license_id: return f"https://spdx.org/licenses/{str(spdx_license_id)}.html" @@ -28,25 +32,23 @@ def get_license_url(license_file: Resource) -> Optional[str]: def get_spdx_license_id( - ref_licenses: Iterable[str], license_id: Optional[str] + license_id: str, + spdx_ids: Iterable[str] = SPDX_IDS, ) -> Optional[str]: """Given a scancode API license ID also known as a license detection, returns the correctly capitalized spdx id corresponding to it. Parameters ---------- - ref_licenses: Iterable[str] - An iterable of (reference) SPDX license ids. - license_id: Optional[str] + license_id: A license id to match with SPDX licenses. + spdx_ids: + An iterable of (reference) SPDX license ids. """ - lower_ref_licenses = {ref.lower(): ref for ref in ref_licenses} + lower_spdx_ids = {spdx.lower(): spdx for spdx in spdx_ids} - if license_id in lower_ref_licenses: - return lower_ref_licenses[license_id] - - return None + return lower_spdx_ids.get(license_id.lower(), None) def is_license_path(filename: str) -> bool: From 864d0e4b9d76945ef19702b7483fbac71760d235 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Tue, 17 Oct 2023 13:19:41 +0200 Subject: [PATCH 13/16] fix(license): os.unlink -> os.remove --- gimie/sources/common/license.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimie/sources/common/license.py b/gimie/sources/common/license.py index d16b610a..39e00f6b 100644 --- a/gimie/sources/common/license.py +++ b/gimie/sources/common/license.py @@ -24,7 +24,7 @@ def get_license_url(license_file: Resource) -> Optional[str]: if license_id is None: return None spdx_license_id = get_spdx_license_id(license_id) - os.unlink(temp_file.name) + os.remove(temp_file.name) if spdx_license_id: return f"https://spdx.org/licenses/{str(spdx_license_id)}.html" From bf8412c4545f920fd404e59c7f1039c6486a158b Mon Sep 17 00:00:00 2001 From: cmdoret Date: Tue, 17 Oct 2023 13:22:56 +0200 Subject: [PATCH 14/16] doc: better docstring for Extractor._get_licenses --- gimie/sources/abstract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimie/sources/abstract.py b/gimie/sources/abstract.py index 499e978b..d46d5a00 100644 --- a/gimie/sources/abstract.py +++ b/gimie/sources/abstract.py @@ -86,7 +86,7 @@ def base(self) -> str: return self.base_url def _get_licenses(self) -> List[str]: - """Extract a SPDX License URL from a GitHub Repository""" + """Extracts SPDX License URLs from the repository.""" license_files = filter( lambda p: is_license_path(p.name), self.list_files() ) From 71f4ca88a85658afec6a4612d8594851c20922c3 Mon Sep 17 00:00:00 2001 From: cmdoret Date: Tue, 17 Oct 2023 13:52:00 +0200 Subject: [PATCH 15/16] test(license): doctest + better docstrings for license module --- gimie/sources/common/license.py | 50 ++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/gimie/sources/common/license.py b/gimie/sources/common/license.py index 39e00f6b..9538638a 100644 --- a/gimie/sources/common/license.py +++ b/gimie/sources/common/license.py @@ -10,9 +10,16 @@ def get_license_url(license_file: Resource) -> Optional[str]: - """Takes the path of a text file containing a license text, and matches this + """Takes a file-like resource containing a license text, and matches its content using the scancode API to get possible license matches. The best match is - then returned as a spdx license URL""" + then returned as a spdx license URL. + + Examples + -------- + >>> from gimie.io import LocalResource + >>> get_license_url(LocalResource('LICENSE')) + 'https://spdx.org/licenses/Apache-2.0.html' + """ temp_file = tempfile.NamedTemporaryFile(delete=False) temp_file.write(license_file.open().read()) temp_file.close() @@ -44,6 +51,13 @@ def get_spdx_license_id( A license id to match with SPDX licenses. spdx_ids: An iterable of (reference) SPDX license ids. + + Examples + -------- + >>> get_spdx_license_id('apache-2.0') + 'Apache-2.0' + >>> get_spdx_license_id('gpl-3.0') + 'GPL-3.0' """ lower_spdx_ids = {spdx.lower(): spdx for spdx in spdx_ids} @@ -52,7 +66,22 @@ def get_spdx_license_id( def is_license_path(filename: str) -> bool: - """Given an input filename, returns a boolean indicating whether the filename path looks like a license.""" + """Given an input filename, returns a boolean indicating whether the filename path looks like a license. + + Parameters + ---------- + filename: + A filename to check. + + Examples + -------- + >>> is_license_path('LICENSE.txt') + True + >>> is_license_path('LICENSE-APACHE') + True + >>> is_license_path('README.md') + False + """ if filename.startswith("."): return False pattern = r".*(license(s)?.*|lizenz|reus(e|ing).*|copy(ing)?.*)(\.(txt|md|rst))?$" @@ -66,7 +95,20 @@ def get_license_with_highest_coverage( ) -> Optional[str]: """Filters a list of "license detections" (the output of scancode.api.get_licenses) to return the one with the highest match percentage. - This is used to select among multiple license matches from a single file.""" + This is used to select among multiple license matches from a single file. + + Parameters + ---------- + license_detections: + A list of license detections, as returned by scancode.api.get_licenses. + + Examples + -------- + >>> from scancode.api import get_licenses + >>> license_detections = get_licenses('LICENSE')['license_detections'] + >>> get_license_with_highest_coverage(license_detections) + 'apache-2.0' + """ highest_coverage = 0.0 highest_license = None From 53ddd904734afbb0bb12138dee38a77116a3963a Mon Sep 17 00:00:00 2001 From: cmdoret Date: Tue, 17 Oct 2023 13:54:34 +0200 Subject: [PATCH 16/16] refactor(git): better var names in list_files --- gimie/sources/git.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gimie/sources/git.py b/gimie/sources/git.py index 96ba6fc0..09d49561 100644 --- a/gimie/sources/git.py +++ b/gimie/sources/git.py @@ -82,10 +82,10 @@ def list_files(self) -> List[LocalResource]: if self.local_path is None: return file_list - for p in Path(self.local_path).rglob("*"): - if (p.parts[0] == ".git") or not p.is_file(): + for path in Path(self.local_path).rglob("*"): + if (path.parts[0] == ".git") or not path.is_file(): continue - file_list.append(LocalResource(p)) + file_list.append(LocalResource(path)) return file_list