Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cff to doi parser #107

Merged
merged 7 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions gimie/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from gimie.io import Resource
from gimie.parsers.abstract import Parser
from gimie.parsers.license import LicenseParser, is_license_filename
from gimie.parsers.cff import CffParser


class ParserInfo(NamedTuple):
Expand All @@ -31,6 +32,7 @@ class ParserInfo(NamedTuple):

PARSERS = {
"license": ParserInfo(default=True, type=LicenseParser),
"cff": ParserInfo(default=True, type=CffParser),
}


Expand Down Expand Up @@ -72,6 +74,8 @@ def select_parser(
# Only parse licenses in the root directory
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
if is_license_filename(path.name) and len(path.parts) == 1:
name = "license"
elif path.name == "CITATION.cff":
name = "cff"
else:
return None

Expand Down
72 changes: 72 additions & 0 deletions gimie/parsers/cff.py
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from io import BytesIO
import re
from typing import List, Optional, Set

from rdflib.term import URIRef

from gimie.graph.namespaces import SDO
from gimie.parsers.abstract import Parser, Property


class CffParser(Parser):
    """Parser that turns the DOI of a CFF file into schema:citation <doi>."""

    def __init__(self):
        super().__init__()

    def parse(self, data: bytes) -> Set[Property]:
        """Look up the DOI in a CFF file body and wrap it as a property.

        Parameters
        ----------
        data:
            The cff file body as bytes.

        Returns
        -------
        A set holding the single tuple (<schema:citation>, <doi>) when
        a DOI is found, otherwise an empty set.
        """
        doi = get_cff_doi(data)

        # Nothing (or an empty value) was extracted: no property to emit.
        if not doi:
            return set()

        return {(SDO.citation, URIRef(doi))}


def get_cff_doi(data: bytes) -> Optional[str]:
    """Given a CFF file, returns the DOI, if any.

    Only a top-level ``doi`` key (starting at the beginning of a line) is
    considered; indented keys are ignored. The key name is matched
    case-insensitively. Surrounding whitespace (including the carriage
    return of CRLF line endings) and one pair of matching YAML quotes
    around the value are removed. An empty value is treated as no DOI.

    Parameters
    ----------
    data:
        The cff file body as bytes.

    Returns
    -------
    The DOI string, or None when the file has no top-level doi key or
    its value is empty.

    Examples
    --------
    >>> get_cff_doi(bytes("doi: 10.5281/zenodo.1234", encoding="utf8"))
    '10.5281/zenodo.1234'
    >>> get_cff_doi(bytes('doi: "10.5281/zenodo.1234"', encoding="utf8"))
    '10.5281/zenodo.1234'
    >>> get_cff_doi(bytes("abc: def", encoding="utf8"))

    """

    match = re.search(
        r"^doi: *(.*)$",
        data.decode(),
        flags=re.IGNORECASE | re.MULTILINE,
    )
    if match is None:
        return None

    # `.` also matches "\r" and `$` only anchors before "\n", so on CRLF
    # files the capture ends with a carriage return: strip it together
    # with any other padding.
    doi = match.group(1).strip()

    # YAML allows the scalar to be single- or double-quoted; the quotes
    # are not part of the DOI.
    if len(doi) >= 2 and doi[0] == doi[-1] and doi[0] in ("'", '"'):
        doi = doi[1:-1].strip()

    # A key with an empty value means there is no DOI.
    return doi or None
2 changes: 1 addition & 1 deletion gimie/parsers/license/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:
"""
# Compute tfidf vector for input license
vectorizer = load_tfidf_vectorizer()
input_vec = vectorizer.transform([str(data)])
input_vec = vectorizer.transform([data.decode()])

# Load ids and tfidf vectors for spdx licenses
spdx_licenses = load_spdx_ids()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ classifiers = [
# Dependency management

[tool.poetry.dependencies]
python = ">=3.9,<=3.11"
python = ">=3.9,<=3.12"
gitpython = ">=3.1.35"
PyDriller = "^2.5"
pyshacl = "^0.20.0"
Expand Down
Loading