Skip to content

Commit

Permalink
Add maven license detection updates
Browse files Browse the repository at this point in the history
Add a new config variable for datafile handlers, which if enabled
will run the package license detection on the whole extracted
license statement and not on the respective values/attributes
one-by-one. Enabled this for maven to improve license detection there.

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
  • Loading branch information
AyanSinhaMahapatra committed Jul 5, 2023
1 parent 9fcdc92 commit bc8c56d
Show file tree
Hide file tree
Showing 332 changed files with 1,709 additions and 3,637 deletions.
26 changes: 21 additions & 5 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from summarycode.classify import LEGAL_STARTS_ENDS
from summarycode.classify import README_STARTS_ENDS

import saneyaml


"""
Detect and normalize licenses as found in package manifests data.
Expand Down Expand Up @@ -643,6 +645,7 @@ def get_normalized_license_detections(
try_as_expression=True,
approximate=True,
expression_symbols=None,
detect_license_on_full_extracted_statement=False,
):
"""
Return a normalized license expression string detected from a list of
Expand All @@ -657,7 +660,18 @@ def get_normalized_license_detections(
logger_debug(f'get_normalized_license_detections: extracted_license: {extracted_license}')
logger_debug(f'get_normalized_license_detections: type(extracted_license): {type(extracted_license)}')

if not isinstance(extracted_license, list):
if detect_license_on_full_extracted_statement:
extracted_license_statement = saneyaml.dump(extracted_license)
license_detections = get_license_detections_for_extracted_license_statement(
extracted_license_statement=extracted_license_statement,
try_as_expression=try_as_expression,
approximate=approximate,
expression_symbols=expression_symbols,
)
if TRACE:
logger_debug(f'get_normalized_license_detections: detect_license_on_full_extracted_statement:')

elif not isinstance(extracted_license, list):
if isinstance(extracted_license, str):
license_detections = get_license_detections_for_extracted_license_statement(
extracted_license_statement=extracted_license,
Expand All @@ -683,7 +697,7 @@ def get_normalized_license_detections(
license_detections.extend(detections)

else:
extracted_license_statement = repr(extracted_license)
extracted_license_statement = saneyaml.dump(extracted_license)
license_detections = get_license_detections_for_extracted_license_statement(
extracted_license_statement=extracted_license_statement,
try_as_expression=try_as_expression,
Expand Down Expand Up @@ -725,7 +739,7 @@ def get_normalized_license_detections(
license_detections.extend(detections)

else:
extracted_license_statement = repr(extracted_license_item)
extracted_license_statement = saneyaml.dump(extracted_license_item)

detections = get_license_detections_for_extracted_license_statement(
extracted_license_statement=extracted_license_statement,
Expand All @@ -746,6 +760,7 @@ def get_normalized_license_detections(
def get_license_detections_and_expression(
extracted_license_statement,
default_relation_license=None,
detect_license_on_full_extracted_statement=False,
try_as_expression=True,
approximate=True,
expression_symbols=None,
Expand Down Expand Up @@ -775,11 +790,12 @@ def get_license_detections_and_expression(
try_as_expression=try_as_expression,
approximate=approximate,
expression_symbols=expression_symbols,
detect_license_on_full_extracted_statement=detect_license_on_full_extracted_statement,
)

if not license_detections:
if not isinstance(extracted_license_statement, str):
extracted_license_statement = repr(extracted_license_statement)
extracted_license_statement = saneyaml.dump(extracted_license_statement)
license_detection = get_unknown_license_detection(query_string=extracted_license_statement)
license_detections = [license_detection]

Expand Down Expand Up @@ -808,7 +824,7 @@ def get_license_detections_for_extracted_license_statement(
return []

if not isinstance(extracted_license_statement, str):
extracted_license_statement = repr(extracted_license_statement)
extracted_license_statement = saneyaml.dump(extracted_license_statement)

matches, matched_as_expression = get_license_matches_for_extracted_license_statement(
query_string=extracted_license_statement,
Expand Down
28 changes: 21 additions & 7 deletions src/packagedcode/maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class MavenPomXmlHandler(models.DatafileHandler):
default_primary_language = 'Java'
description = 'Apache Maven pom'
documentation_url = 'https://maven.apache.org/pom.html'
detect_license_on_full_extracted_statement = True

# TODO: implement more sophisticated assembly with META-INF/MANIFEST.MF and META-INF/LICENSE

Expand Down Expand Up @@ -629,13 +630,26 @@ def _get_comments(self, xml=None):
def _find_licenses(self):
"""Return an iterable of license mappings."""
for lic in self.pom_data.findall('licenses/license'):
yield dict([
('name', self._get_attribute('name', lic)),
('url', self._get_attribute('url', lic)),
('comments', self._get_attribute('comments', lic)),
# arcane and seldom used
('distribution', self._get_attribute('distribution', lic)),
])
lic_mapping = {}

name = self._get_attribute('name', lic)
if name:
lic_mapping["name"] = name

url = self._get_attribute('url', lic)
if url:
lic_mapping["url"] = url

comments = self._get_attribute('comments', lic)
if comments:
lic_mapping["comments"] = comments

# arcane and seldom used
distribution = self._get_attribute('distribution', lic)
#if distribution:
# lic_mapping["distribution"] = distribution

yield lic_mapping

def _find_parties(self, key='developers/developer'):
"""Return an iterable of party mappings for a given xpath."""
Expand Down
16 changes: 15 additions & 1 deletion src/packagedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -874,12 +874,17 @@ def get_license_detections_and_expression(self):
default_relation_license=get_default_relation_license(
datasource_id=self.datasource_id,
)
detect_license_on_full_extracted_statement = get_detect_license_on_full_extracted_statement(
datasource_id=self.datasource_id,
)
else:
default_relation_license = 'AND'
detect_license_on_full_extracted_statement = False

return get_license_detections_and_expression(
extracted_license_statement=self.extracted_license_statement,
default_relation_license=default_relation_license
default_relation_license=default_relation_license,
detect_license_on_full_extracted_statement=detect_license_on_full_extracted_statement,
)


Expand All @@ -889,6 +894,11 @@ def get_default_relation_license(datasource_id):
return handler.default_relation_license


def get_detect_license_on_full_extracted_statement(datasource_id):
    """
    Return the ``detect_license_on_full_extracted_statement`` attribute of the
    handler found under ``datasource_id`` in ``HANDLER_BY_DATASOURCE_ID``.
    """
    # Imported at call time rather than module level, as in the original
    # (presumably to avoid a circular import with ``packagedcode`` -- confirm).
    from packagedcode import HANDLER_BY_DATASOURCE_ID
    return HANDLER_BY_DATASOURCE_ID[datasource_id].detect_license_on_full_extracted_statement

def _rehydrate_list(cls, values):
"""
Yield ``cls`` objects built from a ``values`` list of mappings.
Expand Down Expand Up @@ -969,6 +979,10 @@ class DatafileHandler:
# Default Relation between license elements detected in an `extracted_license_statement`
default_relation_license = None

# Whether to consider the whole `extracted_license_statement` as one text to run
# license detection on, or run this separately for the elements/values
detect_license_on_full_extracted_statement = None

@classmethod
def is_datafile(cls, location, filetypes=tuple(), _bare_filename=False):
"""
Expand Down
57 changes: 20 additions & 37 deletions tests/formattedcode/data/common/manifests-expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,40 +36,34 @@
"end_line": 1,
"matched_length": 8,
"match_coverage": 100.0,
"matcher": "1-hash",
"matcher": "2-aho",
"license_expression": "cddl-1.0",
"rule_identifier": "cddl-1.0.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0.RULE",
"matched_text": "Common Development and Distribution License (CDDL) v1.0"
}
],
"identifier": "cddl_1_0-9893b55c-3b2b-4ee8-a932-6c6c93a63fc5"
},
{
"license_expression": "cddl-1.0",
"matches": [
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0"
},
{
"score": 100.0,
"start_line": 1,
"end_line": 1,
"start_line": 2,
"end_line": 2,
"matched_length": 7,
"match_coverage": 100.0,
"matcher": "1-hash",
"matcher": "2-aho",
"license_expression": "cddl-1.0",
"rule_identifier": "cddl-1.0_4.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_4.RULE",
"matched_text": "http://www.sun.com/cddl/cddl.html"
"matched_text": " url: http://www.sun.com/cddl/cddl.html"
}
],
"identifier": "cddl_1_0-ef82fc8c-50cb-6f35-1814-d2eb0bc13e83"
"identifier": "cddl_1_0-dd3dd7df-afca-6a5e-492c-f7b279fdd880"
}
],
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n comments:\n distribution: repo\n",
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"notice_text": null,
"source_packages": [
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"
Expand Down Expand Up @@ -480,18 +474,13 @@
"license_expression": "apache-2.0",
"detection_count": 1
},
{
"identifier": "cddl_1_0-9893b55c-3b2b-4ee8-a932-6c6c93a63fc5",
"license_expression": "cddl-1.0",
"detection_count": 1
},
{
"identifier": "cddl_1_0-c6dbef4d-659c-289f-5ee9-1ca0278edad6",
"license_expression": "cddl-1.0",
"detection_count": 1
},
{
"identifier": "cddl_1_0-ef82fc8c-50cb-6f35-1814-d2eb0bc13e83",
"identifier": "cddl_1_0-dd3dd7df-afca-6a5e-492c-f7b279fdd880",
"license_expression": "cddl-1.0",
"detection_count": 1
},
Expand Down Expand Up @@ -653,40 +642,34 @@
"end_line": 1,
"matched_length": 8,
"match_coverage": 100.0,
"matcher": "1-hash",
"matcher": "2-aho",
"license_expression": "cddl-1.0",
"rule_identifier": "cddl-1.0.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0.RULE",
"matched_text": "Common Development and Distribution License (CDDL) v1.0"
}
],
"identifier": "cddl_1_0-9893b55c-3b2b-4ee8-a932-6c6c93a63fc5"
},
{
"license_expression": "cddl-1.0",
"matches": [
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0"
},
{
"score": 100.0,
"start_line": 1,
"end_line": 1,
"start_line": 2,
"end_line": 2,
"matched_length": 7,
"match_coverage": 100.0,
"matcher": "1-hash",
"matcher": "2-aho",
"license_expression": "cddl-1.0",
"rule_identifier": "cddl-1.0_4.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_4.RULE",
"matched_text": "http://www.sun.com/cddl/cddl.html"
"matched_text": " url: http://www.sun.com/cddl/cddl.html"
}
],
"identifier": "cddl_1_0-ef82fc8c-50cb-6f35-1814-d2eb0bc13e83"
"identifier": "cddl_1_0-dd3dd7df-afca-6a5e-492c-f7b279fdd880"
}
],
"other_license_expression": null,
"other_license_expression_spdx": null,
"other_license_detections": [],
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n comments:\n distribution: repo\n",
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
"notice_text": null,
"source_packages": [
"pkg:maven/javax.persistence/persistence-api@1.0?classifier=sources"
Expand Down Expand Up @@ -1397,7 +1380,7 @@
"rule_identifier": "pypi_gnu_lesser_general_public_license_v3.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/pypi_gnu_lesser_general_public_license_v3.RULE",
"matched_text": "['License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)']"
"matched_text": "- 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)'"
}
],
"identifier": "lgpl_3_0-272571eb-5e68-95b6-ddb0-71de2d8df321"
Expand Down
Loading

0 comments on commit bc8c56d

Please sign in to comment.