-
-
Notifications
You must be signed in to change notification settings - Fork 540
/
match_spdx_lid.py
423 lines (343 loc) · 14.7 KB
/
match_spdx_lid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import os
import re
import sys
from license_expression import Keyword
from license_expression import LicenseSymbol
from license_expression import LicenseWithExceptionSymbol
from license_expression import Licensing
from licensedcode.match import LicenseMatch
from licensedcode.models import SpdxRule
from licensedcode.spans import Span
from textcode.markup import is_markup_text
from textcode.markup import demarkup_text
"""
Matching strategy for license expressions and "SPDX-License-Identifier:"
expression tags. This is also for spdx license-expressions with other
prefix strings (example: NuGet License URLs).
The matching approach is a tad different:
First, we do not run this matcher against whole queries. Instead the matchable
text is collected during the query processing as Query.spdx_lines for any line
that starts with these tokens ['spdx', 'license', 'identifier'] or ['spdx',
'licence', 'identifier'] beginning with the first, second or third token position
in a line.
Then the words after "SPDX-license-identifier" are parsed as if they were an
SPDX license expression (with a few extra symbols and/or deprecated symbols
added to the list of license keys).
"""
# Tracing flags
TRACE = False


def logger_debug(*args):
    """
    Do nothing: default tracing hook, replaced below when tracing is enabled.
    """


if TRACE or os.environ.get('SCANCODE_DEBUG_LICENSE'):
    import logging

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log ``args`` at DEBUG level as a single space-separated message.
        Non-empty strings are used as-is; anything else is rendered with repr().
        """
        rendered = (
            arg if (isinstance(arg, str) and arg) else repr(arg)
            for arg in args
        )
        return logger.debug(' '.join(rendered))
# Matcher name passed as ``matcher`` to the LicenseMatch built by
# spdx_id_match() in this module.
MATCH_SPDX_ID = '1-spdx-id'
# Passed as ``matcher_order`` to that same LicenseMatch; presumably ranks this
# matcher relative to the other matchers -- TODO confirm against LicenseMatch.
MATCH_SPDX_ID_ORDER = 2
def spdx_id_match(idx, query_run, text, expression_symbols=None):
    """
    Return a single LicenseMatch built by parsing ``text`` as an SPDX license
    expression, or None when no expression can be obtained from ``text``.

    The match covers the whole ``query_run``; the ``idx`` index provides the
    ``len_legalese`` threshold used to compute the high-token span.

    ``expression_symbols`` is an optional mapping of {lowered key:
    LicenseSymbol}. When not provided, the standard SPDX license symbols are
    used.
    """
    if TRACE:
        logger_debug('spdx_id_match: start:', 'text:', text, 'query_run:', query_run)

    # matching proper
    license_expression = get_spdx_expression(
        text,
        expression_symbols=expression_symbols,
    )
    if not license_expression:
        return None

    length = len(query_run)
    start = query_run.start
    tokens = query_run.tokens

    # Build a synthetic rule for this expression.
    # TODO: ensure that all the SPDX license keys are known symbols
    # FIXME: the original query text is used as the rule text for now. This is
    # likely incorrect for computing the known/unknown and high/low tokens of
    # the rule. Using the expression string padded with
    # "spdx-license-identifier:" instead may be wrong too when the line did not
    # carry that tag originally.
    synthetic_rule = SpdxRule(
        license_expression=license_expression,
        text=text,
        length=length,
    )

    # The match spans the whole query run on the query side ...
    query_span = Span(range(start, query_run.end + 1))
    # ... and the index-side span is derived from the query side length.
    index_span = Span(range(0, length))

    legalese_limit = idx.len_legalese
    high_span = Span(
        position
        for position, token_id in enumerate(tokens)
        if token_id < legalese_limit
    )

    return LicenseMatch(
        rule=synthetic_rule,
        qspan=query_span,
        ispan=index_span,
        hispan=high_span,
        query_run_start=start,
        matcher=MATCH_SPDX_ID,
        matcher_order=MATCH_SPDX_ID_ORDER,
        query=query_run.query,
    )
def get_spdx_expression(text, expression_symbols=None):
    """
    Return a license expression string obtained by parsing ``text`` as an SPDX
    license expression, or None when there is nothing to parse.

    ``expression_symbols`` is an optional mapping of {lowered key:
    LicenseSymbol}. When it is empty or not provided, the standard SPDX license
    symbols are used instead. A custom mapping is how mixed standard SPDX and
    non-standard SPDX-like symbols (as found in some package manifests) are
    handled.
    """
    from licensedcode.cache import get_spdx_symbols
    from licensedcode.cache import get_unknown_spdx_symbol

    licensing = Licensing()
    symbols = expression_symbols or get_spdx_symbols()
    unknown = get_unknown_spdx_symbol()

    parsed = get_expression(
        text=text,
        licensing=licensing,
        expression_symbols=symbols,
        unknown_symbol=unknown,
    )
    return None if parsed is None else parsed.render()
def get_expression(text, licensing, expression_symbols, unknown_symbol):
    """
    Return an Expression object built by parsing the ``text`` string with the
    ``licensing`` reference Licensing, or None when nothing is left of ``text``
    once it has been prepared.

    ``expression_symbols`` is the mapping of {lowered key: LicenseSymbol} used
    to resolve license keys.

    For a non-empty prepared text an expression is always returned: when both
    the strict and the lenient parsing fail or yield nothing, the bare
    ``unknown_symbol`` is returned instead.
    """
    _prefix, expression_text = prepare_text(text)
    if not expression_text:
        return None

    parse_kwargs = dict(
        text=expression_text,
        licensing=licensing,
        expression_symbols=expression_symbols,
        unknown_symbol=unknown_symbol,
    )

    try:
        parsed = _parse_expression(**parse_kwargs)
    except Exception:
        # Strict parsing failed: retry with the lenient, recovering parser that
        # copes with plain space- or comma-separated lists of licenses (e.g.
        # U-Boot style).
        try:
            parsed = _reparse_invalid_expression(**parse_kwargs)
        except Exception:
            parsed = None

    return unknown_symbol if parsed is None else parsed
# Deprecated SPDX ids are no longer referenced in licenses, so they are tracked
# here. Once built, this maps each parsed old SPDX id to the parsed expression
# that replaces it.
OLD_SPDX_EXCEPTION_LICENSES_SUBS = None


def get_old_expressions_subs_table(licensing):
    """
    Return a mapping of {parsed old SPDX id: parsed replacement expression}
    built with ``licensing.parse()``.

    The mapping is built on first use and cached in the module-level
    OLD_SPDX_EXCEPTION_LICENSES_SUBS: later calls return the cached mapping
    without using their ``licensing`` argument.
    """
    global OLD_SPDX_EXCEPTION_LICENSES_SUBS

    if OLD_SPDX_EXCEPTION_LICENSES_SUBS:
        return OLD_SPDX_EXCEPTION_LICENSES_SUBS

    # old SPDX id -> new SPDX expression
    new_expressions_by_old_id = {
        'eCos-2.0': 'GPL-2.0-or-later WITH eCos-exception-2.0',
        'GPL-2.0-with-autoconf-exception': 'GPL-2.0-only WITH Autoconf-exception-2.0',
        'GPL-2.0-with-bison-exception': 'GPL-2.0-only WITH Bison-exception-2.2',
        'GPL-2.0-with-classpath-exception': 'GPL-2.0-only WITH Classpath-exception-2.0',
        'GPL-2.0-with-font-exception': 'GPL-2.0-only WITH Font-exception-2.0',
        'GPL-2.0-with-GCC-exception': 'GPL-2.0-only WITH GCC-exception-2.0',
        'GPL-3.0-with-autoconf-exception': 'GPL-3.0-only WITH Autoconf-exception-3.0',
        'GPL-3.0-with-GCC-exception': 'GPL-3.0-only WITH GCC-exception-3.1',
        'wxWindows': 'LGPL-2.0-or-later WITH WxWindows-exception-3.1',
    }

    # Both sides are lowercased before being parsed.
    lowered_pairs = [
        (old_id.lower(), new_expression.lower())
        for old_id, new_expression in new_expressions_by_old_id.items()
    ]
    OLD_SPDX_EXCEPTION_LICENSES_SUBS = {
        licensing.parse(old_id): licensing.parse(new_expression)
        for old_id, new_expression in lowered_pairs
    }
    return OLD_SPDX_EXCEPTION_LICENSES_SUBS
def _parse_expression(text, licensing, expression_symbols, unknown_symbol):
    """
    Return a LicenseExpression object built by parsing the ``text`` string with
    the ``licensing`` reference Licensing. Return None for an empty ``text`` or
    when the parser returns None. Parsing errors are not caught here.

    License keys are resolved through the ``expression_symbols`` mapping of
    {lowered key: LicenseSymbol}; keys missing from that mapping are replaced
    by ``unknown_symbol``.
    """
    if not text:
        return None

    parsed = licensing.parse(text.lower(), simple=True)
    if parsed is None:
        return None

    # Replace old, deprecated SPDX symbols with their current expression.
    modernized = parsed.subs(get_old_expressions_subs_table(licensing))

    def resolve(license_symbol):
        # Known symbol for this key, or the unknown-spdx symbol otherwise.
        return expression_symbols.get(license_symbol.key.lower(), unknown_symbol)

    def replacement_for(license_symbol):
        if isinstance(license_symbol, LicenseWithExceptionSymbol):
            # A "WITH" symbol wraps two symbols: resolve each side separately
            # and wrap them again in a new symbol.
            return LicenseWithExceptionSymbol(
                license_symbol=resolve(license_symbol.license_symbol),
                exception_symbol=resolve(license_symbol.exception_symbol),
            )
        return resolve(license_symbol)

    found_symbols = licensing.license_symbols(
        modernized, unique=True, decompose=False)
    replacements = {
        found: replacement_for(found) for found in found_symbols
    }
    return modernized.subs(replacements)
def _reparse_invalid_expression(
    text,
    licensing,
    expression_symbols,
    unknown_symbol
):
    """
    Return an Expression object built by a best-effort parsing of the ``text``
    string with the ``licensing`` reference Licensing, or None when ``text`` is
    empty.

    ``text`` is assumed to be an invalid expression that does not parse. Only
    its license symbols are kept: keywords and parens are ignored.

    License keys are resolved through the ``expression_symbols`` mapping of
    {lowered key: LicenseSymbol}.

    When ``text`` contains no license symbol at all, the bare ``unknown_symbol``
    is returned.
    """
    if not text:
        return None

    # Keep only license symbols and keywords from the tokenized text.
    token_values = (result.value for result in licensing.simple_tokenizer(text))
    tokens = [
        value for value in token_values
        if isinstance(value, (LicenseSymbol, Keyword))
    ]

    # This mix of keywords and symbols does not parse, possibly because of
    # unbalanced parens or for any other reason: drop the keywords and
    # remember whether there was any.
    symbols = [token for token in tokens if not isinstance(token, Keyword)]
    has_keywords = len(symbols) != len(tokens)

    if not symbols:
        return unknown_symbol

    # Build and reparse a synthetic expression. A bare list of symbols with no
    # keyword is treated U-Boot-style as a list of OR choices; anything else is
    # joined with AND. The result may not represent the invalid original
    # faithfully.
    joined_as = ' AND ' if has_keywords else ' OR '
    expression_text = joined_as.join(symbol.key for symbol in symbols)
    expression = _parse_expression(
        expression_text, licensing, expression_symbols, unknown_symbol)

    if not has_keywords:
        return expression

    # This was more than a U-Boot-style list of license keys: append an
    # arbitrary unknown-spdx symbol as a witness that the original expression
    # was invalid.
    return licensing.AND(expression, unknown_symbol)
def prepare_text(text):
    """
    Return a 2-tuple of (``prefix``, ``expression_text``) built from ``text``.

    ``prefix`` is the stripped SPDX-License-Identifier-like tag found in
    ``text``, or None when there is none. ``expression_text`` is the remainder,
    cleaned with clean_text() to be suitable for SPDX license identifier
    detection.
    """
    if is_markup_text(text):
        text = demarkup_text(text)

    prefix, expression = split_spdx_lid(text)
    if prefix is not None:
        prefix = prefix.strip()

    return prefix, clean_text(expression)
def clean_text(text):
    """
    Return ``text`` made suitable for SPDX license identifier detection:
    markup is removed, certain leading and trailing punctuations are stripped
    and spaces are normalized.
    """
    if is_markup_text(text):
        text = demarkup_text(text)

    # Drop closing tags left dangling in the text.
    for closing_tag in ('</a>', '</p>', '</div>', '</licenseUrl>'):
        text = text.replace(closing_tag, '')

    text = ' '.join(text.split())

    punctuation_spaces = "!\"#$%&'*,-./:;<=>?@[\\]^_`{|}~\t\r\n "
    # Also strip expression punctuation found in the wrong spot: closing parens
    # and + at the head, opening parens at the tail.
    text = text.lstrip(punctuation_spaces + ")+")
    text = text.rstrip(punctuation_spaces + "(")

    # Fix the common case of a single paren missing its counterpart.
    opening_count = text.count('(')
    closing_count = text.count(')')
    if opening_count == 1 and closing_count == 0:
        text = text.replace('(', ' ')
    elif closing_count == 1 and opening_count == 0:
        text = text.replace(')', ' ')

    # Keep only what precedes the first '">' when the fragment that follows it
    # is already contained in that leading part.
    if '">' in text:
        head, following, *_ = text.split('">')
        if following in head:
            text = head

    return ' '.join(text.split())
# Splitter on an "SPDX-License-Identifier:"-like tag. The capturing group makes
# split() keep the matched tag in the returned segments.
_split_spdx_lid = re.compile(
    '(spdx(?:\\-|\\s)+licen(?:s|c)e(?:\\-|\\s)+identifier\\s*:?\\s*)',
    flags=re.IGNORECASE,
).split

# Same, for a "licenses.nuget.org"-like prefix.
_nuget_split_spdx_lid = re.compile(
    '(licenses(?:\\.|\\s)+nuget(?:\\.|\\s)+org\\s*:?\\s*)',
    flags=re.IGNORECASE,
).split


def split_spdx_lid(text):
    """
    Split ``text`` on an "SPDX license identifier" tag or, failing that, on a
    "licenses.nuget.org" prefix.

    Return a 2-tuple of (tag, remainder) where ``tag`` is the last matched tag
    text proper and ``remainder`` is the text that follows it (expected to be a
    license expression). When neither tag is found, return a 2-tuple of
    (None, original text).
    """
    for splitter in (_split_spdx_lid, _nuget_split_spdx_lid):
        segments = splitter(text)
        if len(segments) > 1:
            # The last two segments are the last tag and what follows it.
            tag, remainder = segments[-2:]
            return tag, remainder

    return None, text