Skip to content

Commit

Permalink
Improve copyright POS tagging #930
Browse files Browse the repository at this point in the history
 * minor changes mostly from scanning several npms

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
  • Loading branch information
pombredanne committed Mar 12, 2018
1 parent 7bbc218 commit 469b254
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 15 deletions.
29 changes: 16 additions & 13 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,13 @@ def detect(location):
(r'^(Send|It|Mac|Support|Confidential|Information|Various|Mouse|Wheel'
r'|Vendor|Commercial|Indemnified|Luxi|These|Several|GnuPG|WPA|Supplicant'
r'|TagSoup|Contact|IA64|Foreign|Data|Atomic|Pentium|Note|Delay|Separa.*|Added'
r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New'
r'|Glib|Gnome|Gaim|Open|Possible|In|Read|Permissions?|New|MIT'
r')$', 'NN'),

# Various non CAPS
(r'^(OR)$', 'NN'),

# Various rare non CAPS but NNP
# Various rare non CAPS but NNP, treated as full names
(r'^(FSF[\.,]?)$', 'NAME'),

# Windows XP
Expand All @@ -252,6 +252,9 @@ def detect(location):
r'[Ff]unctionality|bgcolor|F+|Rewrote|Much|remains?,?|Implementation|earlier'
r'|al.|is|laws|url|[Ss]ee)$', 'JUNK'),

# Some mixed case junk
(r'^LastModified$', 'JUNK'),

# Some font names
(r'^Lucida$', 'JUNK'),

Expand All @@ -277,8 +280,8 @@ def detect(location):

(r'^\$?LastChangedDate\$?$', 'YR'),

# Misc corner cases
(r'^Software,\',|\(Royal|PARADIGM|nexB|Antill\',$', 'NNP'),
# Misc corner cases that are NNP
(r'^Software,\',|\(Royal|PARADIGM|nexB|okunishinishi|yiminghe|Antill\',$', 'NNP'),

# rarer caps
# EPFL-LRC/ICA
Expand Down Expand Up @@ -347,9 +350,11 @@ def detect(location):
(r'^HOLDER\(S\)$', 'JUNK'),
(r'^([Hh]olders?|HOLDERS?)$', 'HOLDER'),

(r'^([Rr]espective)$', 'NN'),
# affiliates
(r'^[Aa]ffiliates?\.?$', 'NNP'),
# not NNPs
(r'^([Rr]espective|JavaScript)$', 'NN'),

# affiliates or "and its affiliate(s)."
(r'^[Aa]ffiliate(s|\(s\))?\.?$', 'NNP'),

# OU as in Org unit, found in some certificates
(r'^OU$', 'OU'),
Expand Down Expand Up @@ -494,11 +499,8 @@ def detect(location):
# all CAPS word, all letters including an optional trailing single quote
(r"^[A-Z]{2,}\'?$", 'CAPS'),

# email, optionally in parens or brackets. The closing > or ) is optional
(r'[\<\(][a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?[\>\)]?', 'EMAIL'),

# email
(r'[a-zA-Z0-9\+_\-\.\%]+(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]*\.[a-zA-Z]{2,5}?', 'EMAIL'),
# email, optionally in parens or brackets, with optional trailing punctuation
(r'^[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,5}?[\>\)\.\,]*$', 'EMAIL'),

# URLS such as <(http://fedorahosted.org/lohit)>
(r'[<\(]https?:.*[>\)]', 'URL'),
Expand All @@ -525,6 +527,7 @@ def detect(location):

# comma as a conjunction
(r'^,$', 'CC'),

# .\" is not a noun
(r'^\.\\\?"?$', 'JUNK'),

Expand All @@ -537,7 +540,7 @@ def detect(location):
# communications
(r'communications', 'NNP'),

# Code variable names, snake case
# Code variable names including snake case
(r'^.*(_.*)+$', 'JUNK'),

# nouns (default)
Expand Down
1 change: 1 addition & 0 deletions tests/cluecode/data/copyrights/junk_trailing.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Otherwood (c) 2011 note this implementation is heavily based/inspired from the dictionary implementation
4 changes: 4 additions & 0 deletions tests/cluecode/data/copyrights/junk_trailing.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
what:
- copyrights
copyrights:
- Otherwood (c) 2011
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@ copyrights:
- Copyright 2006, Joe Gregorio contributors Thomas Broyer (t.broyer@ltgt.net)', James Antill
holders:
- Joe Gregorio contributors
- Thomas Broyer James Antill
- Thomas Broyer
- (t.broyer@ltgt.net)
- James Antill
holders_summary:
- Joe Gregorio contributors
- Thomas Broyer James Antill
- Thomas Broyer
- (t.broyer@ltgt.net)
- James Antill
notes: extra trailing contribution should not be detected

0 comments on commit 469b254

Please sign in to comment.