Skip to content

Commit

Permalink
Improve replacement of HTML entities #930
Browse files Browse the repository at this point in the history
Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
  • Loading branch information
pombredanne committed Mar 12, 2018
1 parent 922d00e commit 6ee9e06
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 15 deletions.
4 changes: 2 additions & 2 deletions etc/scripts/testdata/livescan/expected.csv
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ Resource,type,name,base_name,extension,size,date,sha1,md5,mime_type,file_type,pr
/package.json,file,package.json,package,.json,2200,2017-10-03,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,15.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit_27.RULE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,100.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit.LICENSE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost &lt tj@learnboost.com&gt,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost &lt,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost <tj@learnboost.com>,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,12,,,,,,tj@learnboost.com,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,16,,,,,,,https://github.com/visionmedia/node-cookie-signature.git,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,27,,,,,,,https://github.com/visionmedia/node-cookie-signature/issues,,,,,,,,,,,,,,,
Expand Down
27 changes: 21 additions & 6 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -1795,8 +1795,8 @@ def prepare_text_line(line):
"""
Prepare a line of text for copyright detection.
"""
re_sub = re.sub
# FIXME: maintain the original character positions
re_sub = re.sub

# strip whitespace
line = line.strip()
Expand All @@ -1811,13 +1811,10 @@ def prepare_text_line(line):
# replace ('
line = line.replace(r'("', ' ')

# strip comment markers
# common comment characters
line = line.strip('\\/*#%;')
# uncommon comment line prefix in dos
line = re_sub('^rem\s+', ' ', line)
line = re_sub('^\@rem\s+', ' ', line)
# un common comment line prefix in autotools am/in
# less common comment line prefix in autotools am/in
line = re_sub('^dnl\s+', ' ', line)
# uncommon comment line prefix in man pages
line = re_sub('^\.\\\\"', ' ', line)
Expand All @@ -1832,7 +1829,9 @@ def prepare_text_line(line):
line = line.replace('&copy;', ' (c) ')
line = line.replace('&#169;', ' (c) ')
line = line.replace('&#xa9;', ' (c) ')
line = line.replace('&#XA9;', ' (c) ')
line = line.replace(u'\xa9', ' (c) ')
line = line.replace(u'\XA9', ' (c) ')
# FIXME: what is \xc2???
line = line.replace(u'\xc2', '')

Expand All @@ -1842,6 +1841,16 @@ def prepare_text_line(line):
line = line.replace(u'&#13;&#10;', ' ')
line = line.replace(u'&#13;', ' ')
line = line.replace(u'&#10;', ' ')
# spaces
line = line.replace(u'&ensp;', ' ')
line = line.replace(u'&emsp;', ' ')
line = line.replace(u'&thinsp;', ' ')

# common named entities
line = line.replace(u'&quot;', '"').replace(u'&#34;', '"')
line = line.replace(u'&amp;', '&').replace(u'&#38;', '&')
line = line.replace(u'&gt;', '>').replace(u'&#62;', '>')
line = line.replace(u'&lt;', '<').replace(u'&#60;', '<')

# normalize (possibly repeated) quotes to unique single quote '
# backticks ` and "
Expand All @@ -1851,8 +1860,10 @@ def prepare_text_line(line):
# quotes to space? but t'so will be wrecked
# line = line.replace(u"'", ' ')

# remove explicit \\n
# treat explicit CR, LF and tabs as space
line = line.replace("\\n", ' ')
line = line.replace("\\r", ' ')
line = line.replace("\\t", ' ')

# remove backslash
line = line.replace("\\", ' ')
Expand Down Expand Up @@ -1908,4 +1919,8 @@ def prepare_text_line(line):
# why?
line = lowercase_well_known_word(line)

# strip comment markers
# common comment characters
line = line.strip('\\/*#%;')

return line
1 change: 1 addition & 0 deletions tests/cluecode/data/copyrights/boost.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
\n\nCopyright (c) 2012 LearnBoost &lt;tj@learnboost.com&gt;\n\n
11 changes: 11 additions & 0 deletions tests/cluecode/data/copyrights/boost.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
what:
- copyrights
- holders
- holders_summary

copyrights:
- Copyright (c) 2012 LearnBoost <tj@learnboost.com>
holders:
- LearnBoost
holders_summary:
- LearnBoost
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ what:
- holders
- holders_summary
copyrights:
- (c) COPYRIGHT 1994-2002 by the Xiph.Org Foundation http://www.xiph.org/
- (c) COPYRIGHT 1994-2002 by the Xiph.Org Foundation http://www.xiph.org
holders:
- the Xiph.Org Foundation
holders_summary:
Expand Down
2 changes: 1 addition & 1 deletion tests/cluecode/data/ics/speex-libspeex/smallft.c.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ what:
- holders
- holders_summary
copyrights:
- (c) COPYRIGHT 1994-2001 by the XIPHOPHORUS Company http://www.xiph.org/
- (c) COPYRIGHT 1994-2001 by the XIPHOPHORUS Company http://www.xiph.org
holders:
- the XIPHOPHORUS Company
holders_summary:
Expand Down
10 changes: 5 additions & 5 deletions tests/formattedcode/data/csv/livescan/expected.csv
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
Resource,type,name,base_name,extension,size,date,sha1,md5,mime_type,file_type,programming_language,is_binary,is_text,is_archive,is_media,is_source,is_script,files_count,dirs_count,size_count,scan_errors,license__key,license__score,license__short_name,license__category,license__owner,license__homepage_url,license__text_url,license__reference_url,license__spdx_license_key,license__spdx_url,start_line,end_line,matched_rule__identifier,matched_rule__license_choice,matched_rule__licenses,copyright,copyright_holder,email,url,package__type,package__name,package__version,package__primary_language,package__summary,package__description,package__size,package__release_date,package__authors,package__homepage_url,package__notes,package__download_urls,package__bug_tracking_url,package__vcs_repository,package__copyright_top_level
/json2csv.rb,file,json2csv.rb,json2csv,.rb,1014,2017-10-03,92a83e5f8566bee7c83cf798c1b8912d609f56e0,380b7a5f483db7ace853b8f9dca5bfec,text/x-python,"Python script, ASCII text executable",Ruby,False,True,False,False,True,True,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
/json2csv.rb,file,json2csv.rb,json2csv,.rb,1014,2018-02-20,92a83e5f8566bee7c83cf798c1b8912d609f56e0,380b7a5f483db7ace853b8f9dca5bfec,text/x-python,"Python script, ASCII text executable",Ruby,False,True,False,False,True,True,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,apache-2.0,89.53,Apache 2.0,Permissive,Apache Software Foundation,http://www.apache.org/licenses/,http://www.apache.org/licenses/LICENSE-2.0,https://enterprise.dejacode.com/urn/urn:dje:license:apache-2.0,Apache-2.0,https://spdx.org/licenses/Apache-2.0,5,14,apache-2.0_7.RULE,,apache-2.0,,,,,,,,,,,,,,,,,,,
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,Copyright (c) 2017 nexB Inc. and others.,,,,,,,,,,,,,,,,,,
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,,nexB Inc. and others.,,,,,,,,,,,,,,,,,
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,http://nexb.com/,,,,,,,,,,,,,,,
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,https://github.com/nexB/scancode-toolkit/,,,,,,,,,,,,,,,
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,10,,,,,,,http://apache.org/licenses/LICENSE-2.0,,,,,,,,,,,,,,,
/license,file,license,license,,679,2017-10-03,75c5490a718ddd45e40e0cc7ce0c756abc373123,b965a762efb9421cf1bf4405f336e278,text/plain,ASCII text,,False,True,False,False,False,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
/license,file,license,license,,679,2018-02-20,75c5490a718ddd45e40e0cc7ce0c756abc373123,b965a762efb9421cf1bf4405f336e278,text/plain,ASCII text,,False,True,False,False,False,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
/license,,,,,,,,,,,,,,,,,,,,,,gpl-2.0-plus,100.00,GPL 2.0 or later,Copyleft,Free Software Foundation (FSF),http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,https://enterprise.dejacode.com/urn/urn:dje:license:gpl-2.0-plus,GPL-2.0+,https://spdx.org/licenses/GPL-2.0,1,12,gpl-2.0-plus.LICENSE,,gpl-2.0-plus,,,,,,,,,,,,,,,,,,,
/package.json,file,package.json,package,.json,2200,2017-10-03,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
/package.json,file,package.json,package,.json,2200,2018-02-20,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,15.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit_27.RULE,,mit,,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,100.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit.LICENSE,,mit,,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost &lt tj@learnboost.com&gt,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost &lt,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost <tj@learnboost.com>,,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost,,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,12,,,,,,tj@learnboost.com,,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,16,,,,,,,https://github.com/visionmedia/node-cookie-signature.git,,,,,,,,,,,,,,,
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,27,,,,,,,https://github.com/visionmedia/node-cookie-signature/issues,,,,,,,,,,,,,,,
Expand Down

0 comments on commit 6ee9e06

Please sign in to comment.