Skip to content

Commit

Permalink
Fix missing copyrights of alpine packages
Browse files Browse the repository at this point in the history
fix #191
Signed-off-by: Mateusz Perc <m.perc@samsung.com>
  • Loading branch information
quepop committed Jun 10, 2021
1 parent d259a46 commit 2f59440
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 0 deletions.
105 changes: 105 additions & 0 deletions scanpipe/pipes/alpine.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,118 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

import json
import os
import posixpath
import re
import subprocess

from packagedcode import alpine
from packageurl import PackageURL

from scanpipe.pipes import scancode

APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"
APORTS_DIR_NAME = "aports"
APORTS_SUBDIRS = ["main", "non-free", "community", "testing", "unmaintained"]


def extract_source_urls_apkbuild(apkbuild_path):
    """
    Extract all the urls from the APKBUILD's source variable.

    The APKBUILD at `apkbuild_path` is sourced by /bin/bash so that shell
    parameter expansion is applied to `source`, and the expanded value is
    echoed back. Every http, https or ftp URL found in that output is returned
    as a list of strings, in order of appearance; entries that are not such
    URLs are ignored.

    Raise subprocess.CalledProcessError when the bash command exits with a
    non-zero status. The exit status is the one of the final `echo`, so an
    APKBUILD that cannot be sourced yields an empty list rather than an error.
    """
    # SECURITY NOTE (review by pombredanne, Jul 1, 2021, see
    # https://github.com/nexB/scancode.io/issues/191): sourcing an arbitrary
    # bash script this way is too much of a security concern, because the
    # APKBUILD is executed as code. Alternatives raised in the review:
    #   1. use a container, jail or some sort of chroot to try to minimize the
    #      risk a little;
    #   2. implement a simple bash/shell script parser and perform parameter
    #      expansion.
    # Option 2 was judged best and much less involved than having a container
    # dependency. Other variables of an APKBUILD may also be of interest, and
    # other shell-based manifests (PKGBUILD for Arch, ebuild for Gentoo, m4 for
    # Autotools and a few more) eventually need to be parsed for metadata, so a
    # reusable parser was announced as the intended replacement for this call.

    # The path is handed to bash as the positional parameter $1 instead of
    # being pasted into the command string: whitespace or shell metacharacters
    # in the path are then never interpreted by the shell.
    command = [
        "/bin/bash",
        "-c",
        'source "$1" ; echo $source',
        "bash",  # becomes $0 of the inline script
        str(apkbuild_path),
    ]
    extraction_result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    expanded_source = extraction_result.stdout.decode("utf-8")
    return re.findall(r"(?:http|https|ftp):\/\/[^\s\"]*", expanded_source)


def extract_copyrights_json(scan_result_path):
    """
    Having scancode result file extract all the copyrights into a list
    (deduplicated).

    The file at `scan_result_path` is read as JSON. The "value" of every entry
    under each file's "copyrights" list is collected, duplicates are dropped
    and the remaining values are returned in the order they were first seen, so
    the result is stable from one run to the next.

    Return None when `scan_result_path` does not exist.
    Raise KeyError when the JSON lacks the "files", "copyrights" or "value"
    keys read here.
    """
    # Open directly and handle the missing file, rather than checking for
    # existence first: this avoids a window between the check and the open.
    try:
        with open(scan_result_path, encoding="utf-8") as scan_result:
            scan_data = json.load(scan_result)
    except FileNotFoundError:
        return None

    # A dict keeps insertion order, which gives an order-preserving dedup;
    # a set would return the statements in an arbitrary order.
    unique_statements = dict.fromkeys(
        detection["value"]
        for file_entry in scan_data["files"]
        for detection in file_entry["copyrights"]
    )
    return list(unique_statements)


def download_aports_repo(alpine_version, aports_dir):
    """
    Clone the aports repository into `aports_dir` unless that path already
    exists.

    The branch checked out by the clone is "<major>.<minor>-stable", built from
    the first two dot-separated components of `alpine_version`.
    Raise subprocess.CalledProcessError when `git clone` fails.
    """
    version_parts = alpine_version.split(".")
    if os.path.exists(aports_dir):
        # Something is already there: nothing to download.
        return

    stable_branch = "{}.{}-stable".format(version_parts[0], version_parts[1])
    clone_command = ["git", "clone", "-b", stable_branch, APORTS_URL, aports_dir]
    subprocess.check_call(clone_command)


def complement_missing_copyrights(package, aports_dir, out_dir, tmp_dir):
    """
    Set `package.copyright` from a scancode copyright scan of the package
    sources.

    A package is processed only when it lists no source packages, or when its
    name equals the name of its first source package purl; any other package is
    treated as a subpackage and left untouched.

    When "<out_dir>/<name>_<version>.json" does not exist yet:
      - the aports checkout in `aports_dir` is moved to the commit id taken
        from the part of `package.vcs_url` after "id="; if that git checkout
        exits non-zero the function returns without changing the package,
      - the first aports subdir holding "<package name>/APKBUILD" is copied
        under `tmp_dir`, the URLs of its source variable are downloaded next to
        it, and the result is extracted and scanned for copyrights into that
        JSON file.

    Finally `package.copyright` is assigned whatever
    extract_copyrights_json() returns for that JSON path.
    """
    source_purls = package.source_packages
    if source_purls and package.name != PackageURL.from_string(source_purls[0]).name:
        # Subpackage: nothing to complement here.
        return

    package_id = "{}_{}".format(package.name, package.version)
    package_dir = posixpath.join(tmp_dir, package_id)
    aports_commit_id = package.vcs_url.split("id=")[1]
    scan_result_path = posixpath.join(out_dir, package_id + ".json")

    if not os.path.exists(scan_result_path):
        checkout_status = subprocess.call(
            ["git", "-C", aports_dir, "checkout", aports_commit_id]
        )
        if checkout_status:
            return

        for repo_branch in APORTS_SUBDIRS:
            apkbuild_dir = posixpath.join(aports_dir, repo_branch, package.name)
            apkbuild_path = posixpath.join(apkbuild_dir, "APKBUILD")
            if os.path.exists(apkbuild_path):
                # Gather the aports files and the upstream sources side by side.
                subprocess.check_call(["cp", "-R", apkbuild_dir, package_dir])
                source_urls = extract_source_urls_apkbuild(apkbuild_path)
                for url in source_urls:
                    subprocess.check_call(["wget", "-P", package_dir, url])

                scancode.run_extractcode(
                    location=package_dir,
                    options=["--shallow"],
                    raise_on_error=True,
                )
                scancode.run_scancode(
                    location=package_dir,
                    output_file=scan_result_path,
                    options=["--copyright"],
                    raise_on_error=True,
                )
                # Only the first subdir providing the package is scanned.
                break

    package.copyright = extract_copyrights_json(scan_result_path)


def package_getter(root_dir, **kwargs):
    """
    Yield a (purl, package) pair for every installed package found under
    `root_dir`, after complementing its missing copyrights.

    Requires the "project" and "version" keyword arguments: the project
    supplies `tmp_path` and `output_path`, the version selects the aports
    branch to download. The aports repository is downloaded before any package
    is yielded.
    """
    project = kwargs["project"]
    tmp_dir = project.tmp_path
    out_dir = project.output_path
    aports_dir = posixpath.join(tmp_dir, APORTS_DIR_NAME)
    alpine_version = kwargs["version"]

    download_aports_repo(alpine_version, aports_dir)

    for installed in alpine.get_installed_packages(root_dir):
        complement_missing_copyrights(installed, aports_dir, out_dir, tmp_dir)
        yield installed.purl, installed
2 changes: 2 additions & 0 deletions scanpipe/pipes/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
package_getter = partial(
rootfs.PACKAGE_GETTER_BY_DISTRO[distro_id],
distro=distro_id,
version=image.distro.version_id,
detect_licenses=detect_licenses,
project=project,
)

installed_packages = image.get_installed_packages(package_getter)
Expand Down

0 comments on commit 2f59440

Please sign in to comment.