diff --git a/scanpipe/pipelines/docker.py b/scanpipe/pipelines/docker.py index 580308b06..c919e0013 100644 --- a/scanpipe/pipelines/docker.py +++ b/scanpipe/pipelines/docker.py @@ -65,7 +65,7 @@ def extract_layers(self): def find_images_os_and_distro(self): """ - Find the operating system and distro of the images. + Finds the operating system and distro of input images. """ for image in self.images: image.get_and_set_distro() diff --git a/scanpipe/pipelines/windows_docker.py b/scanpipe/pipelines/windows_docker.py index c4142346b..21590a2b5 100644 --- a/scanpipe/pipelines/windows_docker.py +++ b/scanpipe/pipelines/windows_docker.py @@ -28,7 +28,7 @@ class WindowsDocker(Docker): """ - A pipeline to analyze a Windows Docker image. + A pipeline to analyze Windows Docker images. """ @classmethod @@ -53,14 +53,13 @@ def steps(cls): def tag_known_software_packages(self): """ - Flag files from well-known software packages by checking common install - paths + Flag files from well-known software packages by checking common install paths. """ windows.tag_known_software(self.project) def tag_uninteresting_codebase_resources(self): """ - Flag files that are known to be uninteresting + Flag files that are known to be uninteresting. """ docker.tag_whiteout_codebase_resources(self.project) windows.tag_uninteresting_windows_codebase_resources(self.project) @@ -70,7 +69,7 @@ def tag_uninteresting_codebase_resources(self): def tag_program_files_dirs_as_packages(self): """ Report the immediate subdirectories of `Program Files` and `Program - Files (x86)` as packages + Files (x86)` as packages. 
""" windows.tag_program_files(self.project) diff --git a/scanpipe/pipes/rootfs.py b/scanpipe/pipes/rootfs.py index 15dfab9bb..0b4a9d202 100644 --- a/scanpipe/pipes/rootfs.py +++ b/scanpipe/pipes/rootfs.py @@ -351,18 +351,19 @@ def tag_ignorable_codebase_resources(project): for pattern in default_ignores.keys(): # Translate glob pattern to regex translated_pattern = fnmatch.translate(pattern) - # postgresql does not like parts of Python regex + # PostgreSQL does not like parts of Python regex if translated_pattern.startswith("(?s"): translated_pattern = translated_pattern.replace("(?s", "(?") lookups |= Q(rootfs_path__icontains=pattern) lookups |= Q(rootfs_path__iregex=translated_pattern) + qs = project.codebaseresources.no_status() qs.filter(lookups).update(status="ignored-default-ignores") def tag_data_files_with_no_clues(project): """ - Tag CodebaseResources that have a file type of `data` and no detected clues + Tags CodebaseResources that have a file type of `data` and no detected clues to be uninteresting. """ lookup = Q( @@ -375,15 +376,18 @@ def tag_data_files_with_no_clues(project): emails=[], urls=[], ) - project.codebaseresources.filter(lookup).update(status="ignored-data-file-no-clues") + + qs = project.codebaseresources + qs.filter(lookup).update(status="ignored-data-file-no-clues") def tag_media_files_as_uninteresting(project): """ - Tag CodebaseResources that are media files to be uninteresting. + Tags CodebaseResources that are media files to be uninteresting. 
+ + `mimes` and `types` are taken from TypeCode: + https://github.com/nexB/typecode/blob/main/src/typecode/contenttype.py#L528 """ - # `mimes` and `types` were taken from TypeCode - # https://github.com/nexB/typecode/blob/c38f6831c59acae02a34a1288b9ce16e2e1f1733/src/typecode/contenttype.py#L528 mimes = ( "image", "picture", @@ -392,6 +396,7 @@ def tag_media_files_as_uninteresting(project): "graphic", "sound", ) + types = ( "image data", "graphics image", @@ -417,10 +422,12 @@ def tag_media_files_as_uninteresting(project): "image data", "netpbm", ) + lookup = Q() - for m in mimes: - lookup |= Q(mime_type__icontains=m) - for t in types: - lookup |= Q(file_type__icontains=t) + for mime_type in mimes: + lookup |= Q(mime_type__icontains=mime_type) + for file_type in types: + lookup |= Q(file_type__icontains=file_type) + qs = project.codebaseresources.no_status() qs.filter(lookup).update(status="ignored-media-file") diff --git a/scanpipe/pipes/windows.py b/scanpipe/pipes/windows.py index d2537461d..12ca0f705 100644 --- a/scanpipe/pipes/windows.py +++ b/scanpipe/pipes/windows.py @@ -98,23 +98,21 @@ def tag_installed_package_files(project, root_dir_pattern, package, q_objects=[] For all CodebaseResources from `project` whose `rootfs_path` starts with `root_dir_pattern`, add `package` to the discovered_packages of each CodebaseResource and set the status. - - If there are Q() objects in `q_objects`, then those Q() objects are chained - to the initial query (`lookup`) using AND to allow a more specific query for - package files. """ qs = project.codebaseresources.no_status() lookup = Q(rootfs_path__startswith=root_dir_pattern) + + # If there are Q() objects in `q_objects`, then those Q() objects are chained + # to the initial query `lookup` using AND to allow a more specific query for + # package files. 
for q_object in q_objects: lookup &= q_object + installed_package_files = qs.filter(lookup) # If we find files whose names start with `root_dir_pattern`, we consider - # these files to be part of the Package `package` and tag these files as - # such + # these files to be part of the Package `package` and tag these files as such. if installed_package_files: - created_package = pipes.update_or_create_package( - project=project, package_data=package.to_dict() - ) + created_package = pipes.update_or_create_package(project, package.to_dict()) for installed_package_file in installed_package_files: installed_package_file.discovered_packages.add(created_package) installed_package_file.status = "installed-package" @@ -122,33 +120,16 @@ def tag_installed_package_files(project, root_dir_pattern, package, q_objects=[] created_package.save() -def tag_known_software(project): - """ - Find Windows software in `project` by checking `project`s CodebaseResources - to see if their rootfs_path is is under a known software root directory. If - there are CodebaseResources that are under a known software root directory, - a DiscoveredPackage is created for that software package and all files under - that software package's root directory are considered installed files for - that package. - - Currently, we are only checking for Python and openjdk in Windows Docker - image layers. - - If a version number cannot be determined for an installed software Package, - then a version number of "nv" will be set. 
- """ +def _tag_python_software(project): qs = project.codebaseresources.no_status() - python_root_directory_name_pattern = r"(^/(Files/)?Python(\d+)?)/.*$" - python_root_directory_name_pattern_compiled = re.compile( - python_root_directory_name_pattern - ) + python_root_pattern = r"(^/(Files/)?Python(\d+)?)/.*$" + python_root_pattern_compiled = re.compile(python_root_pattern) + python_versions_by_path = {} - for python_codebase_resource in qs.filter( - rootfs_path__regex=python_root_directory_name_pattern - ): + for python_resource in qs.filter(rootfs_path__regex=python_root_pattern): _, python_root_dir, _, version, _ = re.split( - python_root_directory_name_pattern_compiled, - python_codebase_resource.rootfs_path, + python_root_pattern_compiled, + python_resource.rootfs_path, ) if python_root_dir in python_versions_by_path: continue @@ -177,19 +158,16 @@ def tag_known_software(project): q_objects=q_objects, ) + +def _tag_openjdk_software(project): qs = project.codebaseresources.no_status() - openjdk_root_directory_name_pattern = ( - r"^(/(Files/)?(open)?jdk(-((\d*)(\.\d+)*))*)/.*$" - ) - openjdk_root_directory_name_pattern_compiled = re.compile( - openjdk_root_directory_name_pattern - ) + openjdk_root_pattern = r"^(/(Files/)?(open)?jdk(-((\d*)(\.\d+)*))*)/.*$" + openjdk_root_pattern_compiled = re.compile(openjdk_root_pattern) + openjdk_versions_by_path = {} - for openjdk_codebase_resource in qs.filter( - rootfs_path__regex=openjdk_root_directory_name_pattern - ): + for openjdk_codebase_resource in qs.filter(rootfs_path__regex=openjdk_root_pattern): _, openjdk_root_path, _, _, _, openjdk_version, _, _, _ = re.split( - openjdk_root_directory_name_pattern_compiled, + openjdk_root_pattern_compiled, openjdk_codebase_resource.rootfs_path, ) if openjdk_root_path in openjdk_versions_by_path: @@ -207,10 +185,31 @@ def tag_known_software(project): homepage_url="http://openjdk.java.net/", ) tag_installed_package_files( - project=project, root_dir_pattern=openjdk_path, 
package=openjdk_package + project=project, + root_dir_pattern=openjdk_path, + package=openjdk_package, ) +def tag_known_software(project): + """ + Find Windows software in `project` by checking `project`s CodebaseResources + to see if their rootfs_path is under a known software root directory. If + there are CodebaseResources that are under a known software root directory, + a DiscoveredPackage is created for that software package and all files under + that software package's root directory are considered installed files for + that package. + + Currently, we are only checking for Python and openjdk in Windows Docker + image layers. + + If a version number cannot be determined for an installed software Package, + then a version number of "nv" will be set. + """ + _tag_python_software(project) + _tag_openjdk_software(project) + + PROGRAM_FILES_DIRS_TO_IGNORE = ( "Common Files", "Microsoft", @@ -219,8 +218,7 @@ def tag_known_software(project): def tag_program_files(project): """ - Report all subdirectories of Program Files and Program Files (x86) as - Packages + Report all subdirectories of Program Files and Program Files (x86) as Packages. If a Package is detected in this manner, then we will attempt to determine the version from the path. 
If a version cannot be determined, a version of @@ -228,14 +226,13 @@ def tag_program_files(project): """ qs = project.codebaseresources.no_status() # Get all files from Program Files and Program Files (x86) - program_files_one_directory_below_pattern = r"(^.*Program Files( \(x86\))?/([^/]+))" - program_files_one_directory_below_pattern_compiled = re.compile( - program_files_one_directory_below_pattern - ) + program_files_subdir_pattern = r"(^.*Program Files( \(x86\))?/([^/]+))" + program_files_subdir_pattern_compiled = re.compile(program_files_subdir_pattern) + program_files_dirname_by_path = {} for program_file in qs.filter(rootfs_path__regex=r"^.*/Program Files( \(x86\))?"): _, program_files_subdir, _, dirname, _ = re.split( - program_files_one_directory_below_pattern_compiled, program_file.rootfs_path + program_files_subdir_pattern_compiled, program_file.rootfs_path ) if ( program_files_subdir in program_files_dirname_by_path @@ -244,14 +241,10 @@ def tag_program_files(project): continue program_files_dirname_by_path[program_files_subdir] = dirname - for ( - program_root_dir, - program_root_dir_name, - ) in program_files_dirname_by_path.items(): - package = win_reg.InstalledWindowsProgram( - name=program_root_dir_name, - version="nv", - ) + for root_dir, root_dir_name in program_files_dirname_by_path.items(): + package = win_reg.InstalledWindowsProgram(name=root_dir_name, version="nv") tag_installed_package_files( - project=project, root_dir_pattern=program_root_dir, package=package + project=project, + root_dir_pattern=root_dir, + package=package, ) diff --git a/setup.py b/setup.py index 5671013fa..1d25b7344 100755 --- a/setup.py +++ b/setup.py @@ -57,11 +57,11 @@ ], "scancodeio_pipelines": [ "docker = scanpipe.pipelines.docker:Docker", - "windows_docker = scanpipe.pipelines.windows_docker:WindowsDocker", "load_inventory = scanpipe.pipelines.load_inventory:LoadInventory", "root_filesystems = scanpipe.pipelines.root_filesystems:RootFS", "scan_codebase = 
scanpipe.pipelines.scan_codebase:ScanCodebase", "scan_package = scanpipe.pipelines.scan_package:ScanPackage", + "windows_docker = scanpipe.pipelines.windows_docker:WindowsDocker", ], }, classifiers=[