Restructure pipelines for verbosity (#1074)
* Restructure pipelines for verbosity

Remove scan_codebase_packages pipeline, and restructure inspect_packages
pipeline into load_sbom and resolve_packages pipelines.

Reference: #1035
Reference: #1034
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>

* Refactor functions and improve docstrings

Reference: #1074
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>

* Add unittests for new functions

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>

* Update docs and add CHANGELOG entry

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>

* Improve docstrings for pipelines

Suggested-by: Philippe Ombredanne <pombredanne@nexb.com>
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>

---------

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
AyanSinhaMahapatra authored Feb 14, 2024
1 parent 100b64e commit 7bb4499
Showing 18 changed files with 434 additions and 164 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.rst
@@ -17,6 +17,20 @@ v33.2.0 (unreleased)

https://github.com/nexB/scancode.io/issues/1071

- Rename the following pipeline for consistency and precision:
* scan_codebase_packages: inspect_packages

Restructure the inspect_manifest pipeline into:
* load_sbom: for loading SPDX/CycloneDX SBOMs and ABOUT files
* resolve_dependencies: for resolving package dependencies
* inspect_packages: for getting package data from package manifests/lockfiles

A data migration is included to migrate existing data to the new pipeline names.
Only the new names are available in the web UI, but the REST API and CLI remain
backward compatible with the old names.
https://github.com/nexB/scancode.io/issues/1034
https://github.com/nexB/scancode.io/discussions/1035

v33.1.0 (2024-02-02)
--------------------

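To illustrate the backward compatibility described in this changelog entry, here is a minimal sketch, not part of the commit itself, that creates a project through the REST API using the old pipeline name. The server URL and project name are assumptions for illustration::

    import requests

    data = {
        "name": "compat-demo",  # illustrative project name
        "input_urls": [
            "https://github.com/nexB/scancode.io/archive/refs/tags/v32.4.0.zip",
        ],
        "pipeline": "scan_codebase_packages",  # old name, mapped to inspect_packages
        "execute_now": True,
    }
    # Assumes a ScanCode.io instance running locally; the old pipeline name is
    # accepted and mapped to the new one instead of being rejected.
    response = requests.post("http://localhost/api/projects/", data=data)
    response.raise_for_status()
    print(response.json())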
2 changes: 1 addition & 1 deletion docs/automation.rst
@@ -27,7 +27,7 @@ automation methods such as a cron job or a git hook::
"https://github.com/nexB/scancode.io/archive/refs/tags/v32.4.0.zip",
]
PIPELINES = [
"scan_codebase_package",
"inspect_packages",
"find_vulnerabilities",
]
EXECUTE_NOW = True
24 changes: 16 additions & 8 deletions docs/built-in-pipelines.rst
@@ -72,6 +72,22 @@ Load Inventory
:members:
:member-order: bysource

.. _pipeline_load_sbom:

Load SBOM
---------
.. autoclass:: scanpipe.pipelines.load_sbom.LoadSBOM()
:members:
:member-order: bysource

.. _pipeline_resolve_dependencies:

Resolve Dependencies
--------------------
.. autoclass:: scanpipe.pipelines.resolve_dependencies.ResolveDependencies()
:members:
:member-order: bysource

.. _pipeline_map_deploy_to_develop:

Map Deploy To Develop
@@ -126,14 +142,6 @@ Scan Codebase
:members:
:member-order: bysource

.. _pipeline_scan_codebase_package:

Scan Codebase Package
---------------------
.. autoclass:: scanpipe.pipelines.scan_codebase_packages.ScanCodebasePackages()
:members:
:member-order: bysource

.. _pipeline_scan_single_package:

Scan Single Package
19 changes: 14 additions & 5 deletions docs/faq.rst
@@ -25,18 +25,27 @@ Here are some general guidelines based on different input scenarios:

- If you have a **Docker image** as input, use the
:ref:`analyze_docker_image <pipeline_analyze_docker_image>` pipeline.
- For a full **codebase compressed as an archive**, choose the
- For a full **codebase compressed as an archive**, optionally also with
its **pre-resolved dependencies**, where you want to detect all the packages
present and link them with their respective files, use the
:ref:`scan_codebase <pipeline_scan_codebase>` pipeline.
- If you have a **single package archive**, opt for the
- If you have a **single package archive**, and you want to get information
on licenses, copyrights and package metadata for it, opt for the
:ref:`scan_single_package <pipeline_scan_single_package>` pipeline.
- When dealing with a **Linux root filesystem** (rootfs), the
:ref:`analyze_root_filesystem_or_vm_image <pipeline_analyze_root_filesystem>` pipeline
is the appropriate choice.
- For processing the results of a **ScanCode-toolkit scan** or **ScanCode.io scan**,
use the :ref:`load_inventory <pipeline_load_inventory>` pipeline.
- When you have **manifest files**, such as a
**CycloneDX BOM, SPDX document, lockfile**, etc.,
use the :ref:`inspect_packages <pipeline_inspect_packages>` pipeline.
- When you want to import **SPDX/CycloneDX SBOMs or ABOUT files** into a project,
use the :ref:`load_sbom <pipeline_load_sbom>` pipeline.
- When you have **lockfiles or other package manifests** in a codebase and you want to
resolve packages from their package requirements, use the
:ref:`resolve_dependencies <pipeline_resolve_dependencies>` pipeline.
- When you have application **package archives/codebases**, optionally also with
their **pre-resolved dependencies**, and you want to **inspect packages**
present in the package manifests and dependency lockfiles, use the
:ref:`inspect_packages <pipeline_inspect_packages>` pipeline.
- For scenarios involving both a **development and deployment codebase**, consider using
the :ref:`map_deploy_to_develop <pipeline_map_deploy_to_develop>` pipeline.

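As a rough illustration of the guidance above, the following sketch maps input scenarios to pipeline choices. The choose_pipeline helper and the file-name heuristics are hypothetical and only mirror the FAQ text; they are not part of ScanCode.io::

    def choose_pipeline(input_path: str) -> str:
        """Return a plausible pipeline name for a given input, per the FAQ guidance."""
        if input_path.endswith((".spdx.json", ".cdx.json", ".ABOUT")):
            return "load_sbom"             # import SPDX/CycloneDX SBOMs or ABOUT files
        if input_path.endswith(("requirements.txt", "setup.py")):
            return "resolve_dependencies"  # resolve packages from package requirements
        if input_path.endswith((".zip", ".tar.gz", ".whl", ".jar")):
            return "inspect_packages"      # inspect manifests/lockfiles in an archive
        return "scan_codebase"             # default: full codebase scan

    print(choose_pipeline("python-inspector-0.9.2.tar.gz"))  # -> "inspect_packages"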
1 change: 1 addition & 0 deletions scanpipe/apps.py
@@ -178,6 +178,7 @@ def get_new_pipeline_name(pipeline_name):
"inspect_manifest": "inspect_packages",
"deploy_to_develop": "map_deploy_to_develop",
"scan_package": "scan_single_package",
"scan_codebase_packages": "inspect_packages",
}
if new_name := pipeline_old_names_mapping.get(pipeline_name):
warnings.warn(
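The added mapping entry above is consumed by get_new_pipeline_name, whose body is truncated in this hunk. A standalone sketch of the same compatibility pattern, with an assumed warning message, is::

    import warnings

    PIPELINE_OLD_NAMES_MAPPING = {
        "inspect_manifest": "inspect_packages",
        "deploy_to_develop": "map_deploy_to_develop",
        "scan_package": "scan_single_package",
        "scan_codebase_packages": "inspect_packages",
    }


    def get_new_pipeline_name(pipeline_name):
        """Return the new name for a renamed pipeline, warning when an old name is used."""
        if new_name := PIPELINE_OLD_NAMES_MAPPING.get(pipeline_name):
            warnings.warn(
                f"Pipeline {pipeline_name} has been renamed to {new_name}",  # assumed wording
                DeprecationWarning,
                stacklevel=2,
            )
            return new_name
        return pipeline_name


    print(get_new_pipeline_name("scan_codebase_packages"))  # -> "inspect_packages"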
33 changes: 33 additions & 0 deletions scanpipe/migrations/0053_restructure_pipelines_data.py
@@ -0,0 +1,33 @@
# Generated by Django 5.0.1 on 2024-02-09 15:05

from django.db import migrations


pipeline_old_names_mapping = {
"scan_codebase_packages": "inspect_packages",
}


def rename_pipelines_data(apps, schema_editor):
Run = apps.get_model("scanpipe", "Run")
for old_name, new_name in pipeline_old_names_mapping.items():
Run.objects.filter(pipeline_name=old_name).update(pipeline_name=new_name)


def reverse_rename_pipelines_data(apps, schema_editor):
Run = apps.get_model("scanpipe", "Run")
for old_name, new_name in pipeline_old_names_mapping.items():
Run.objects.filter(pipeline_name=new_name).update(pipeline_name=old_name)


class Migration(migrations.Migration):
dependencies = [
("scanpipe", "0052_run_selected_groups"),
]

operations = [
migrations.RunPython(
rename_pipelines_data,
reverse_code=reverse_rename_pipelines_data,
),
]
84 changes: 24 additions & 60 deletions scanpipe/pipelines/inspect_packages.py
@@ -21,32 +21,23 @@
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines.scan_codebase import ScanCodebase
from scanpipe.pipes import resolve
from scanpipe.pipes import update_or_create_package
from scanpipe.pipes import scancode


class InspectPackages(ScanCodebase):
"""
Inspect a codebase manifest files and resolve their associated packages.
Inspect a codebase for packages and pre-resolved dependencies.
Supports resolved packages for:
- Python: using nexB/python-inspector, supports requirements.txt and
setup.py manifests as input
This pipeline inspects a codebase for application packages
and their dependencies using package manifests and dependency
lockfiles. It does not resolve dependencies; instead, it collects
already pre-resolved dependencies from lockfiles, and
direct dependencies (possibly not resolved) as found in
package manifests' dependency sections.
Supports:
- BOM: SPDX document, CycloneDX BOM, AboutCode ABOUT file
- Python: requirements.txt, setup.py, setup.cfg, Pipfile.lock
- JavaScript: yarn.lock lockfile, npm package-lock.json lockfile
- Java: Java JAR MANIFEST.MF, Gradle build script
- Ruby: RubyGems gemspec manifest, RubyGems Bundler Gemfile.lock
- Rust: Rust Cargo.lock dependencies lockfile, Rust Cargo.toml package manifest
- PHP: PHP composer lockfile, PHP composer manifest
- NuGet: nuspec package manifest
- Dart: pubspec manifest, pubspec lockfile
- OS: FreeBSD compact package manifest, Debian installed packages database
Full list available at https://scancode-toolkit.readthedocs.io/en/
doc-update-licenses/reference/available_package_parsers.html
See the documentation for the list of supported package manifests and
dependency lockfiles:
https://scancode-toolkit.readthedocs.io/en/stable/reference/available_package_parsers.html
"""

@classmethod
@@ -55,46 +46,19 @@ def steps(cls):
cls.copy_inputs_to_codebase_directory,
cls.extract_archives,
cls.collect_and_create_codebase_resources,
cls.flag_empty_files,
cls.flag_ignored_resources,
cls.get_manifest_inputs,
cls.get_packages_from_manifest,
cls.create_resolved_packages,
cls.scan_for_application_packages,
)

def get_manifest_inputs(self):
"""Locate all the manifest files from the project's input/ directory."""
self.manifest_resources = resolve.get_manifest_resources(self.project)

def get_packages_from_manifest(self):
"""Get packages data from manifest files."""
self.resolved_packages = []

if not self.manifest_resources.exists():
self.project.add_warning(
description="No manifests found for resolving packages",
model="get_packages_from_manifest",
)
return

for resource in self.manifest_resources:
if packages := resolve.resolve_packages(resource.location):
self.resolved_packages.extend(packages)
else:
self.project.add_error(
description="No packages could be resolved for",
model="get_packages_from_manifest",
details={"path": resource.path},
)

def create_resolved_packages(self):
"""Create the resolved packages and their dependencies in the database."""
for package_data in self.resolved_packages:
package_data = resolve.set_license_expression(package_data)
dependencies = package_data.pop("dependencies", [])
update_or_create_package(self.project, package_data)

for dependency_data in dependencies:
resolved_package = dependency_data.get("resolved_package")
if resolved_package:
resolved_package.pop("dependencies", [])
update_or_create_package(self.project, resolved_package)
def scan_for_application_packages(self):
"""
Scan resources for package information to add DiscoveredPackage
and DiscoveredDependency objects from detected package data.
"""
# `assemble` is set to False because here in this pipeline we
# only detect package_data in resources and create
# Package/Dependency instances directly instead of assembling
# the packages and assigning files to them
scancode.scan_for_application_packages(self.project, assemble=False)
scancode.process_package_data(self.project)
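The comment above separates detecting package data on resources from assembling packages and assigning files to them. A self-contained toy sketch of that distinction follows; the Resource class and detect_package_data helper are hypothetical stand-ins, not ScanCode.io APIs::

    from dataclasses import dataclass, field


    @dataclass
    class Resource:
        path: str
        package_data: list = field(default_factory=list)


    def detect_package_data(resource):
        """Pretend detection: a lockfile yields one package_data mapping."""
        if resource.path.endswith("package-lock.json"):
            return [{"purl": "pkg:npm/lodash@4.17.21"}]
        return []


    def inspect(resources):
        packages = []
        for resource in resources:
            # Scan step: store detection results on each resource, then create
            # package records directly from them. No assembly step, so no files
            # are assigned to the packages.
            resource.package_data = detect_package_data(resource)
            packages.extend(resource.package_data)
        return packages


    print(inspect([Resource("package-lock.json"), Resource("README.md")]))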
scanpipe/pipelines/{scan_codebase_packages.py → load_sbom.py}
@@ -21,15 +21,18 @@
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines.scan_codebase import ScanCodebase
from scanpipe.pipes import scancode
from scanpipe.pipes import resolve


class ScanCodebasePackages(ScanCodebase):
class LoadSBOM(ScanCodebase):
"""
Scan a codebase for PURLs without assembling full packages/dependencies.
Load package data from one or more SBOMs.
This Pipeline is intended for gathering PURL information from a
codebase without the overhead of full package assembly.
Supported SBOMs:
- SPDX document
- CycloneDX BOM
Other formats:
- AboutCode .ABOUT files for package curations.
"""

@classmethod
Expand All @@ -40,12 +43,27 @@ def steps(cls):
cls.collect_and_create_codebase_resources,
cls.flag_empty_files,
cls.flag_ignored_resources,
cls.scan_for_application_packages,
cls.get_sbom_inputs,
cls.get_packages_from_sboms,
cls.create_packages_from_sboms,
)

def scan_for_application_packages(self):
"""Scan unknown resources for packages information."""
# `assemble` is set to False because here in this pipeline we
# only detect package_data in resources without creating
# Package/Dependency instances, to get all the purls from a codebase.
scancode.scan_for_application_packages(self.project, assemble=False)
def get_sbom_inputs(self):
"""Locate all the SBOMs among the codebase resources."""
self.manifest_resources = resolve.get_manifest_resources(self.project)

def get_packages_from_sboms(self):
"""Get packages data from SBOMs."""
self.packages = resolve.get_packages(
project=self.project,
package_registry=resolve.sbom_registry,
manifest_resources=self.manifest_resources,
model="get_packages_from_sboms",
)

def create_packages_from_sboms(self):
"""Create the packages and dependencies from the SBOM, in the database."""
resolve.create_packages_and_dependencies(
project=self.project,
packages=self.packages,
)
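A possible usage sketch for the pipeline above, not taken from the commit: write a minimal SPDX document and submit it to a ScanCode.io instance with the load_sbom pipeline. The server URL, the upload_file field name, and the reduced SPDX content are assumptions for illustration::

    import json

    import requests

    sbom = {
        "spdxVersion": "SPDX-2.3",
        "SPDXID": "SPDXRef-DOCUMENT",
        "name": "demo-sbom",
        "dataLicense": "CC0-1.0",
        "packages": [
            {
                "SPDXID": "SPDXRef-Package-lodash",
                "name": "lodash",
                "versionInfo": "4.17.21",
                "downloadLocation": "NOASSERTION",
            }
        ],
    }
    with open("demo.spdx.json", "w") as sbom_file:
        json.dump(sbom, sbom_file)

    with open("demo.spdx.json", "rb") as upload:
        response = requests.post(
            "http://localhost/api/projects/",  # assumed local instance
            data={"name": "load-sbom-demo", "pipeline": "load_sbom", "execute_now": True},
            files={"upload_file": upload},  # assumed upload field name
        )
    print(response.status_code)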
25 changes: 0 additions & 25 deletions scanpipe/pipelines/populate_purldb.py
@@ -22,7 +22,6 @@

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import purldb
from scanpipe.pipes import scancode


class PopulatePurlDB(Pipeline):
Expand All @@ -36,7 +35,6 @@ def steps(cls):
return (
cls.populate_purldb_with_discovered_packages,
cls.populate_purldb_with_discovered_dependencies,
cls.populate_purldb_with_detected_purls,
)

def populate_purldb_with_discovered_packages(self):
Expand All @@ -50,26 +48,3 @@ def populate_purldb_with_discovered_dependencies(self):
purldb.populate_purldb_with_discovered_dependencies(
project=self.project, logger=self.log
)

def populate_purldb_with_detected_purls(self):
"""Add DiscoveredPackage to PurlDB."""
no_packages_and_no_dependencies = all(
[
not self.project.discoveredpackages.exists(),
not self.project.discovereddependencies.exists(),
]
)
# Even when there are no packages/dependencies, resource level
# package data could be detected (i.e. when we detect packages,
# but skip the assembly step that creates
# package/dependency instances)
if no_packages_and_no_dependencies:
packages = scancode.get_packages_with_purl_from_resources(self.project)
purls = [{"purl": package.purl} for package in packages]

self.log(f"Populating PurlDB with {len(purls):,d} " "detected PURLs"),
purldb.feed_purldb(
packages=purls,
chunk_size=100,
logger=self.log,
)