Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add base pipeline for importers and migrate PyPa importer to aboutcode pipeline #1559

Merged
merged 9 commits into from
Aug 27, 2024
1 change: 0 additions & 1 deletion vulnerabilities/import_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import Importer
from vulnerabilities.importers import IMPORTERS_REGISTRY
from vulnerabilities.improver import Inference
from vulnerabilities.improvers.default import DefaultImporter
from vulnerabilities.models import Advisory
Expand Down
4 changes: 2 additions & 2 deletions vulnerabilities/importers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from vulnerabilities.importers import oss_fuzz
from vulnerabilities.importers import postgresql
from vulnerabilities.importers import project_kb_msr2019
from vulnerabilities.importers import pypa
from vulnerabilities.importers import pysec
from vulnerabilities.importers import redhat
from vulnerabilities.importers import retiredotnet
Expand All @@ -40,13 +39,13 @@
from vulnerabilities.importers import ubuntu_usn
from vulnerabilities.importers import vulnrichment
from vulnerabilities.importers import xen
from vulnerabilities.pipelines import pypa_importer

IMPORTERS_REGISTRY = [
nvd.NVDImporter,
github.GitHubAPIImporter,
gitlab.GitLabAPIImporter,
npm.NpmImporter,
pypa.PyPaImporter,
nginx.NginxImporter,
pysec.PyPIImporter,
alpine_linux.AlpineImporter,
Expand Down Expand Up @@ -75,6 +74,7 @@
github_osv.GithubOSVImporter,
epss.EPSSImporter,
vulnrichment.VulnrichImporter,
pypa_importer.PyPaImporterPipeline,
]

IMPORTERS_REGISTRY = {x.qualified_name: x for x in IMPORTERS_REGISTRY}
66 changes: 0 additions & 66 deletions vulnerabilities/importers/pypa.py

This file was deleted.

8 changes: 8 additions & 0 deletions vulnerabilities/management/commands/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from vulnerabilities.import_runner import ImportRunner
from vulnerabilities.importers import IMPORTERS_REGISTRY
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline


class Command(BaseCommand):
Expand Down Expand Up @@ -57,6 +58,13 @@ def import_data(self, importers):

for importer in importers:
self.stdout.write(f"Importing data using {importer.qualified_name}")
if issubclass(importer, VulnerableCodeBaseImporterPipeline):
status, error = importer().execute()
if status != 0:
self.stdout.write(error)
failed_importers.append(importer.qualified_name)
continue

try:
ImportRunner(importer).run()
self.stdout.write(
Expand Down
98 changes: 97 additions & 1 deletion vulnerabilities/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,24 @@
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/vulnerablecode for support or download.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
from datetime import datetime
from datetime import timezone
from traceback import format_exc as traceback_format_exc
from typing import Iterable

from aboutcode.pipeline import BasePipeline
from aboutcode.pipeline import LoopProgress

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.improver import MAX_CONFIDENCE
from vulnerabilities.models import Advisory
from vulnerabilities.pipes.advisory import import_advisory
from vulnerabilities.pipes.advisory import insert_advisory
from vulnerabilities.utils import classproperty

module_logger = logging.getLogger(__name__)
Expand All @@ -32,3 +41,90 @@ def qualified_name(cls):
Fully qualified name prefixed with the module name of the pipeline used in logging.
"""
return f"{cls.__module__}.{cls.__qualname__}"


class VulnerableCodeBaseImporterPipeline(VulnerableCodePipeline):
    """
    Base importer pipeline for importing advisories.

    Usage:
        Subclass this pipeline and implement the ``advisories_count`` and
        ``collect_advisories`` methods. Override ``steps`` and
        ``advisory_confidence`` as needed.

    Note for subclasses: ``advisories_count`` is called as a method by
    ``collect_and_store_advisories``; do not assign an instance attribute with
    that name or the method becomes uncallable on that instance.
    """

    # Metadata describing the data source; subclasses are expected to override.
    license_url = None
    spdx_license_expression = None
    repo_url = None
    importer_name = None
    # Confidence passed to ``import_advisory`` for every advisory of this pipeline.
    advisory_confidence = MAX_CONFIDENCE

    @classmethod
    def steps(cls):
        """Return the ordered tuple of pipeline steps to run."""
        return (
            # Add step for downloading/cloning resource as required.
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            # Add step for removing downloaded/cloned resource as required.
        )

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield AdvisoryData for this importer pipeline.

        Must be implemented by subclasses. The number of advisories that were
        actually stored is tracked by ``collect_and_store_advisories``;
        implementations only need to yield the data.
        """
        raise NotImplementedError

    def advisories_count(self) -> int:
        """
        Return the estimated number of AdvisoryData to be yielded by ``collect_advisories``.

        Used by ``collect_and_store_advisories`` to log the progress of advisory collection.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def collect_and_store_advisories(self):
        """
        Store every advisory yielded by ``collect_advisories`` using ``insert_advisory``
        and log how many of them produced a truthy result.
        """
        collected_advisory_count = 0
        progress = LoopProgress(total_iterations=self.advisories_count(), logger=self.log)
        for advisory in progress.iter(self.collect_advisories()):
            # Only count the advisories for which ``insert_advisory`` returned
            # a truthy value.
            if insert_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                logger=self.log,
            ):
                collected_advisory_count += 1

        self.log(f"Successfully collected {collected_advisory_count:,d} advisories")

    def import_new_advisories(self):
        """
        Import the advisories created by this pipeline that have no
        ``date_imported`` yet, and log how many were imported.
        """
        new_advisories = Advisory.objects.filter(
            created_by=self.qualified_name,
            date_imported__isnull=True,
        )

        new_advisories_count = new_advisories.count()

        self.log(f"Importing {new_advisories_count:,d} new advisories")

        imported_advisory_count = 0
        progress = LoopProgress(total_iterations=new_advisories_count, logger=self.log)
        for advisory in progress.iter(new_advisories.paginated()):
            self.import_advisory(advisory=advisory)
            # NOTE(review): presumably ``date_imported`` is set on the advisory
            # by the ``import_advisory`` pipe on success -- confirm in
            # vulnerabilities.pipes.advisory.
            if advisory.date_imported:
                imported_advisory_count += 1

        self.log(f"Successfully imported {imported_advisory_count:,d} new advisories")

    def import_advisory(self, advisory: Advisory) -> None:
        """
        Import a single ``advisory`` with this pipeline's ``advisory_confidence``.

        Any exception raised by the import is logged at ERROR level together
        with its traceback and is not re-raised, so one failing advisory does
        not abort the whole import step.
        """
        try:
            import_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                confidence=self.advisory_confidence,
                logger=self.log,
            )
        except Exception as e:
            self.log(
                f"Failed to import advisory: {advisory!r} with error {e!r}:\n{traceback_format_exc()}",
                level=logging.ERROR,
            )
70 changes: 70 additions & 0 deletions vulnerabilities/pipelines/pypa_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import logging
from pathlib import Path
from typing import Iterable

import saneyaml
from fetchcode.vcs import fetch_via_vcs

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importers.osv import parse_advisory_data
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline
from vulnerabilities.utils import get_advisory_url

module_logger = logging.getLogger(__name__)


class PyPaImporterPipeline(VulnerableCodeBaseImporterPipeline):
    """Collect advisories from PyPA GitHub repository."""

    spdx_license_expression = "CC-BY-4.0"
    license_url = "https://github.com/pypa/advisory-database/blob/main/LICENSE"
    repo_url = "git+https://github.com/pypa/advisory-database"
    importer_name = "Pypa Importer"

    @classmethod
    def steps(cls):
        """Return the ordered steps: clone, collect, import, then clean up."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Fetch ``repo_url`` and keep the VCS response on ``self.vcs_response``."""
        self.log(f"Cloning `{self.repo_url}`")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self):
        """Return the number of ``*.yaml`` files under the ``vulns`` directory of the clone."""
        vulns_directory = Path(self.vcs_response.dest_dir) / "vulns"
        return sum(1 for _ in vulns_directory.rglob("*.yaml"))

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield the result of ``parse_advisory_data`` for each ``*.yaml`` file
        found under the ``vulns`` directory of the cloned repository.
        """
        base_directory = Path(self.vcs_response.dest_dir)
        vulns_directory = base_directory / "vulns"

        # Do not assign to ``self.advisories_count`` here: that would replace
        # the ``advisories_count()`` method with an int on this instance.
        for advisory in vulns_directory.rglob("*.yaml"):
            advisory_url = get_advisory_url(
                file=advisory,
                base_path=base_directory,
                url="https://github.com/pypa/advisory-database/blob/main/",
            )
            advisory_dict = saneyaml.load(advisory.read_text())
            yield parse_advisory_data(
                raw_data=advisory_dict,
                supported_ecosystems=["pypi"],
                advisory_url=advisory_url,
            )

    def clean_downloads(self):
        """Delete the cloned repository, if one was fetched."""
        # ``vcs_response`` is only set by ``clone``; tolerate it being absent
        # so that cleanup never fails with AttributeError.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            self.log("Removing cloned repository")
            vcs_response.delete()
Loading
Loading