Skip to content

Commit

Permalink
Merge pull request #1559 from aboutcode-org/1509-pypa-importer-pipeline
Browse files Browse the repository at this point in the history
Add base pipeline for importers and migrate PyPa importer to aboutcode pipeline
  • Loading branch information
keshav-space authored Aug 27, 2024
2 parents 1e3afdc + d73cfd4 commit 5b982c6
Show file tree
Hide file tree
Showing 13 changed files with 523 additions and 79 deletions.
1 change: 0 additions & 1 deletion vulnerabilities/import_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import Importer
from vulnerabilities.importers import IMPORTERS_REGISTRY
from vulnerabilities.improver import Inference
from vulnerabilities.improvers.default import DefaultImporter
from vulnerabilities.models import Advisory
Expand Down
4 changes: 2 additions & 2 deletions vulnerabilities/importers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from vulnerabilities.importers import oss_fuzz
from vulnerabilities.importers import postgresql
from vulnerabilities.importers import project_kb_msr2019
from vulnerabilities.importers import pypa
from vulnerabilities.importers import pysec
from vulnerabilities.importers import redhat
from vulnerabilities.importers import retiredotnet
Expand All @@ -40,13 +39,13 @@
from vulnerabilities.importers import ubuntu_usn
from vulnerabilities.importers import vulnrichment
from vulnerabilities.importers import xen
from vulnerabilities.pipelines import pypa_importer

IMPORTERS_REGISTRY = [
nvd.NVDImporter,
github.GitHubAPIImporter,
gitlab.GitLabAPIImporter,
npm.NpmImporter,
pypa.PyPaImporter,
nginx.NginxImporter,
pysec.PyPIImporter,
alpine_linux.AlpineImporter,
Expand Down Expand Up @@ -75,6 +74,7 @@
github_osv.GithubOSVImporter,
epss.EPSSImporter,
vulnrichment.VulnrichImporter,
pypa_importer.PyPaImporterPipeline,
]

# Re-bind the registry from the list of importer classes above to a dict
# keyed by each class's ``qualified_name``.
IMPORTERS_REGISTRY = {x.qualified_name: x for x in IMPORTERS_REGISTRY}
66 changes: 0 additions & 66 deletions vulnerabilities/importers/pypa.py

This file was deleted.

8 changes: 8 additions & 0 deletions vulnerabilities/management/commands/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from vulnerabilities.import_runner import ImportRunner
from vulnerabilities.importers import IMPORTERS_REGISTRY
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline


class Command(BaseCommand):
Expand Down Expand Up @@ -57,6 +58,13 @@ def import_data(self, importers):

for importer in importers:
self.stdout.write(f"Importing data using {importer.qualified_name}")
if issubclass(importer, VulnerableCodeBaseImporterPipeline):
status, error = importer().execute()
if status != 0:
self.stdout.write(error)
failed_importers.append(importer.qualified_name)
continue

try:
ImportRunner(importer).run()
self.stdout.write(
Expand Down
98 changes: 97 additions & 1 deletion vulnerabilities/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,24 @@
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/vulnerablecode for support or download.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
from datetime import datetime
from datetime import timezone
from traceback import format_exc as traceback_format_exc
from typing import Iterable

from aboutcode.pipeline import BasePipeline
from aboutcode.pipeline import LoopProgress

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.improver import MAX_CONFIDENCE
from vulnerabilities.models import Advisory
from vulnerabilities.pipes.advisory import import_advisory
from vulnerabilities.pipes.advisory import insert_advisory
from vulnerabilities.utils import classproperty

module_logger = logging.getLogger(__name__)
Expand All @@ -32,3 +41,90 @@ def qualified_name(cls):
Fully qualified name prefixed with the module name of the pipeline used in logging.
"""
return f"{cls.__module__}.{cls.__qualname__}"


class VulnerableCodeBaseImporterPipeline(VulnerableCodePipeline):
    """
    Base importer pipeline for importing advisories.

    Usage:
        Subclass this pipeline and implement the ``advisories_count`` and
        ``collect_advisories`` methods. Also override ``steps`` and
        ``advisory_confidence`` as needed.
    """

    license_url = None
    spdx_license_expression = None
    repo_url = None
    importer_name = None
    advisory_confidence = MAX_CONFIDENCE

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: collect advisories, then import them."""
        return (
            # Add step for downloading/cloning resource as required.
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            # Add step for removing downloaded/cloned resource as required.
        )

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield AdvisoryData for this importer pipeline.

        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def advisories_count(self) -> int:
        """
        Return the estimated number of AdvisoryData to be yielded by ``collect_advisories``.
        Used by ``collect_and_store_advisories`` to log the progress of advisory collection.

        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def collect_and_store_advisories(self):
        """
        Pass every advisory yielded by ``collect_advisories`` to ``insert_advisory``
        and log how many of those calls returned a truthy result.
        """
        collected_advisory_count = 0
        progress = LoopProgress(total_iterations=self.advisories_count(), logger=self.log)
        for advisory in progress.iter(self.collect_advisories()):
            # Only a truthy return value from ``insert_advisory`` is counted.
            if insert_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                logger=self.log,
            ):
                collected_advisory_count += 1

        self.log(f"Successfully collected {collected_advisory_count:,d} advisories")

    def import_new_advisories(self):
        """
        Import the advisories created by this pipeline that have no
        ``date_imported`` yet, and log how many ended up with it set.
        """
        new_advisories = Advisory.objects.filter(
            created_by=self.qualified_name,
            date_imported__isnull=True,
        )

        new_advisories_count = new_advisories.count()

        self.log(f"Importing {new_advisories_count:,d} new advisories")

        imported_advisory_count = 0
        progress = LoopProgress(total_iterations=new_advisories_count, logger=self.log)
        for advisory in progress.iter(new_advisories.paginated()):
            self.import_advisory(advisory=advisory)
            # NOTE(review): presumably the ``import_advisory`` pipe sets
            # ``date_imported`` on success - confirm against its implementation.
            if advisory.date_imported:
                imported_advisory_count += 1

        self.log(f"Successfully imported {imported_advisory_count:,d} new advisories")

    def import_advisory(self, advisory: Advisory) -> None:
        """
        Import a single ``advisory`` using this pipeline's ``advisory_confidence``.

        Any exception is logged at ERROR level with its traceback instead of being
        re-raised, so that a failing advisory does not interrupt the caller's loop.
        """
        try:
            import_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                confidence=self.advisory_confidence,
                logger=self.log,
            )
        except Exception as e:
            self.log(
                f"Failed to import advisory: {advisory!r} with error {e!r}:\n{traceback_format_exc()}",
                level=logging.ERROR,
            )
70 changes: 70 additions & 0 deletions vulnerabilities/pipelines/pypa_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import logging
from pathlib import Path
from typing import Iterable

import saneyaml
from fetchcode.vcs import fetch_via_vcs

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importers.osv import parse_advisory_data
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline
from vulnerabilities.utils import get_advisory_url

module_logger = logging.getLogger(__name__)


class PyPaImporterPipeline(VulnerableCodeBaseImporterPipeline):
    """
    Collect advisories from PyPA GitHub repository.

    The pipeline clones ``repo_url``, parses every ``*.yaml`` file found under
    the clone's ``vulns`` directory with the OSV ``parse_advisory_data`` helper,
    imports the new advisories and finally removes the clone.
    """

    spdx_license_expression = "CC-BY-4.0"
    license_url = "https://github.com/pypa/advisory-database/blob/main/LICENSE"
    repo_url = "git+https://github.com/pypa/advisory-database"
    importer_name = "Pypa Importer"

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, collect, import, clean up."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Fetch ``repo_url`` and keep the VCS response on ``self.vcs_response``."""
        self.log(f"Cloning `{self.repo_url}`")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self):
        """Return the number of ``*.yaml`` files under the clone's ``vulns`` directory."""
        vulns_directory = Path(self.vcs_response.dest_dir) / "vulns"
        return sum(1 for _ in vulns_directory.rglob("*.yaml"))

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield the parsed advisory for each ``*.yaml`` file under the clone's
        ``vulns`` directory, restricted to the ``pypi`` ecosystem.
        """
        base_directory = Path(self.vcs_response.dest_dir)
        vulns_directory = base_directory / "vulns"

        # Do not assign the file count to ``self.advisories_count`` here: that
        # would replace the ``advisories_count()`` method with an int on this
        # instance and make any later call to it raise ``TypeError``.
        for advisory in vulns_directory.rglob("*.yaml"):
            advisory_url = get_advisory_url(
                file=advisory,
                base_path=base_directory,
                url="https://github.com/pypa/advisory-database/blob/main/",
            )
            # Read explicitly as UTF-8 rather than depending on the locale default.
            advisory_dict = saneyaml.load(advisory.read_text(encoding="utf-8"))
            yield parse_advisory_data(
                raw_data=advisory_dict,
                supported_ecosystems=["pypi"],
                advisory_url=advisory_url,
            )

    def clean_downloads(self):
        """Delete the cloned repository, if ``clone`` produced one."""
        # ``vcs_response`` only exists once ``clone`` has run successfully.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            self.log("Removing cloned repository")
            vcs_response.delete()
Loading

0 comments on commit 5b982c6

Please sign in to comment.