Skip to content

Commit

Permalink
Merge pull request #600 from Hritik14/migration/importer_yielder
Browse files Browse the repository at this point in the history
Dump importer_yielder in favor of IMPORTER_REGISTRY and drop Etags
  • Loading branch information
pombredanne authored Feb 7, 2022
2 parents 14850ea + 45cdaf5 commit 4ebaa48
Show file tree
Hide file tree
Showing 79 changed files with 664 additions and 1,407 deletions.
6 changes: 0 additions & 6 deletions vulnerabilities/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

from vulnerabilities.models import (
PackageRelatedVulnerability,
Importer,
Package,
Vulnerability,
VulnerabilityReference,
Expand Down Expand Up @@ -55,11 +54,6 @@ class PackageRelatedVulnerabilityAdmin(admin.ModelAdmin):
search_fields = ["vulnerability__vulnerability_id", "package__name"]


@admin.register(Importer)
class ImporterAdmin(admin.ModelAdmin):
    """Expose the Importer model in the Django admin with default options."""

    pass


@admin.register(VulnerabilitySeverity)
class VulnerabilitySeverityAdmin(admin.ModelAdmin):
    """Expose the VulnerabilitySeverity model in the Django admin with default options."""

    pass
4 changes: 2 additions & 2 deletions vulnerabilities/fixtures/github.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"name": "github",
"license": "",
"last_run": "2021-03-06T09:09:01.523Z",
"data_source": "GitHubAPIDataSource",
"data_source": "GitHubAPIImporter",
"data_source_cfg": {
"endpoint": "https://api.github.com/graphql",
"ecosystems": [
Expand All @@ -115,4 +115,4 @@
"reference": 136
}
}
]
]
34 changes: 17 additions & 17 deletions vulnerabilities/fixtures/openssl.json
Original file line number Diff line number Diff line change
Expand Up @@ -40538,7 +40538,7 @@
"name": "rust",
"license": "cc0-1.0",
"last_run": "2020-09-14T12:47:17.451Z",
"data_source": "RustDataSource",
"data_source": "RustImporter",
"data_source_cfg": {
"branch": null,
"repository_url": "https://github.com/RustSec/advisory-db",
Expand All @@ -40555,7 +40555,7 @@
"name": "alpine",
"license": "",
"last_run": null,
"data_source": "AlpineDataSource",
"data_source": "AlpineImporter",
"data_source_cfg": {
"branch": null,
"repository_url": "https://gitlab.alpinelinux.org/alpine/infra/alpine-secdb"
Expand All @@ -40569,7 +40569,7 @@
"name": "archlinux",
"license": "mit",
"last_run": null,
"data_source": "ArchlinuxDataSource",
"data_source": "ArchlinuxImporter",
"data_source_cfg": {
"archlinux_tracker_url": "https://security.archlinux.org/json"
}
Expand All @@ -40582,7 +40582,7 @@
"name": "debian",
"license": "mit",
"last_run": null,
"data_source": "DebianDataSource",
"data_source": "DebianImporter",
"data_source_cfg": {
"debian_tracker_url": "https://security-tracker.debian.org/tracker/data/json"
}
Expand All @@ -40595,7 +40595,7 @@
"name": "safetydb",
"license": "cc-by-nc-4.0",
"last_run": null,
"data_source": "SafetyDbDataSource",
"data_source": "SafetyDbImporter",
"data_source_cfg": {
"url": "https://raw.githubusercontent.com/pyupio/safety-db/master/data/insecure_full.json",
"etags": {}
Expand All @@ -40609,7 +40609,7 @@
"name": "npm",
"license": "mit",
"last_run": null,
"data_source": "NpmDataSource",
"data_source": "NpmImporter",
"data_source_cfg": {
"repository_url": "https://github.com/nodejs/security-wg.git"
}
Expand All @@ -40622,7 +40622,7 @@
"name": "ruby",
"license": "",
"last_run": null,
"data_source": "RubyDataSource",
"data_source": "RubyImporter",
"data_source_cfg": {
"repository_url": "https://github.com/rubysec/ruby-advisory-db.git"
}
Expand All @@ -40635,7 +40635,7 @@
"name": "ubuntu",
"license": "gpl-2.0",
"last_run": null,
"data_source": "UbuntuDataSource",
"data_source": "UbuntuImporter",
"data_source_cfg": {
"etags": {},
"releases": [
Expand All @@ -40655,7 +40655,7 @@
"name": "retiredotnet",
"license": "mit",
"last_run": null,
"data_source": "RetireDotnetDataSource",
"data_source": "RetireDotnetImporter",
"data_source_cfg": {
"repository_url": "https://github.com/RetireNet/Packages.git"
}
Expand All @@ -40668,7 +40668,7 @@
"name": "suse_backports",
"license": "",
"last_run": null,
"data_source": "SUSEBackportsDataSource",
"data_source": "SUSEBackportsImporter",
"data_source_cfg": {
"url": "http://ftp.suse.com/pub/projects/security/yaml/",
"etags": {}
Expand All @@ -40682,7 +40682,7 @@
"name": "debian_oval",
"license": "",
"last_run": null,
"data_source": "DebianOvalDataSource",
"data_source": "DebianOvalImporter",
"data_source_cfg": {
"etags": {},
"releases": [
Expand All @@ -40701,7 +40701,7 @@
"name": "redhat",
"license": "cc-by-4.0",
"last_run": null,
"data_source": "RedhatDataSource",
"data_source": "RedhatImporter",
"data_source_cfg": {}
}
},
Expand All @@ -40712,7 +40712,7 @@
"name": "gentoo",
"license": "",
"last_run": null,
"data_source": "GentooDataSource",
"data_source": "GentooImporter",
"data_source_cfg": {
"repository_url": "https://anongit.gentoo.org/git/data/glsa.git"
}
Expand All @@ -40725,7 +40725,7 @@
"name": "openssl",
"license": "",
"last_run": "2020-09-14T12:52:58.762Z",
"data_source": "OpenSSLDataSource",
"data_source": "OpenSSLImporter",
"data_source_cfg": {
"etags": {
"https://www.openssl.org/news/vulnerabilities.xml": "\"513bd-5aee0a1c716f0-gzip\""
Expand All @@ -40740,7 +40740,7 @@
"name": "ubuntu_usn",
"license": "gpl-2.0",
"last_run": null,
"data_source": "UbuntuUSNDataSource",
"data_source": "UbuntuUSNImporter",
"data_source_cfg": {
"etags": {},
"db_url": "https://usn.ubuntu.com/usn-db/database-all.json.bz2"
Expand All @@ -40754,7 +40754,7 @@
"name": "github",
"license": "",
"last_run": null,
"data_source": "GitHubAPIDataSource",
"data_source": "GitHubAPIImporter",
"data_source_cfg": {
"endpoint": "https://api.github.com/graphql",
"ecosystems": [
Expand All @@ -40765,4 +40765,4 @@
}
}
}
]
]
38 changes: 12 additions & 26 deletions vulnerabilities/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from typing import List
from typing import Optional
from typing import Tuple
from unittest.mock import MagicMock

import requests
import saneyaml
Expand Down Expand Up @@ -67,32 +68,8 @@ def fetch_yaml(url):
return saneyaml.load(response.content)


# FIXME: this is NOT how etags work .
# We should instead send the proper HTTP header
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/If-None-Match
# and integrate this finely in the processing as this typically needs to use
# streaming=True requests, and proper handling of the HTTP return code
# In all cases this ends up being a single request, not a HEAD followed
# by another real request
def create_etag(data_src, url, etag_key):
"""
Etags are like hashes of web responses. For a data source `data_src`,
we maintain (url, etag) mappings in the DB. `create_etag` creates
(`url`, etag) pair. If a (`url`, etag) already exists then the code
skips processing the response further to avoid duplicate work.
`etag_key` is the name of header which contains the etag for the url.
"""
etag = requests.head(url).headers.get(etag_key)
if not etag:
return True

elif url in data_src.config.etags:
if data_src.config.etags[url] == etag:
return False

data_src.config.etags[url] = etag
return True
# FIXME: Remove this entirely after complete importer-improver migration
# Legacy no-op shim for the removed etag-caching helper: it always reports
# "content changed" (truthy) so any remaining legacy data source re-fetches
# unconditionally. A plain function is used instead of MagicMock so runtime
# code does not depend on unittest.mock and does not accumulate mock
# call-history in memory across a long import run.
def create_etag(*args, **kwargs):
    """Deprecated no-op; accepts any arguments and always returns True."""
    return True


def contains_alpha(string):
Expand Down Expand Up @@ -190,3 +167,12 @@ def split_markdown_front_matter(text: str) -> Tuple[str, str]:
return frontmatter, markdown

return "", text


# TODO: Replace this with combination of @classmethod and @property after upgrading to python 3.9
class classproperty:
    """Descriptor that exposes a method as a read-only property on the class.

    The wrapped getter is invoked with the owning class (not an instance),
    so ``SomeClass.attr`` and ``instance.attr`` both call ``fget(SomeClass)``.
    """

    def __init__(self, fget):
        # Keep a reference to the getter; it is called lazily on attribute access.
        self.fget = fget

    def __get__(self, instance, owner):
        # Ignore the instance entirely: always evaluate against the class.
        return self.fget(owner)
64 changes: 19 additions & 45 deletions vulnerabilities/import_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,79 +25,50 @@
import datetime
import json
import logging
from typing import Set
from typing import List
from typing import Iterable


from vulnerabilities import models
from vulnerabilities.models import Advisory
from vulnerabilities.data_source import AdvisoryData
from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import Importer

logger = logging.getLogger(__name__)


class ImportRunner:
"""
The ImportRunner is responsible for inserting and updating data about vulnerabilities and
affected/unaffected/fixed packages in the database. The two main goals for the implementation
are correctness and efficiency.
affected/unaffected/fixed packages in the database. The main goal for the implementation
is correctness
Correctness:
- There must be no duplicates in the database (should be enforced by the schema).
- No valid data from the data source must be skipped or truncated.
Efficiency:
- Bulk inserts should be used whenever possible.
- Checking whether a record already exists should be kept to a minimum
(the data source should know this instead).
- All update and select operations must use indexed columns.
"""

def __init__(self, importer: models.Importer):
def __init__(self, importer: Importer):
self.importer = importer

def run(self, cutoff_date: datetime.datetime = None) -> None:
def run(self) -> None:
"""
Create a data source for the given importer and store the data retrieved in the database.
cutoff_date - optional timestamp of the oldest data to include in the import
NB: Data sources provide two kinds of records; vulnerabilities and packages. Vulnerabilities
are potentially shared across many packages, from the same data source and from different
data sources. For example, a vulnerability in the Linux kernel is mentioned by advisories
from all Linux distributions that package this kernel version.
"""
logger.info(f"Starting import for {self.importer.name}.")
data_source = self.importer.make_data_source(cutoff_date=cutoff_date)
with data_source:
advisory_data = data_source.advisory_data()
importer_name = data_source.qualified_name()
process_advisories(advisory_datas=advisory_data, importer_name=importer_name)
self.importer.last_run = datetime.datetime.now(tz=datetime.timezone.utc)
self.importer.data_source_cfg = dataclasses.asdict(data_source.config)
self.importer.save()

logger.info(f"Finished import for {self.importer.name}.")

importer_name = self.importer.qualified_name
importer_class = self.importer
logger.info(f"Starting import for {importer_name}")
advisory_datas = importer_class().advisory_data()
count = process_advisories(advisory_datas=advisory_datas, importer_name=importer_name)
logger.info(f"Finished import for {importer_name}. Imported {count} advisories.")

def vuln_ref_exists(vulnerability, url, reference_id):
    """Return True if a VulnerabilityReference with the given ``url`` and
    ``reference_id`` already exists for ``vulnerability``."""
    return models.VulnerabilityReference.objects.filter(
        vulnerability=vulnerability, reference_id=reference_id, url=url
    ).exists()


def get_vuln_pkg_refs(vulnerability, package):
    """Return the queryset of PackageRelatedVulnerability rows linking
    ``vulnerability`` to ``package`` (empty when no link exists)."""
    return models.PackageRelatedVulnerability.objects.filter(
        vulnerability=vulnerability,
        package=package,
    )


def process_advisories(advisory_datas: Iterable[AdvisoryData], importer_name: str) -> None:
def process_advisories(advisory_datas: Iterable[AdvisoryData], importer_name: str) -> List:
"""
Insert advisories into the database
Return the number of inserted advisories.
"""

count = 0
for data in advisory_datas:
obj, created = Advisory.objects.get_or_create(
aliases=data.aliases,
Expand All @@ -114,5 +85,8 @@ def process_advisories(advisory_datas: Iterable[AdvisoryData], importer_name: st
logger.info(
f"[*] New Advisory with aliases: {obj.aliases!r}, created_by: {obj.created_by}"
)
count += 1
else:
logger.debug(f"Advisory with aliases: {obj.aliases!r} already exists. Skipped.")

return count
Loading

0 comments on commit 4ebaa48

Please sign in to comment.