Add test data generation tool. (#217)
Added a tool to populate AIPscan with randomly generated example data.
mcantelon committed Oct 5, 2023
1 parent 40b01a3 commit ea78e61
Showing 9 changed files with 303 additions and 0 deletions.
23 changes: 23 additions & 0 deletions README.md
@@ -155,6 +155,29 @@ Shut down the AIPscan Docker containers and remove the rabbitmq volumes:
docker-compose down --volumes
```

## Tools

The `tools` directory contains scripts that can be run by developers and system
administrators.

### Test data generator

The test data generator, `tools/generate-test-data.py`, populates AIPscan's
database with randomly generated example data.

### Running tools

These should be run using the same system user and virtual environment that
AIPscan is running under.

Here's how you would run the `generate-test-data.py` tool, for example:

$ cd <path to AIPscan base directory>
$ sudo -u <system user> /bin/bash
$ source <path to virtual environment>/bin/activate
$ python3 tools/generate-test-data.py
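
With the script's defaults (two storage services with two locations each), a
run prints progress along these lines; the number of AIPs and files created
per location is randomized:

    Creating pipeline and 2 storage services...
    Creating 4 storage service locations (and their AIPs)...
    Creating AIPs (1/4)...
    Creating AIPs (2/4)...
    Creating AIPs (3/4)...
    Creating AIPs (4/4)...
    Done.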


# Usage

* Ensure that the Flask Server, RabbitMQ server, and Celery worker queue are up and running.
1 change: 1 addition & 0 deletions requirements/test.txt
@@ -1,5 +1,6 @@
-r base.txt

faker==14.2.1
flake8==5.0.4
pytest==5.4.3
pytest_cov==2.11.1
Empty file added tools/__init__.py
Empty file added tools/app/__init__.py
5 changes: 5 additions & 0 deletions tools/app/tool.py
@@ -0,0 +1,5 @@
import sys

# Make the AIPscan application package importable when the tools scripts are
# run from the repository's base directory.
sys.path.append("../AIPscan")

config_name = "default"
70 changes: 70 additions & 0 deletions tools/generate-test-data.py
@@ -0,0 +1,70 @@
from app import tool
from faker import Faker
from flask import Flask
from helpers import data

from AIPscan import db
from AIPscan.models import FetchJob
from config import CONFIGS

app = Flask(__name__)
app.config.from_object(CONFIGS[tool.config_name])

db.init_app(app)

fake = Faker()
randint = fake.random.randint

with app.app_context():
    # Add example storage services
    ss_to_create = 2

    print(f"Creating pipeline and {ss_to_create} storage services...")
    pipeline = data.create_pipeline()

    ss_ids = []
    fetch_jobs = {}

    for _ in range(ss_to_create):
        # Make the first storage service the default one
        is_default = len(ss_ids) == 0

        ss = data.create_storage_service(is_default)
        ss_ids.append(ss.id)

        fetch_job = data.create_fetch_job(ss.id)
        fetch_jobs[ss.id] = fetch_job.id

    # Populate storage service locations
    storage_locations_per_ss = 2
    ss_locations_to_create = ss_to_create * storage_locations_per_ss

    print(
        f"Creating {ss_locations_to_create} storage service locations (and their AIPs)..."
    )

    aip_batches_created = 0
    total_aip_batches = len(ss_ids) * storage_locations_per_ss
    for ss_id in ss_ids:
        for _ in range(storage_locations_per_ss):
            # Add location
            sl = data.create_location(ss_id)

            # Add AIPs
            aip_batches_created += 1

            print(f"Creating AIPs ({aip_batches_created}/{total_aip_batches})...")

            aip_count = 0
            for _ in range(randint(100, 300)):
                aip = data.create_aip(pipeline.id, ss_id, sl.id, fetch_jobs[ss_id])
                data.create_aip_files(100, 300, aip.id)
                aip_count += 1

            # Update package/AIP counts in this storage service's fetch job
            fetch_job = FetchJob.query.get(fetch_jobs[ss_id])
            fetch_job.total_packages += aip_count
            fetch_job.total_aips += aip_count
            db.session.commit()

print("Done.")
Empty file added tools/helpers/__init__.py
116 changes: 116 additions & 0 deletions tools/helpers/data.py
@@ -0,0 +1,116 @@
from datetime import date

from faker import Faker

from AIPscan import db
from AIPscan.models import (
    AIP,
    FetchJob,
    File,
    Pipeline,
    StorageLocation,
    StorageService,
)

fake = Faker()
randint = fake.random.randint


def create_pipeline():
    pipeline = Pipeline(origin_pipeline=fake.uuid4(), dashboard_url=fake.url())

    db.session.add(pipeline)
    db.session.commit()

    return pipeline


def create_storage_service(default):
    ss = StorageService(
        name=fake.text(20)[:-1],  # short random text, minus the trailing period
        url=fake.url(),
        user_name=fake.profile()["username"],
        api_key=fake.password(),
        download_limit=0,
        download_offset=0,
        default=default,
    )

    db.session.add(ss)
    db.session.commit()

    return ss


def create_fetch_job(storage_service_id):
    fetch_job = FetchJob(
        total_packages=0,
        total_aips=0,
        total_deleted_aips=0,
        download_start=date.today(),
        download_end=date.today(),
        download_directory=fake.file_path(),
        storage_service_id=storage_service_id,
    )
    fetch_job.total_dips = 0
    fetch_job.total_sips = 0
    fetch_job.total_replicas = 0

    db.session.add(fetch_job)
    db.session.commit()

    return fetch_job


def create_location(storage_service_id):
    location = StorageLocation(
        current_location=fake.file_path(),
        description=fake.text(20)[:-1],
        storage_service_id=storage_service_id,
    )

    db.session.add(location)
    db.session.commit()

    return location


def create_aip(pipeline_id, storage_service_id, storage_location_id, fetch_job_id):
    aip = AIP(
        uuid=fake.uuid4(),
        transfer_name=fake.text(20)[:-1],
        create_date=date.today(),
        mets_sha256=fake.sha256(),
        size=randint(10000, 100_000_000),
        storage_service_id=storage_service_id,
        storage_location_id=storage_location_id,
        fetch_job_id=fetch_job_id,
        origin_pipeline_id=pipeline_id,
    )

    db.session.add(aip)
    db.session.commit()

    return aip


def create_aip_files(min_files, max_files, aip_id):
    # Create a random number of fake original files, between min_files and
    # max_files, for the given AIP.
    for _ in range(randint(min_files, max_files)):
        aipfile = File(
            aip_id=aip_id,
            name=fake.text(20)[:-1],
            filepath=fake.file_path(),
            uuid=fake.uuid4(),
            file_type="original",
            size=randint(1000, 1_000_000),
            date_created=date.today(),
            puid=fake.text(20)[:-1],
            file_format=fake.text(20)[:-1],
            format_version=fake.text(20)[:-1],
            checksum_type=fake.text(20)[:-1],
            checksum_value=fake.text(20)[:-1],
            premis_object="",
        )

        db.session.add(aipfile)
        db.session.commit()
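
The helpers also compose on their own: each `create_*` function commits a row
and returns the model instance, so returned IDs feed the related records. A
minimal sketch (not part of this commit), assuming the same Flask/config
bootstrapping that `generate-test-data.py` uses and that it is run from the
repository root:

```python
from flask import Flask

from AIPscan import db
from config import CONFIGS
from tools.helpers import data

app = Flask(__name__)
app.config.from_object(CONFIGS["default"])
db.init_app(app)

with app.app_context():
    pipeline = data.create_pipeline()
    ss = data.create_storage_service(True)  # first service is the default
    fetch_job = data.create_fetch_job(ss.id)
    location = data.create_location(ss.id)
    aip = data.create_aip(pipeline.id, ss.id, location.id, fetch_job.id)
    data.create_aip_files(1, 10, aip.id)  # between 1 and 10 fake files
```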
88 changes: 88 additions & 0 deletions tools/tests/test_data.py
@@ -0,0 +1,88 @@
import datetime

import pytest

from tools.helpers import data


@pytest.fixture
def mock_db_add(mocker):
    mocker.patch("AIPscan.db.session.add")
    mocker.patch("AIPscan.db.session.commit")


def test_create_storage_service(mock_db_add):
    ss = data.create_storage_service(True)

    assert ss.name
    assert type(ss.name) == str

    assert ss.url
    assert type(ss.url) == str

    assert ss.user_name
    assert type(ss.user_name) == str

    assert ss.api_key
    assert type(ss.api_key) == str

    assert ss.default
    assert type(ss.default) == bool

    ss = data.create_storage_service(False)
    assert not ss.default


def test_create_fetch_job(mock_db_add):
    ss = data.create_storage_service(True)
    ss.id = 1

    fetch_job = data.create_fetch_job(ss.id)

    assert fetch_job.download_start
    assert type(fetch_job.download_start) == datetime.date

    assert fetch_job.download_end
    assert type(fetch_job.download_end) == datetime.date

    assert fetch_job.download_directory
    assert type(fetch_job.download_directory) == str

    assert fetch_job.storage_service_id == ss.id


def test_create_location(mock_db_add):
    location = data.create_location(1)

    assert location.current_location
    assert type(location.current_location) == str

    assert location.description
    assert type(location.description) == str

    assert location.storage_service_id == 1


def test_create_aip(mock_db_add):
    aip = data.create_aip(1, 2, 3, 4)

    assert aip.uuid
    assert type(aip.uuid) == str

    assert aip.transfer_name
    assert type(aip.transfer_name) == str

    assert aip.create_date
    assert type(aip.create_date) == datetime.date

    assert aip.mets_sha256
    assert type(aip.mets_sha256) == str

    assert aip.size
    assert type(aip.size) == int

    assert aip.origin_pipeline_id == 1
    assert aip.storage_service_id == 2
    assert aip.storage_location_id == 3
    assert aip.fetch_job_id == 4
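
These tests use the `mocker` fixture, which is provided by the pytest-mock
plugin, so that plugin needs to be installed alongside the pins shown above.
A typical invocation from the repository root:

    $ pip install -r requirements/test.txt
    $ python -m pytest tools/tests/test_data.py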
