Improved CSV generator and performance test (#109)
Signed-off-by: Justin Ohrenberger <justinohrenberger@gmail.com>
johrenberger authored Jan 8, 2024
1 parent d6e9ba1 commit 52d2e58
Showing 2 changed files with 376 additions and 0 deletions.
250 changes: 250 additions & 0 deletions tests/samples/csv_random_data_generate_at_scale.py
@@ -0,0 +1,250 @@
from concurrent.futures import ThreadPoolExecutor
import csv
import functools
import logging
import random
import time
import pytz
from datetime import datetime, timedelta
from faker import Faker

fake = Faker()

def log_execution_time(func):
"""Decorator to log the execution time of a function."""

@functools.wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
logging.info(f"{func.__name__} executed in {end_time - start_time:.2f} seconds")
return result

return wrapper

class FakeFocus:
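    """A fake FOCUS-style billing record; each field is generated lazily and cached."""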

def __init__(self):
self._cache = {}

# Current time in UTC
now = datetime.now(pytz.utc)

# 30 days ago from now
thirty_days_ago = now - timedelta(days=30)

@property
def InvoiceIssuer(self):
return self._cached_property('InvoiceIssuer', self.get_aws_invoice_issuer)

@property
def ResourceID(self):
return self._cached_property('ResourceID', fake.uuid4)

@property
def ChargeType(self):
return self._cached_property('ChargeType', self.get_charge_type)

@property
def Provider(self):
return self._cached_property('Provider', fake.company)

@property
def BillingAccountName(self):
return self._cached_property('BillingAccountName', fake.company)

    @property
    def SubAccountName(self):
        return self._cached_property('SubAccountName', self._random_recent_datetime)

@property
def BillingAccountId(self):
return self._cached_property('BillingAccountId', fake.uuid4)

@property
def Publisher(self):
return self._cached_property('Publisher', self.get_aws_publisher)

@property
def ResourceName(self):
return self._cached_property('ResourceName', self.get_aws_resource_name)

@property
def ServiceName(self):
return self._cached_property('ServiceName', self.get_aws_service_name)

@property
def BilledCurrency(self):
return self._cached_property('BilledCurrency', lambda: 'USD')

    @property
    def BillingPeriodEnd(self):
        return self._cached_property('BillingPeriodEnd', self._random_recent_datetime)

    @property
    def BillingPeriodStart(self):
        return self._cached_property('BillingPeriodStart', self._random_recent_datetime)

@property
def Region(self):
return self._cached_property('Region', self.get_aws_region)

@property
def ServiceCategory(self):
return self._cached_property('ServiceCategory', self.get_aws_service_category)

    @property
    def ChargePeriodStart(self):
        return self._cached_property('ChargePeriodStart', self._random_recent_datetime)

    @property
    def ChargePeriodEnd(self):
        return self._cached_property('ChargePeriodEnd', self._random_recent_datetime)

@property
def BilledCost(self):
return self._cached_property('BilledCost', lambda: fake.pyfloat(left_digits=3, right_digits=2, positive=True))

@property
def AmortizedCost(self):
return self._cached_property('AmortizedCost', lambda: fake.pyfloat(left_digits=3, right_digits=2, positive=True))

    def _cached_property(self, prop_name, generator_func):
        """Generate the value on first access, then serve it from the cache."""
        if prop_name not in self._cache:
            self._cache[prop_name] = generator_func()
        return self._cache[prop_name]

    def _random_recent_datetime(self):
        """Return an ISO-8601 timestamp drawn from the last 30 days."""
        random_datetime = fake.date_time_between(
            start_date=self.thirty_days_ago, end_date=self.now, tzinfo=pytz.utc
        )
        return random_datetime.strftime('%Y-%m-%dT%H:%M:%SZ')

def to_dict(self):
return {
'InvoiceIssuer': self.InvoiceIssuer,
'ResourceID': self.ResourceID,
'ChargeType': self.ChargeType,
'Provider': self.Provider,
'BillingAccountName': self.BillingAccountName,
'SubAccountName': self.SubAccountName,
'BillingAccountId': self.BillingAccountId,
'Publisher': self.Publisher,
'ResourceName': self.ResourceName,
'ServiceName': self.ServiceName,
'BilledCurrency': self.BilledCurrency,
'BillingPeriodEnd': self.BillingPeriodEnd,
'BillingPeriodStart': self.BillingPeriodStart,
'Region': self.Region,
'ServiceCategory': self.ServiceCategory,
'ChargePeriodStart': self.ChargePeriodStart,
'ChargePeriodEnd': self.ChargePeriodEnd,
'BilledCost': self.BilledCost,
'AmortizedCost': self.AmortizedCost
}


    def get_aws_invoice_issuer(self):
        aws_entities = [
            'AWS Inc.', 'Amazon Web Services', 'AWS Marketplace',
            'Amazon Data Services', 'AWS CloudFront', 'Amazon S3 Billing',
            'Amazon EC2 Billing', 'AWS Lambda Billing'
        ]
        return random.choice(aws_entities)

    def get_charge_type(self):
        charge_types = ['Adjustment', 'Purchase', 'Tax', 'Usage']
        return random.choice(charge_types)

def get_aws_publisher(self):
publisher_types = ['Software', 'Service', 'Platform']
publisher_suffix = random.choice(['Inc.', 'LLC', 'Ltd.', 'Group', 'Technologies', 'Solutions'])
return f"{fake.company()} {random.choice(publisher_types)} {publisher_suffix}"

def get_aws_resource_name(self):
resource_types = ['i-', 'vol-', 'snap-', 'ami-', 'bucket-', 'db-']
resource_prefix = random.choice(resource_types)
resource_id = fake.hexify(text='^^^^^^^^', upper=False)
return f'{resource_prefix}{resource_id}'

def get_aws_service_category(self):
aws_service_categories = [
'AI and Machine Learning', 'Analytics', 'Business Applications', 'Compute', 'Databases', 'Developer Tools', 'Multicloud',
'Identity', 'Integration', 'Internet of Things', 'Management and Governance', 'Media', 'Migration', 'Mobile', 'Networking',
'Security', 'Storage', 'Web', 'Other'
]
return random.choice(aws_service_categories)

def get_aws_service_name(self):
aws_services = [
'Amazon EC2', 'Amazon S3', 'AWS Lambda', 'Amazon RDS',
'Amazon DynamoDB', 'Amazon VPC', 'Amazon Route 53',
'Amazon CloudFront', 'AWS Elastic Beanstalk', 'Amazon SNS',
'Amazon SQS', 'Amazon Redshift', 'AWS CloudFormation',
'AWS IAM', 'Amazon EBS', 'Amazon ECS', 'Amazon EKS',
'Amazon ElastiCache', 'AWS Fargate', 'AWS Glue'
]
return random.choice(aws_services)

def get_aws_region(self):
aws_regions = [
'us-east-1', 'us-west-1', 'us-west-2', 'eu-west-1', 'eu-central-1',
'ap-southeast-1', 'ap-southeast-2', 'ap-northeast-1', 'ap-northeast-2',
'ap-south-1', 'sa-east-1', 'ca-central-1', 'eu-north-1', 'eu-west-2',
'eu-west-3', 'ap-east-1', 'me-south-1', 'af-south-1', 'eu-south-1'
]
return random.choice(aws_regions)

def generate_fake_focus():
return FakeFocus()

def write_focus_to_csv(focus, csv_writer):
csv_writer.writerow(focus.to_dict())

@log_execution_time
def generate_and_write_fake_focuses(csv_filename, num_records):
headers = FakeFocus().to_dict().keys()

with open(csv_filename, 'w', newline='') as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=headers)
csv_writer.writeheader()

with ThreadPoolExecutor() as executor:
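            # Construct the FakeFocus objects concurrently; rows are written
            # serially below, since csv writers are not safe to share across threads.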
futures = [executor.submit(generate_fake_focus) for _ in range(num_records)]
for future in futures:
focus = future.result()
write_focus_to_csv(focus, csv_writer)
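
For reference, here is a minimal sketch of driving the generator directly; the filename and record count are illustrative, not part of the commit:

import logging

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses

logging.basicConfig(level=logging.INFO)  # surface the @log_execution_time output
generate_and_write_fake_focuses('fake_focuses.csv', 1000)  # illustrative name and count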
126 changes: 126 additions & 0 deletions tests/test_progressive_performance.py
@@ -0,0 +1,126 @@
import logging
import os
import subprocess
import time
import unittest

from tests.samples.csv_random_data_generate_at_scale import generate_and_write_fake_focuses

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')


class TestProgressivePerformance(unittest.TestCase):
@classmethod
def setUpClass(cls):
        # CSV fixture filenames, one per record count
cls.csv_filename_1000 = 'fake_focuses1000.csv'
cls.csv_filename_10000 = 'fake_focuses10000.csv'
cls.csv_filename_50000 = 'fake_focuses50000.csv'
cls.csv_filename_100000 = 'fake_focuses100000.csv'
cls.csv_filename_250000 = 'fake_focuses250000.csv'
cls.csv_filename_500000 = 'fake_focuses500000.csv'

logging.info("Generating file with 1,000 records")
        cls.generate_test_file(cls.csv_filename_1000, 1000)

# logging.info("Generating file with 10,0000 records")
# cls.generate_test_file(str(cls.csv_filename_10000), 10000)

# logging.info("Generating file with 50,0000 records")
# cls.generate_test_file(str(cls.csv_filename_50000), 50000)

# logging.info("Generating file with 100,0000 records")
# cls.generate_test_file(str(cls.csv_filename_100000), 100000)

# logging.info("Generating file with 250,0000 records")
# cls.generate_test_file(str(cls.csv_filename_250000), 250000)

# logging.info("Generating file with 500,0000 records")
# cls.generate_test_file(str(cls.csv_filename_500000), 500000)

    @classmethod
    def tearDownClass(cls):
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

        # Remove every CSV fixture the tests may have produced.
        filenames = [
            'fake_focuses.csv',
            cls.csv_filename_1000,
            cls.csv_filename_10000,
            cls.csv_filename_50000,
            cls.csv_filename_100000,
            cls.csv_filename_250000,
            cls.csv_filename_500000,
        ]
        for filename in filenames:
            path = os.path.join(base_dir, filename)
            if os.path.exists(path):
                os.remove(path)

    @classmethod
    def generate_test_file(cls, csv_filename, number_of_records):
        # Generate fake focuses to a CSV file
        generate_and_write_fake_focuses(csv_filename, number_of_records)


def run_validator(self, args):
# Get the current directory of this test file
test_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the path to the application directory
app_dir = os.path.join(test_dir, '../focus_validator')
# Set the environment variable for logging level
env = os.environ.copy()
env["LOG_LEVEL"] = "INFO"

command = ['poetry', 'run', 'python', os.path.join(app_dir, 'main.py')] + args
return subprocess.run(command, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)

def test_1000_record_csv_performance(self):
        self.execute_performance(self.csv_filename_1000, 25.0)

# def test_10000_record_csv_performance(self):
# self.execute_performance(str(self.csv_filename_10000), 25.0)

# def test_50000_record_csv_performance(self):
# self.execute_performance(str(self.csv_filename_50000), 150.0)

# def test_100000_record_csv_performance(self):
# self.execute_performance(str(self.csv_filename_100000), 300.0)

# def test_250000_record_csv_performance(self):
# self.execute_performance(str(self.csv_filename_250000), 300.0)

# def test_500000_record_csv_performance(self):
# self.execute_performance(str(self.csv_filename_500000), 300.0)

def execute_performance(self, file_name, performance_threshold):
# Get the current directory of this test file
test_dir = os.path.dirname(os.path.abspath(__file__))

start_time = time.time()

# Command to execute the focus_validator tool
result = self.run_validator(['--data-file', os.path.join(test_dir, '../' + file_name)])
print(result.stdout)

end_time = time.time()
duration = end_time - start_time
logging.info(f"File: {file_name} Duration: {duration} seconds")

self.assertLess(duration, performance_threshold, f"Performance test exceeded threshold. Duration: {duration} seconds")
self.assertEqual(result.returncode, 0, "Focus Validator did not exit cleanly.")


if __name__ == '__main__':
unittest.main()
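
As a usage note, one way to run just this suite from the repository root is through unittest's loader; a minimal sketch, assuming the poetry environment expected by run_validator is already installed:

import unittest

suite = unittest.defaultTestLoader.loadTestsFromName('tests.test_progressive_performance')
unittest.TextTestRunner(verbosity=2).run(suite)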
