Skip to content

Commit

Permalink
ADD: a test script to analyze the journal database for performance
Browse files Browse the repository at this point in the history
  • Loading branch information
tcpan committed Oct 17, 2024
1 parent 74e4a89 commit 053e700
Showing 1 changed file with 87 additions and 0 deletions.
87 changes: 87 additions & 0 deletions chorus_upload/db_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#%%

import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# %%
dbfile = "/home/tpan/src/chorus-extract-upload/journal.20240925_waveforms.db"

# %%
# open the sqlite database
conn = sqlite3.connect(dbfile)
c = conn.cursor()

# get the size, md5_duration, upload_duration, and verify_duration from the journal table, for rows where the upload_dtstr is later than 20240920000000
result = c.execute("SELECT size, upload_dtstr, md5_duration, upload_duration, verify_duration FROM journal").fetchall()

# close the connection
conn.close()

# %%
# create a pandas dataframe from the results
orig_df = pd.DataFrame(result, columns=["size", "upload_dtstr", "md5_duration", "upload_duration", "verify_duration"])

orig_df['size'] = orig_df['size'].astype('int') / (1024*1024)
orig_df['upload_throughput'] = orig_df['size'] / orig_df['upload_duration']

df = orig_df[orig_df['upload_dtstr'] > '20240921000000']



# %%
# create a size histogram
df.hist(column = 'size', bins=20)
hist, bin_edges = np.histogram(np.log2(df['size']), bins=20) # log scale to partition the bins
bin_edges = np.round(np.exp2(bin_edges), 6) # convert edges back to linear scale
print(bin_edges)
# place histogram of sizes as a column
df['size (MiB)'] = pd.cut(df['size'], bins=bin_edges, labels=bin_edges[1:])


# %%
def plot_violin(df, x_column, y_column, bins):
# stratify the data into bins.


# partitioned = []
# for i in range(len(bin_edges) - 1):
# partitioned.append(df[(df[x_column] >= bin_edges[i]) & (df[x_column] < bin_edges[i+1])][y_column].values)

fig, axes = plt.subplots()
my_plot = sns.boxplot(data=df, x =x_column, y= y_column, ax = axes)

# axes.violinplot(dataset = partitioned)
axes.set_title(f'{y_column} vs {x_column}')
axes.yaxis.grid(True)
axes.set_xlabel(x_column)
axes.set_ylabel(y_column)
my_plot.set_xticklabels(bins[1:], rotation=90)
# axes.set_xticks(bin_edges)
plt.show()


#%%
# plot the md5_duration as a min-max boxplot, binned by size
plot_violin(df, 'size (MiB)', 'md5_duration', bin_edges)


#%%
# plot the upload_duration as a min-max boxplot, binned by size
plot_violin(df, 'size (MiB)', 'upload_duration', bin_edges)


#%%
# plot the verify_duration as a min-max boxplot, binned by size
plot_violin(df, 'size (MiB)', 'verify_duration', bin_edges)

# %%

# plot the upload_throughput as a min-max boxplot, binned by size
plot_violin(df, 'size (MiB)', 'upload_throughput', bin_edges)


# %%

0 comments on commit 053e700

Please sign in to comment.