Add sortable size CSV field (#149) (#261)
Package and file size values in CSV exports were formatted to be human readable,
which was good for legibility but impeded the ability to sort rows by size.
Now, whenever size data is included in an export, an additional column containing
the raw byte value of each package or file size is populated automatically.
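
For illustration, a minimal sketch of the header side of the change (not part of the commit): the import paths follow the test files in this diff, and the expected values come from the updated tests.

from AIPscan.Data import fields
from AIPscan.Reporter import helpers

# Requesting the extra column appends a raw-byte header directly after the
# human-readable "Size" header.
headers = helpers.translate_headers(
    [fields.FIELD_AIP_NAME, fields.FIELD_SIZE], add_bytes_column=True
)
print(headers)  # ["AIP Name", "Size", "Size (bytes)"]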
mcantelon authored Dec 6, 2023
1 parent 70701ad commit 89d6466
Showing 20 changed files with 121 additions and 49 deletions.
1 change: 1 addition & 0 deletions AIPscan/Data/fields.py
@@ -53,6 +53,7 @@
FIELD_RELATED_PAIRING = "RelatedPairing"

FIELD_SIZE = "Size"
FIELD_SIZE_BYTES = "SizeBytes"
FIELD_STORAGE_LOCATION = "StorageLocation"
FIELD_STORAGE_NAME = "StorageName"

41 changes: 36 additions & 5 deletions AIPscan/Reporter/helpers.py
@@ -28,7 +28,7 @@ def sort_puids(puids):
return natsorted(puids)


def translate_headers(headers):
def translate_headers(headers, add_bytes_column=False):
"""Translate headers from something machine readable to something
more user friendly and translatable.
"""
@@ -69,6 +69,22 @@ def translate_headers(headers):
fields.FIELD_USER: "User",
fields.FIELD_VERSION: "Version",
}

# Attempt to add an additional header representing a column containing size
# expressed as a number of bytes, rather than in human-readable form, so
# rows can more easily be sorted by size
if add_bytes_column:
headers = (
headers.copy()
) # So we don't change the list object passed to this function

# Handle the two standard size columns
for header in [fields.FIELD_AIP_SIZE, fields.FIELD_SIZE]:
            # If size header is found then insert another for the size in bytes after it
if header in headers:
bytes_header = field_lookup[header] + " (bytes)"
headers.insert(headers.index(header) + 1, bytes_header)

return [field_lookup.get(header, header) for header in headers]


@@ -89,12 +105,27 @@ def format_size_for_csv(rows):
:returns: rows with formatted size field (list of dicts)
"""
edited_rows = []

for row in rows:
try:
# Add size in bytes after original size column
row_key_list = list(row.keys())

if fields.FIELD_SIZE in row_key_list:
size_position = row_key_list.index(fields.FIELD_SIZE) + 1
row_items = list(row.items())

row_items.insert(
size_position, (fields.FIELD_SIZE_BYTES, row[fields.FIELD_SIZE])
)
row = dict(row_items)

# Format original size column
row[fields.FIELD_SIZE] = filesizeformat(row[fields.FIELD_SIZE])
except KeyError:
pass
return rows

edited_rows.append(row)

return edited_rows


def download_csv(headers, rows, filename="report.csv"):
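A companion sketch of the row side, using format_size_for_csv as changed above (illustrative only, not part of the commit): the raw byte count is copied into the new SizeBytes column before the original Size value is reformatted for display.

from AIPscan.Data import fields
from AIPscan.Reporter import helpers

rows = helpers.format_size_for_csv([{fields.FIELD_SIZE: 1000}])
# The raw value lands in the new column while the original column is
# formatted for display (values match the updated tests below):
# [{"Size": "1.0 kB", "SizeBytes": 1000}]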
9 changes: 6 additions & 3 deletions AIPscan/Reporter/report_aip_contents.py
@@ -22,7 +22,7 @@
fields.FIELD_FORMATS,
]

HEADERS = [
TABLE_HEADERS = [
fields.FIELD_AIP_NAME,
fields.FIELD_CREATED_DATE,
fields.FIELD_SIZE,
@@ -110,8 +110,9 @@ def aip_contents():
)

if csv:
headers = translate_headers(CSV_HEADERS, True)

filename = "aip_contents.csv"
headers = translate_headers(CSV_HEADERS)
aips = _create_aip_formats_string_representation(
aip_data.get(fields.FIELD_AIPS), separator="|"
)
@@ -120,12 +121,14 @@

aips = _create_aip_formats_string_representation(aip_data.get(fields.FIELD_AIPS))

headers = translate_headers(TABLE_HEADERS)

return render_template(
"report_aip_contents.html",
storage_service=storage_service_id,
storage_service_name=aip_data.get(fields.FIELD_STORAGE_NAME),
storage_location_description=aip_data.get(fields.FIELD_STORAGE_LOCATION),
columns=translate_headers(HEADERS),
columns=headers,
aips=aip_data.get(fields.FIELD_AIPS),
start_date=start_date,
end_date=get_display_end_date(end_date),
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_aips_by_format.py
@@ -28,8 +28,6 @@ def aips_by_format():
original_files = parse_bool(request.args.get(request_params.ORIGINAL_FILES, True))
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

aip_data = report_data.aips_by_file_format(
storage_service_id=storage_service_id,
file_format=file_format,
@@ -38,10 +36,14 @@
)

if csv:
headers = translate_headers(HEADERS, True)

filename = "aips_by_file_format_{}.csv".format(file_format)
csv_data = format_size_for_csv(aip_data[fields.FIELD_AIPS])
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_aips_by_format.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_aips_by_puid.py
@@ -53,8 +53,6 @@ def aips_by_puid():
original_files = parse_bool(request.args.get(request_params.ORIGINAL_FILES, True))
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

aip_data = report_data.aips_by_puid(
storage_service_id=storage_service_id,
puid=puid,
@@ -63,10 +61,14 @@
)

if csv:
headers = translate_headers(HEADERS, True)

filename = "aips_by_puid_{}.csv".format(puid)
csv_data = format_size_for_csv(aip_data[fields.FIELD_AIPS])
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_aips_by_puid.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_format_versions_count.py
@@ -41,13 +41,15 @@ def report_format_versions_count():
)
versions = version_data.get(fields.FIELD_FORMAT_VERSIONS)

headers = translate_headers(HEADERS)

if csv:
headers = translate_headers(HEADERS, True)

filename = "format_versions.csv"
csv_data = format_size_for_csv(versions)
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_format_versions_count.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_formats_count.py
@@ -51,13 +51,15 @@ def report_formats_count():
)
formats = formats_data.get(fields.FIELD_FORMATS)

headers = translate_headers(HEADERS)

if csv:
headers = translate_headers(HEADERS, True)

filename = "file_formats.csv"
csv_data = format_size_for_csv(formats)
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_formats_count.html",
storage_service_id=storage_service_id,
7 changes: 4 additions & 3 deletions AIPscan/Reporter/report_largest_aips.py
@@ -35,8 +35,6 @@ def largest_aips():
pass
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

aip_data = report_data.largest_aips(
storage_service_id=storage_service_id,
start_date=start_date,
@@ -46,11 +44,14 @@
)

if csv:
headers = translate_headers(HEADERS, True)

filename = "largest_aips.csv"
headers = translate_headers(HEADERS)
csv_data = format_size_for_csv(aip_data[fields.FIELD_AIPS])
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_largest_aips.html",
storage_service_id=storage_service_id,
9 changes: 5 additions & 4 deletions AIPscan/Reporter/report_largest_files.py
@@ -13,7 +13,7 @@
translate_headers,
)

HEADERS = [
TABLE_HEADERS = [
fields.FIELD_FILENAME,
fields.FIELD_SIZE,
fields.FIELD_FORMAT,
@@ -52,8 +52,6 @@ def largest_files():
pass
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

file_data = report_data.largest_files(
storage_service_id=storage_service_id,
start_date=start_date,
@@ -64,11 +62,14 @@
)

if csv:
headers = translate_headers(CSV_HEADERS, True)

filename = "largest_files.csv"
headers = translate_headers(CSV_HEADERS)
csv_data = format_size_for_csv(file_data[fields.FIELD_FILES])
return download_csv(headers, csv_data, filename)

headers = translate_headers(TABLE_HEADERS)

return render_template(
"report_largest_files.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_storage_locations.py
@@ -38,18 +38,20 @@ def storage_locations():
)
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

locations_data = report_data.storage_locations(
storage_service_id=storage_service_id, start_date=start_date, end_date=end_date
)
locations = locations_data.get(fields.FIELD_LOCATIONS)

if csv:
headers = translate_headers(HEADERS, True)

filename = "storage_locations.csv"
csv_data = format_size_for_csv(locations)
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_storage_locations.html",
storage_service_id=storage_service_id,
2 changes: 1 addition & 1 deletion AIPscan/Reporter/tests/test_aip_contents.py
@@ -1,6 +1,6 @@
from flask import current_app

EXPECTED_CSV_CONTENTS = b"UUID,AIP Name,Created Date,Size,Formats\r\n111111111111-1111-1111-11111111,Test AIP,2020-01-01 00:00:00,0 Bytes,fmt/43 (ACME File Format 0.0.0): 1 file|fmt/61 (ACME File Format 0.0.0): 1 file\r\n222222222222-2222-2222-22222222,Test AIP,2020-06-01 00:00:00,0 Bytes,x-fmt/111 (ACME File Format 0.0.0): 3 files|fmt/61 (ACME File Format 0.0.0): 2 files\r\n"
EXPECTED_CSV_CONTENTS = b"UUID,AIP Name,Created Date,Size,Size (bytes),Formats\r\n111111111111-1111-1111-11111111,Test AIP,2020-01-01 00:00:00,0 Bytes,0,fmt/43 (ACME File Format 0.0.0): 1 file|fmt/61 (ACME File Format 0.0.0): 1 file\r\n222222222222-2222-2222-22222222,Test AIP,2020-06-01 00:00:00,0 Bytes,0,x-fmt/111 (ACME File Format 0.0.0): 3 files|fmt/61 (ACME File Format 0.0.0): 2 files\r\n"


def test_aip_contents(aip_contents):
8 changes: 2 additions & 6 deletions AIPscan/Reporter/tests/test_aips_by_file_format.py
@@ -1,12 +1,8 @@
import pytest
from flask import current_app

EXPECTED_CSV_ORIGINAL = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB\r\n"
)
EXPECTED_CSV_PRESERVATION = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB\r\n"
)
EXPECTED_CSV_ORIGINAL = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB,1000\r\n"
EXPECTED_CSV_PRESERVATION = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB,2000\r\n"


@pytest.mark.parametrize(
8 changes: 2 additions & 6 deletions AIPscan/Reporter/tests/test_aips_by_puid.py
@@ -7,12 +7,8 @@
from AIPscan.models import File, FileType
from AIPscan.Reporter.report_aips_by_puid import get_format_string_from_puid

EXPECTED_CSV_ORIGINAL = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB\r\n"
)
EXPECTED_CSV_PRESERVATION = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB\r\n"
)
EXPECTED_CSV_ORIGINAL = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB,1000\r\n"
EXPECTED_CSV_PRESERVATION = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB,2000\r\n"

FILE_WITH_FORMAT_ONLY = File(
uuid=uuid.uuid4(),
2 changes: 1 addition & 1 deletion AIPscan/Reporter/tests/test_format_versions_count.py
@@ -1,6 +1,6 @@
from flask import current_app

EXPECTED_CSV_CONTENTS = b"PUID,Format,Version,Count,Size\r\nfmt/44,JPEG,1.02,1,2.0 kB\r\nfmt/43,JPEG,1.01,1,1.0 kB\r\nfmt/468,ISO Disk Image File,,1,0 Bytes\r\n"
EXPECTED_CSV_CONTENTS = b"PUID,Format,Version,Count,Size,Size (bytes)\r\nfmt/44,JPEG,1.02,1,2.0 kB,2000\r\nfmt/43,JPEG,1.01,1,1.0 kB,1000\r\nfmt/468,ISO Disk Image File,,1,0 Bytes,0\r\n"


def test_format_versions_count(app_with_populated_format_versions):
4 changes: 1 addition & 3 deletions AIPscan/Reporter/tests/test_formats_count.py
@@ -1,8 +1,6 @@
from flask import current_app

EXPECTED_CSV_CONTENTS = (
b"Format,Count,Size\r\nJPEG,2,3.0 kB\r\nISO Disk Image File,1,0 Bytes\r\n"
)
EXPECTED_CSV_CONTENTS = b"Format,Count,Size,Size (bytes)\r\nJPEG,2,3.0 kB,3000\r\nISO Disk Image File,1,0 Bytes,0\r\n"


def test_formats_count(app_with_populated_format_versions):
33 changes: 31 additions & 2 deletions AIPscan/Reporter/tests/test_helpers.py
@@ -13,7 +13,8 @@
from AIPscan.Data.tests import MOCK_STORAGE_SERVICE_ID as STORAGE_SERVICE_ID
from AIPscan.models import File, FileType
from AIPscan.Reporter import helpers
from AIPscan.Reporter.report_aips_by_format import HEADERS
from AIPscan.Reporter.report_aip_contents import CSV_HEADERS as AIP_CONTENTS_HEADERS
from AIPscan.Reporter.report_aips_by_format import HEADERS as AIPS_BY_FORMAT_HEADERS

ROWS_WITH_SIZE = [
{
@@ -33,11 +34,13 @@
fields.FIELD_AIP_UUID: "test uuid",
fields.FIELD_AIP_NAME: "test name",
fields.FIELD_SIZE: "1.6 MB",
fields.FIELD_SIZE_BYTES: 1560321,
},
{
fields.FIELD_AIP_UUID: "test uuid2",
fields.FIELD_AIP_NAME: "test name2",
fields.FIELD_SIZE: "123.4 kB",
fields.FIELD_SIZE_BYTES: 123423,
},
]

@@ -62,7 +65,7 @@ def test_download_csv(app_instance, mocker):
mock_get_ss_name = mocker.patch("AIPscan.Data._get_storage_service")
mock_get_ss_name.return_value = STORAGE_SERVICE

headers = helpers.translate_headers(HEADERS)
headers = helpers.translate_headers(AIPS_BY_FORMAT_HEADERS)

report_data = aips_by_file_format(STORAGE_SERVICE_ID, "test")
response = helpers.download_csv(headers, report_data[fields.FIELD_AIPS], CSV_FILE)
@@ -93,6 +96,32 @@ def test_download_csv(app_instance, mocker):
assert line_count == len(query_results) + 1


@pytest.mark.parametrize(
"data,expected_output",
[
# No adding of header for size in bytes
(
{"headers": AIPS_BY_FORMAT_HEADERS, "add_bytes_column": False},
["AIP Name", "UUID", "Count", "Size"],
),
# Adding of header for size in bytes at end of header list
(
{"headers": AIPS_BY_FORMAT_HEADERS, "add_bytes_column": True},
["AIP Name", "UUID", "Count", "Size", "Size (bytes)"],
),
# Adding of header for size in bytes not at end of header list
(
{"headers": AIP_CONTENTS_HEADERS, "add_bytes_column": True},
["UUID", "AIP Name", "Created Date", "Size", "Size (bytes)", "Formats"],
),
],
)
def test_translate_headers(data, expected_output):
headers = helpers.translate_headers(data["headers"], data["add_bytes_column"])

assert headers == expected_output


@pytest.mark.parametrize(
"data,expected_output",
[