Add sortable size CSV field (#149) (#261)
Package and file size values in CSV exports were formatted to be human readable,
which was good for legibility but impeded the ability to sort rows by size.
Now, whenever size data is included in an export, an additional column containing
the raw byte value of each package or file size is populated automatically.
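
For illustration, a minimal sketch of the header side of the change (not part of the commit): the import paths follow the test files in this diff, and the expected values come from the updated tests.

from AIPscan.Data import fields
from AIPscan.Reporter import helpers

# Requesting the extra column appends a raw-byte header directly after the
# human-readable "Size" header.
headers = helpers.translate_headers(
    [fields.FIELD_AIP_NAME, fields.FIELD_SIZE], add_bytes_column=True
)
print(headers)  # ["AIP Name", "Size", "Size (bytes)"]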
mcantelon authored Dec 6, 2023
1 parent 70701ad commit 89d6466
Showing 20 changed files with 121 additions and 49 deletions.
1 change: 1 addition & 0 deletions AIPscan/Data/fields.py
@@ -53,6 +53,7 @@
FIELD_RELATED_PAIRING = "RelatedPairing"

FIELD_SIZE = "Size"
FIELD_SIZE_BYTES = "SizeBytes"
FIELD_STORAGE_LOCATION = "StorageLocation"
FIELD_STORAGE_NAME = "StorageName"

41 changes: 36 additions & 5 deletions AIPscan/Reporter/helpers.py
@@ -28,7 +28,7 @@ def sort_puids(puids):
return natsorted(puids)


def translate_headers(headers):
def translate_headers(headers, add_bytes_column=False):
"""Translate headers from something machine readable to something
more user friendly and translatable.
"""
@@ -69,6 +69,22 @@ def translate_headers(headers):
fields.FIELD_USER: "User",
fields.FIELD_VERSION: "Version",
}

# Attempt to add an additional header representing a column containing size
# expressed as a number of bytes, rather than in human-readable form, so
# rows can more easily be sorted by size
if add_bytes_column:
headers = (
headers.copy()
) # So we don't change the list object passed to this function

# Handle the two standard size columns
for header in [fields.FIELD_AIP_SIZE, fields.FIELD_SIZE]:
            # If size header is found then insert another for the size in bytes after it
if header in headers:
bytes_header = field_lookup[header] + " (bytes)"
headers.insert(headers.index(header) + 1, bytes_header)

return [field_lookup.get(header, header) for header in headers]


@@ -89,12 +105,27 @@ def format_size_for_csv(rows):
:returns: rows with formatted size field (list of dicts)
"""
edited_rows = []

for row in rows:
try:
# Add size in bytes after original size column
row_key_list = list(row.keys())

if fields.FIELD_SIZE in row_key_list:
size_position = row_key_list.index(fields.FIELD_SIZE) + 1
row_items = list(row.items())

row_items.insert(
size_position, (fields.FIELD_SIZE_BYTES, row[fields.FIELD_SIZE])
)
row = dict(row_items)

# Format original size column
row[fields.FIELD_SIZE] = filesizeformat(row[fields.FIELD_SIZE])
except KeyError:
pass
return rows

edited_rows.append(row)

return edited_rows


def download_csv(headers, rows, filename="report.csv"):
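A companion sketch of the row side, using format_size_for_csv as changed above (illustrative only, not part of the commit): the raw byte count is copied into the new SizeBytes column before the original Size value is reformatted for display.

from AIPscan.Data import fields
from AIPscan.Reporter import helpers

rows = helpers.format_size_for_csv([{fields.FIELD_SIZE: 1000}])
# The raw value lands in the new column while the original column is
# formatted for display (values match the updated tests below):
# [{"Size": "1.0 kB", "SizeBytes": 1000}]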
9 changes: 6 additions & 3 deletions AIPscan/Reporter/report_aip_contents.py
@@ -22,7 +22,7 @@
fields.FIELD_FORMATS,
]

HEADERS = [
TABLE_HEADERS = [
fields.FIELD_AIP_NAME,
fields.FIELD_CREATED_DATE,
fields.FIELD_SIZE,
@@ -110,8 +110,9 @@ def aip_contents():
)

if csv:
headers = translate_headers(CSV_HEADERS, True)

filename = "aip_contents.csv"
headers = translate_headers(CSV_HEADERS)
aips = _create_aip_formats_string_representation(
aip_data.get(fields.FIELD_AIPS), separator="|"
)
@@ -120,12 +121,14 @@

aips = _create_aip_formats_string_representation(aip_data.get(fields.FIELD_AIPS))

headers = translate_headers(TABLE_HEADERS)

return render_template(
"report_aip_contents.html",
storage_service=storage_service_id,
storage_service_name=aip_data.get(fields.FIELD_STORAGE_NAME),
storage_location_description=aip_data.get(fields.FIELD_STORAGE_LOCATION),
columns=translate_headers(HEADERS),
columns=headers,
aips=aip_data.get(fields.FIELD_AIPS),
start_date=start_date,
end_date=get_display_end_date(end_date),
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_aips_by_format.py
@@ -28,8 +28,6 @@ def aips_by_format():
original_files = parse_bool(request.args.get(request_params.ORIGINAL_FILES, True))
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

aip_data = report_data.aips_by_file_format(
storage_service_id=storage_service_id,
file_format=file_format,
@@ -38,10 +36,14 @@
)

if csv:
headers = translate_headers(HEADERS, True)

filename = "aips_by_file_format_{}.csv".format(file_format)
csv_data = format_size_for_csv(aip_data[fields.FIELD_AIPS])
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_aips_by_format.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_aips_by_puid.py
@@ -53,8 +53,6 @@ def aips_by_puid():
original_files = parse_bool(request.args.get(request_params.ORIGINAL_FILES, True))
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

aip_data = report_data.aips_by_puid(
storage_service_id=storage_service_id,
puid=puid,
@@ -63,10 +61,14 @@
)

if csv:
headers = translate_headers(HEADERS, True)

filename = "aips_by_puid_{}.csv".format(puid)
csv_data = format_size_for_csv(aip_data[fields.FIELD_AIPS])
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_aips_by_puid.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_format_versions_count.py
@@ -41,13 +41,15 @@ def report_format_versions_count():
)
versions = version_data.get(fields.FIELD_FORMAT_VERSIONS)

headers = translate_headers(HEADERS)

if csv:
headers = translate_headers(HEADERS, True)

filename = "format_versions.csv"
csv_data = format_size_for_csv(versions)
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_format_versions_count.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_formats_count.py
@@ -51,13 +51,15 @@ def report_formats_count():
)
formats = formats_data.get(fields.FIELD_FORMATS)

headers = translate_headers(HEADERS)

if csv:
headers = translate_headers(HEADERS, True)

filename = "file_formats.csv"
csv_data = format_size_for_csv(formats)
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_formats_count.html",
storage_service_id=storage_service_id,
7 changes: 4 additions & 3 deletions AIPscan/Reporter/report_largest_aips.py
@@ -35,8 +35,6 @@ def largest_aips():
pass
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

aip_data = report_data.largest_aips(
storage_service_id=storage_service_id,
start_date=start_date,
@@ -46,11 +44,14 @@
)

if csv:
headers = translate_headers(HEADERS, True)

filename = "largest_aips.csv"
headers = translate_headers(HEADERS)
csv_data = format_size_for_csv(aip_data[fields.FIELD_AIPS])
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_largest_aips.html",
storage_service_id=storage_service_id,
9 changes: 5 additions & 4 deletions AIPscan/Reporter/report_largest_files.py
@@ -13,7 +13,7 @@
translate_headers,
)

HEADERS = [
TABLE_HEADERS = [
fields.FIELD_FILENAME,
fields.FIELD_SIZE,
fields.FIELD_FORMAT,
@@ -52,8 +52,6 @@ def largest_files():
pass
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

file_data = report_data.largest_files(
storage_service_id=storage_service_id,
start_date=start_date,
@@ -64,11 +62,14 @@
)

if csv:
headers = translate_headers(CSV_HEADERS, True)

filename = "largest_files.csv"
headers = translate_headers(CSV_HEADERS)
csv_data = format_size_for_csv(file_data[fields.FIELD_FILES])
return download_csv(headers, csv_data, filename)

headers = translate_headers(TABLE_HEADERS)

return render_template(
"report_largest_files.html",
storage_service_id=storage_service_id,
6 changes: 4 additions & 2 deletions AIPscan/Reporter/report_storage_locations.py
@@ -38,18 +38,20 @@ def storage_locations():
)
csv = parse_bool(request.args.get(request_params.CSV), default=False)

headers = translate_headers(HEADERS)

locations_data = report_data.storage_locations(
storage_service_id=storage_service_id, start_date=start_date, end_date=end_date
)
locations = locations_data.get(fields.FIELD_LOCATIONS)

if csv:
headers = translate_headers(HEADERS, True)

filename = "storage_locations.csv"
csv_data = format_size_for_csv(locations)
return download_csv(headers, csv_data, filename)

headers = translate_headers(HEADERS)

return render_template(
"report_storage_locations.html",
storage_service_id=storage_service_id,
2 changes: 1 addition & 1 deletion AIPscan/Reporter/tests/test_aip_contents.py
@@ -1,6 +1,6 @@
from flask import current_app

EXPECTED_CSV_CONTENTS = b"UUID,AIP Name,Created Date,Size,Formats\r\n111111111111-1111-1111-11111111,Test AIP,2020-01-01 00:00:00,0 Bytes,fmt/43 (ACME File Format 0.0.0): 1 file|fmt/61 (ACME File Format 0.0.0): 1 file\r\n222222222222-2222-2222-22222222,Test AIP,2020-06-01 00:00:00,0 Bytes,x-fmt/111 (ACME File Format 0.0.0): 3 files|fmt/61 (ACME File Format 0.0.0): 2 files\r\n"
EXPECTED_CSV_CONTENTS = b"UUID,AIP Name,Created Date,Size,Size (bytes),Formats\r\n111111111111-1111-1111-11111111,Test AIP,2020-01-01 00:00:00,0 Bytes,0,fmt/43 (ACME File Format 0.0.0): 1 file|fmt/61 (ACME File Format 0.0.0): 1 file\r\n222222222222-2222-2222-22222222,Test AIP,2020-06-01 00:00:00,0 Bytes,0,x-fmt/111 (ACME File Format 0.0.0): 3 files|fmt/61 (ACME File Format 0.0.0): 2 files\r\n"


def test_aip_contents(aip_contents):
8 changes: 2 additions & 6 deletions AIPscan/Reporter/tests/test_aips_by_file_format.py
@@ -1,12 +1,8 @@
import pytest
from flask import current_app

EXPECTED_CSV_ORIGINAL = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB\r\n"
)
EXPECTED_CSV_PRESERVATION = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB\r\n"
)
EXPECTED_CSV_ORIGINAL = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB,1000\r\n"
EXPECTED_CSV_PRESERVATION = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB,2000\r\n"


@pytest.mark.parametrize(
8 changes: 2 additions & 6 deletions AIPscan/Reporter/tests/test_aips_by_puid.py
@@ -7,12 +7,8 @@
from AIPscan.models import File, FileType
from AIPscan.Reporter.report_aips_by_puid import get_format_string_from_puid

EXPECTED_CSV_ORIGINAL = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB\r\n"
)
EXPECTED_CSV_PRESERVATION = (
b"AIP Name,UUID,Count,Size\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB\r\n"
)
EXPECTED_CSV_ORIGINAL = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,1.0 kB,1000\r\n"
EXPECTED_CSV_PRESERVATION = b"AIP Name,UUID,Count,Size,Size (bytes)\r\nTest AIP,111111111111-1111-1111-11111111,1,2.0 kB,2000\r\n"

FILE_WITH_FORMAT_ONLY = File(
uuid=uuid.uuid4(),
2 changes: 1 addition & 1 deletion AIPscan/Reporter/tests/test_format_versions_count.py
@@ -1,6 +1,6 @@
from flask import current_app

EXPECTED_CSV_CONTENTS = b"PUID,Format,Version,Count,Size\r\nfmt/44,JPEG,1.02,1,2.0 kB\r\nfmt/43,JPEG,1.01,1,1.0 kB\r\nfmt/468,ISO Disk Image File,,1,0 Bytes\r\n"
EXPECTED_CSV_CONTENTS = b"PUID,Format,Version,Count,Size,Size (bytes)\r\nfmt/44,JPEG,1.02,1,2.0 kB,2000\r\nfmt/43,JPEG,1.01,1,1.0 kB,1000\r\nfmt/468,ISO Disk Image File,,1,0 Bytes,0\r\n"


def test_format_versions_count(app_with_populated_format_versions):
4 changes: 1 addition & 3 deletions AIPscan/Reporter/tests/test_formats_count.py
@@ -1,8 +1,6 @@
from flask import current_app

EXPECTED_CSV_CONTENTS = (
b"Format,Count,Size\r\nJPEG,2,3.0 kB\r\nISO Disk Image File,1,0 Bytes\r\n"
)
EXPECTED_CSV_CONTENTS = b"Format,Count,Size,Size (bytes)\r\nJPEG,2,3.0 kB,3000\r\nISO Disk Image File,1,0 Bytes,0\r\n"


def test_formats_count(app_with_populated_format_versions):
33 changes: 31 additions & 2 deletions AIPscan/Reporter/tests/test_helpers.py
@@ -13,7 +13,8 @@
from AIPscan.Data.tests import MOCK_STORAGE_SERVICE_ID as STORAGE_SERVICE_ID
from AIPscan.models import File, FileType
from AIPscan.Reporter import helpers
from AIPscan.Reporter.report_aips_by_format import HEADERS
from AIPscan.Reporter.report_aip_contents import CSV_HEADERS as AIP_CONTENTS_HEADERS
from AIPscan.Reporter.report_aips_by_format import HEADERS as AIPS_BY_FORMAT_HEADERS

ROWS_WITH_SIZE = [
{
@@ -33,11 +34,13 @@
fields.FIELD_AIP_UUID: "test uuid",
fields.FIELD_AIP_NAME: "test name",
fields.FIELD_SIZE: "1.6 MB",
fields.FIELD_SIZE_BYTES: 1560321,
},
{
fields.FIELD_AIP_UUID: "test uuid2",
fields.FIELD_AIP_NAME: "test name2",
fields.FIELD_SIZE: "123.4 kB",
fields.FIELD_SIZE_BYTES: 123423,
},
]

@@ -62,7 +65,7 @@ def test_download_csv(app_instance, mocker):
mock_get_ss_name = mocker.patch("AIPscan.Data._get_storage_service")
mock_get_ss_name.return_value = STORAGE_SERVICE

headers = helpers.translate_headers(HEADERS)
headers = helpers.translate_headers(AIPS_BY_FORMAT_HEADERS)

report_data = aips_by_file_format(STORAGE_SERVICE_ID, "test")
response = helpers.download_csv(headers, report_data[fields.FIELD_AIPS], CSV_FILE)
@@ -93,6 +96,32 @@ def test_download_csv(app_instance, mocker):
assert line_count == len(query_results) + 1


@pytest.mark.parametrize(
"data,expected_output",
[
# No adding of header for size in bytes
(
{"headers": AIPS_BY_FORMAT_HEADERS, "add_bytes_column": False},
["AIP Name", "UUID", "Count", "Size"],
),
# Adding of header for size in bytes at end of header list
(
{"headers": AIPS_BY_FORMAT_HEADERS, "add_bytes_column": True},
["AIP Name", "UUID", "Count", "Size", "Size (bytes)"],
),
# Adding of header for size in bytes not at end of header list
(
{"headers": AIP_CONTENTS_HEADERS, "add_bytes_column": True},
["UUID", "AIP Name", "Created Date", "Size", "Size (bytes)", "Formats"],
),
],
)
def test_translate_headers(data, expected_output):
headers = helpers.translate_headers(data["headers"], data["add_bytes_column"])

assert headers == expected_output


@pytest.mark.parametrize(
"data,expected_output",
[