Skip to content

Commit

Permalink
Add largest files report
Browse files Browse the repository at this point in the history
  • Loading branch information
tw4l committed Oct 20, 2020
1 parent a22c214 commit a25d742
Show file tree
Hide file tree
Showing 10 changed files with 449 additions and 6 deletions.
30 changes: 30 additions & 0 deletions AIPscan/API/namespace_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,33 @@ def get(self, storage_service_id):
"""AIP overview two"""
aip_data = data.derivative_overview(storage_service_id=storage_service_id)
return aip_data


# REST resource exposing the "largest files" report for one Storage Service.
@api.route("/largest-files/<storage_service_id>")
class LargestFileList(Resource):
    @api.doc(
        # NOTE(review): "list_formats" does not describe this endpoint and
        # may collide with another operation's id -- confirm before renaming,
        # since generated API clients could depend on the current value.
        "list_formats",
        params={
            "file_type": {
                "description": "Optional file type filter (original or preservation)",
                "in": "query",
                # Swagger 2.0 primitive type names are "string"/"integer",
                # not the Python type names "str"/"int".
                "type": "string",
            },
            "limit": {
                "description": "Number of results to return (default is 20)",
                "in": "query",
                "type": "integer",
            },
        },
    )
    def get(self, storage_service_id, file_type=None, limit=20):
        """Largest files"""
        # Both optional values are read from the query string; the keyword
        # parameters only supply the defaults used when parsing fails.
        file_type = request.args.get("file_type", None)
        try:
            limit = int(request.args.get("limit", 20))
        except ValueError:
            # A non-numeric "limit" is ignored on purpose: keep the value
            # from the signature (20 by default) instead of failing.
            pass
        file_data = data.largest_files(
            storage_service_id=storage_service_id, file_type=file_type, limit=limit
        )
        return file_data
94 changes: 91 additions & 3 deletions AIPscan/Data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
from AIPscan.models import AIP, File, FileType, StorageService


# Field-name constants: the string keys used for entries in report dicts.
# NOTE(review): FIELD_AIP_NAME, FIELD_AIP_SIZE and FIELD_ALL_AIPS are each
# assigned twice below with different casing ("Aip..." vs "AIP..."). This looks
# like a rendered diff with removed and added lines interleaved -- confirm
# which spelling is current before relying on either value.
FIELD_AIP_NAME = "AipName"
FIELD_AIP = "AIP"
FIELD_AIP_ID = "AIPID"
FIELD_AIP_NAME = "AIPName"
FIELD_AIP_SIZE = "AIPSize"
FIELD_AIP_UUID = "AIPUUID"
FIELD_AIPS = "AIPs"
FIELD_AIP_SIZE = "AipSize"
FIELD_ALL_AIPS = "AllAips"
FIELD_ALL_AIPS = "AllAIPs"

FIELD_COUNT = "Count"
FIELD_CREATED_DATE = "CreatedDate"
Expand All @@ -17,16 +20,23 @@
# More report field-name constants, kept in alphabetical groups separated by
# blank lines.
FIELD_DERIVATIVE_FORMAT = "DerivativeFormat"
FIELD_DERIVATIVE_UUID = "DerivativeUUID"

# FIELD_FILES keys the list of per-file dicts in the largest-files report.
FIELD_FILES = "Files"
FIELD_FILE_COUNT = "FileCount"
FIELD_FILE_TYPE = "FileType"
FIELD_FILENAME = "Filename"
FIELD_FORMAT = "Format"
FIELD_FORMATS = "Formats"

FIELD_NAME = "Name"

FIELD_ORIGINAL_UUID = "OriginalUUID"
FIELD_ORIGINAL_FORMAT = "OriginalFormat"

FIELD_PUID = "PUID"

FIELD_RELATED_PAIRING = "RelatedPairing"

FIELD_SIZE = "Size"
# FIELD_STORAGE_NAME keys the Storage Service name in report dicts.
FIELD_STORAGE_NAME = "StorageName"

FIELD_TRANSFER_NAME = "TransferName"
Expand Down Expand Up @@ -204,3 +214,81 @@ def derivative_overview(storage_service_id):
report[FIELD_STORAGE_NAME] = storage_service.name

return report


def _largest_files_query(storage_service_id, file_type, limit):
    """Fetch file information from database for largest files query

    This is separated into its own helper function to aid in testing.

    :param storage_service_id: Storage Service ID used to filter files.
    :param file_type: Optional file type filter. It is applied only when
        it equals the value of one of the FileType members; any other
        value (including None) is ignored and all file types are returned.
    :param limit: Upper limit of number of results to return.
    :returns: Query over File rows, ordered by size descending and
        limited to ``limit`` results.
    """
    valid_file_types = {item.value for item in FileType}

    # Build the shared part of the query once; the file type filter is the
    # only thing that differs between the filtered and unfiltered variants.
    query = (
        File.query.join(AIP)
        .join(StorageService)
        .filter(StorageService.id == storage_service_id)
    )
    if file_type is not None and file_type in valid_file_types:
        query = query.filter(File.file_type == file_type)

    return query.order_by(File.size.desc()).limit(limit)


def largest_files(storage_service_id, file_type=None, limit=20):
    """Return a summary of the largest files in a given Storage Service

    :param storage_service_id: Storage Service ID.
    :param file_type: Optional filter for type of file to return
        (acceptable values are "original" or "preservation").
    :param limit: Upper limit of number of results to return.

    :returns: "report" dict containing following fields:
        report["StorageName"]: Name of Storage Service queried
        report["Files"]: List of result files ordered desc by size
    """
    file_entries = []
    report = {
        FIELD_FILES: file_entries,
        FIELD_STORAGE_NAME: _get_storage_service(storage_service_id).name,
    }

    # Attributes copied only when reading them does not raise
    # AttributeError; order here fixes the key order in each entry.
    optional_fields = (
        (FIELD_FORMAT, "file_format"),
        (FIELD_VERSION, "format_version"),
        (FIELD_PUID, "puid"),
    )

    for row in _largest_files_query(storage_service_id, file_type, limit):
        entry = {
            "id": row.id,
            FIELD_UUID: row.uuid,
            FIELD_NAME: row.name,
            FIELD_SIZE: int(row.size),
            FIELD_AIP_ID: row.aip_id,
            FIELD_FILE_TYPE: row.file_type.value,
        }

        for key, attribute in optional_fields:
            try:
                entry[key] = getattr(row, attribute)
            except AttributeError:
                pass

        # AIP details are attached only when the lookup finds a record.
        parent_aip = AIP.query.get(row.aip_id)
        if parent_aip is not None:
            entry[FIELD_AIP_NAME] = parent_aip.transfer_name
            entry[FIELD_AIP_UUID] = parent_aip.uuid

        file_entries.append(entry)

    return report
Empty file added AIPscan/Data/tests/__init__.py
Empty file.
136 changes: 136 additions & 0 deletions AIPscan/Data/tests/test_largest_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-

import datetime
import pytest
import uuid

from AIPscan.Data import data
from AIPscan.models import AIP, File, FileType, StorageService

# Shared fixtures for the largest-files tests. These model instances are
# built in memory and handed to the code under test through mocks.

# Three File fixtures: two originals and one preservation file. Only
# TEST_FILES[1] sets a puid and only TEST_FILES[2] sets a format_version;
# test_largest_files_elements relies on that split.
TEST_FILES = [
File(
uuid=uuid.uuid4(),
name="test.csv",
size=1234567,
aip_id=1,
file_type=FileType.original,
file_format="Comma Separated Values",
filepath="/path/to/file.csv",
date_created=datetime.datetime.now(),
checksum_type="md5",
checksum_value="fakemd5",
),
File(
uuid=uuid.uuid4(),
name="test.txt",
size=12345,
aip_id=2,
file_type=FileType.original,
file_format="Plain Text File",
puid="x-fmt/111",
filepath="/path/to/file.txt",
date_created=datetime.datetime.now(),
checksum_type="md5",
checksum_value="anotherfakemd5",
),
File(
uuid=uuid.uuid4(),
name="test.pdf",
size=12345678,
aip_id=1,
file_type=FileType.preservation,
file_format="Acrobat PDF/A - Portable Document Format",
format_version="1b",
filepath="/path/to/test.pdf",
date_created=datetime.datetime.now(),
checksum_type="md5",
checksum_value="yetanotherfakemd5",
original_file_id=1,
),
]

# Storage Service returned by the mocked _get_storage_service helper.
MOCK_STORAGE_SERVICE_ID = 1
MOCK_STORAGE_SERVICE_NAME = "some name"
TEST_STORAGE_SERVICE = StorageService(
name=MOCK_STORAGE_SERVICE_NAME,
url="http://example.com",
user_name="test",
api_key="test",
download_limit=20,
download_offset=10,
default=False,
)

# AIP returned by the mocked Query.get call, whatever aip_id is requested.
MOCK_AIP_NAME = "Test transfer"
MOCK_AIP_UUID = uuid.uuid4()
TEST_AIP = AIP(
uuid=MOCK_AIP_UUID,
transfer_name=MOCK_AIP_NAME,
create_date=datetime.datetime.now(),
storage_service_id=MOCK_STORAGE_SERVICE_ID,
fetch_job_id=1,
)


@pytest.mark.parametrize(
    "file_data, file_count", [([], 0), (TEST_FILES, 3), (TEST_FILES[:2], 2)]
)
def test_largest_files(mocker, file_data, file_count):
    """Test that return value conforms to expected structure.

    The database helpers are patched out, so the report is built from the
    parametrized file list, the fixture Storage Service and the fixture AIP.
    """
    mocker.patch("AIPscan.Data.data._largest_files_query", return_value=file_data)
    mocker.patch(
        "AIPscan.Data.data._get_storage_service", return_value=TEST_STORAGE_SERVICE
    )
    mocker.patch("sqlalchemy.orm.query.Query.get", return_value=TEST_AIP)

    result = data.largest_files(MOCK_STORAGE_SERVICE_ID)
    returned_files = result[data.FIELD_FILES]

    assert result[data.FIELD_STORAGE_NAME] == MOCK_STORAGE_SERVICE_NAME
    assert len(returned_files) == file_count


@pytest.mark.parametrize(
    "test_file, has_format_version, has_puid",
    [
        (TEST_FILES[0], False, False),
        (TEST_FILES[1], False, True),
        (TEST_FILES[2], True, False),
    ],
)
def test_largest_files_elements(mocker, test_file, has_format_version, has_puid):
    """Test that returned file data matches expected values.

    Each case reports on a single fixture file and checks its required
    fields, its optional version/PUID fields and the attached AIP details.
    """
    mocker.patch("AIPscan.Data.data._largest_files_query", return_value=[test_file])
    mocker.patch(
        "AIPscan.Data.data._get_storage_service", return_value=TEST_STORAGE_SERVICE
    )
    mocker.patch("sqlalchemy.orm.query.Query.get", return_value=TEST_AIP)

    entry = data.largest_files(MOCK_STORAGE_SERVICE_ID)[data.FIELD_FILES][0]

    # Required elements
    assert test_file.name == entry.get(data.FIELD_NAME)
    assert test_file.file_format == entry.get(data.FIELD_FORMAT)

    # Optional elements
    reported_version = entry.get(data.FIELD_VERSION)
    if not has_format_version:
        assert reported_version is None
    else:
        assert test_file.format_version == reported_version

    reported_puid = entry.get(data.FIELD_PUID)
    if not has_puid:
        assert reported_puid is None
    else:
        assert test_file.puid == reported_puid

    # AIP information
    assert entry.get(data.FIELD_AIP_NAME) == MOCK_AIP_NAME
    assert entry.get(data.FIELD_AIP_UUID) == MOCK_AIP_UUID
10 changes: 8 additions & 2 deletions AIPscan/Reporter/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,26 @@ def translate_headers(headers):
"""
field_lookup = {
data.FIELD_AIP_NAME: "AIP Name",
data.FIELD_AIP: "AIP",
data.FIELD_AIPS: "AIPs",
data.FIELD_AIP_SIZE: "Aip Size",
data.FIELD_ALL_AIPS: "All Aips",
data.FIELD_AIP_SIZE: "AIP Size",
data.FIELD_ALL_AIPS: "All AIPs",
data.FIELD_COUNT: "Count",
data.FIELD_CREATED_DATE: "Created Date",
data.FIELD_DERIVATIVE_COUNT: "Derivative Count",
data.FIELD_DERIVATIVE_FORMAT: "Derivative Format",
data.FIELD_DERIVATIVE_UUID: "Derivative UUID",
data.FIELD_FILE_COUNT: "File Count",
data.FIELD_FILE_TYPE: "Type",
data.FIELD_FILENAME: "Filename",
data.FIELD_FORMAT: "Format",
data.FIELD_FORMATS: "Formats",
data.FIELD_NAME: "Name",
data.FIELD_ORIGINAL_UUID: "Original UUID",
data.FIELD_ORIGINAL_FORMAT: "Original Format",
data.FIELD_PUID: "PUID",
data.FIELD_RELATED_PAIRING: "Related Pairing",
data.FIELD_SIZE: "Size",
data.FIELD_STORAGE_NAME: "Storage Service Name",
data.FIELD_TRANSFER_NAME: "Transfer Name",
data.FIELD_VERSION: "Version",
Expand Down
40 changes: 40 additions & 0 deletions AIPscan/Reporter/report_largest_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-

from flask import render_template, request

from AIPscan.Data import data
from AIPscan.Reporter import reporter, translate_headers


@reporter.route("/largest_files/", methods=["GET"])
def largest_files():
    """Return largest files.

    Reads ``amss_id``, ``file_type`` and ``limit`` from the query string,
    fetches the report data and renders ``report_largest_files.html``.
    """
    query_args = request.args
    storage_service_id = query_args.get("amss_id")
    file_type = query_args.get("file_type")

    # TODO: Make the default limit configurable - currently fixed at 20.
    try:
        limit = int(query_args.get("limit", 20))
    except ValueError:
        # A non-numeric limit falls back to the default.
        limit = 20

    file_data = data.largest_files(
        storage_service_id=storage_service_id, file_type=file_type, limit=limit
    )

    # Column keys for the report table, translated to display labels below.
    headers = [
        data.FIELD_FILENAME,
        data.FIELD_SIZE,
        data.FIELD_FORMAT,
        data.FIELD_PUID,
        data.FIELD_FILE_TYPE,
        data.FIELD_AIP,
    ]
    template_context = {
        "storage_service_id": storage_service_id,
        "storage_service_name": file_data[data.FIELD_STORAGE_NAME],
        "columns": translate_headers(headers),
        "files": file_data[data.FIELD_FILES],
        "file_type": file_type,
        "limit": limit,
    }
    return render_template("report_largest_files.html", **template_context)
3 changes: 2 additions & 1 deletion AIPscan/Reporter/templates/file.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
<h2 class="h3">File: {{ file_.name }}</h2>

<table class="table table-bordered table-condensed" style="margin-top: 20px;">
<tr><td width=20%><strong>Filepath</strong></td><td>{{ file_.filepath }}</td></tr>
<tr>
<td width=20%>
<strong>AIP</strong>
</td>
<td>
<a href="{{ url_for('reporter.view_aip', aip_id=aip.id ) }}">
{{ aip.transfer_name }} {{ aip.uuid }}
{{ aip.transfer_name }}-{{ aip.uuid }}
</a>
</td>
</tr>
Expand Down
Loading

0 comments on commit a25d742

Please sign in to comment.