Skip to content

Commit

Permalink
work
Browse files Browse the repository at this point in the history
  • Loading branch information
mcantelon committed Nov 24, 2024
1 parent 911875f commit 287b4da
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 60 deletions.
78 changes: 78 additions & 0 deletions AIPscan/Aggregator/mets_parse_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
"""
import lxml
import metsrw
import os
import requests

from AIPscan.Aggregator import database_helpers
from AIPscan.helpers import file_sha256_hash
from AIPscan.models import AIP
from AIPscan.Aggregator.task_helpers import (
create_numbered_subdirs,
get_mets_url,
Expand Down Expand Up @@ -101,3 +105,77 @@ def download_mets(
download_file = write_mets(mets_response, package_uuid, numbered_subdir)

return download_file


def _remove_mets_file(filename, logger):
    """Delete the METS file at ``filename``, logging a warning on failure.

    Deletion is best-effort: an ``OSError`` from ``os.remove`` is reported
    through ``logger`` and never propagated to the caller.
    """
    try:
        os.remove(filename)
    except OSError as err:
        logger.warning("Unable to delete METS file: {}".format(err))


def import_from_mets(
    filename,
    aip_size,
    package_uuid,
    storage_service_id,
    storage_location_id,
    fetch_job_id,
    origin_pipeline_id,
    logger,
    delete_file=False,
):
    """Create AIP database records from a METS file on disk.

    The METS file is hashed first; if an AIP with the same ``mets_sha256``
    already exists the file is treated as a duplicate and skipped. Otherwise
    the METS is parsed, any existing AIP records with the same
    ``package_uuid`` are deleted, and a new AIP record is created and
    populated from the METS.

    :param filename: Path to the METS file to import.
    :param aip_size: Value stored as the new AIP record's ``size``.
    :param package_uuid: UUID of the AIP; also used as the transfer name
        when the original name cannot be read from the METS.
    :param storage_service_id: Stored on the new AIP record.
    :param storage_location_id: Stored on the new AIP record.
    :param fetch_job_id: Stored on the new AIP record.
    :param origin_pipeline_id: Stored on the new AIP record.
    :param logger: Logger used for all progress and warning messages.
    :param delete_file: When true, remove ``filename`` after a duplicate is
        skipped or after a successful import.

    :returns: None
    """
    mets_name = os.path.basename(filename)
    mets_hash = file_sha256_hash(filename)

    # If METS file's hash matches an existing value, this is a duplicate of an
    # existing AIP and we can safely ignore it.
    matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
    if matching_aip is not None:
        logger.info(
            "Skipping METS file {} - identical to existing record".format(mets_name)
        )
        if delete_file:
            _remove_mets_file(filename, logger)
        return

    logger.info("Processing METS file {}".format(mets_name))

    try:
        mets = parse_mets_with_metsrw(filename)
    except METSError:
        # An error we need to log and report back to the user.
        # NOTE(review): nothing is logged here and the METS file is left on
        # disk even when delete_file is set - confirm that is intended.
        return

    try:
        original_name = get_aip_original_name(mets)
    except METSError:
        # Some other error with the METS file that we might want to
        # log and act upon. Fall back to the package UUID as the name.
        original_name = package_uuid

    # Delete records of any previous versions of this AIP, which will shortly
    # be replaced by new records from the updated METS.
    previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
    for previous_aip in previous_aips:
        logger.info(
            "Deleting record for AIP {} to replace from newer METS".format(package_uuid)
        )
        database_helpers.delete_aip_object(previous_aip)

    aip = database_helpers.create_aip_object(
        package_uuid=package_uuid,
        transfer_name=original_name,
        create_date=mets.createdate,
        mets_sha256=mets_hash,
        size=aip_size,
        storage_service_id=storage_service_id,
        storage_location_id=storage_location_id,
        fetch_job_id=fetch_job_id,
        origin_pipeline_id=origin_pipeline_id,
    )

    database_helpers.process_aip_data(aip, mets)

    # Delete METS file. Uses the logger passed in by the caller; this module
    # has no ``tasklogger`` of its own.
    if delete_file:
        _remove_mets_file(filename, logger)
68 changes: 11 additions & 57 deletions AIPscan/Aggregator/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
METSError,
download_mets,
get_aip_original_name,
import_from_mets,
parse_mets_with_metsrw,
)
from AIPscan.Aggregator.task_helpers import (
Expand Down Expand Up @@ -320,66 +321,19 @@ def get_mets(
timestamp_str,
package_list_no,
)
mets_name = os.path.basename(download_file)
mets_hash = file_sha256_hash(download_file)

# If METS file's hash matches an existing value, this is a duplicate of an
# existing AIP and we can safely ignore it.
matching_aip = AIP.query.filter_by(mets_sha256=mets_hash).first()
if matching_aip is not None:
tasklogger.info(
"Skipping METS file {} - identical to existing record".format(mets_name)
)
try:
os.remove(download_file)
except OSError as err:
tasklogger.warning("Unable to delete METS file: {}".format(err))
return

tasklogger.info("Processing METS file {}".format(mets_name))

try:
mets = parse_mets_with_metsrw(download_file)
except METSError:
# An error we need to log and report back to the user.
return

try:
original_name = get_aip_original_name(mets)
except METSError:
# Some other error with the METS file that we might want to
# log and act upon.
original_name = package_uuid

# Delete records of any previous versions of this AIP, which will shortly
# be replaced by new records from the updated METS.
previous_aips = AIP.query.filter_by(uuid=package_uuid).all()
for previous_aip in previous_aips:
tasklogger.info(
"Deleting record for AIP {} to replace from newer METS".format(package_uuid)
)
database_helpers.delete_aip_object(previous_aip)

aip = database_helpers.create_aip_object(
package_uuid=package_uuid,
transfer_name=original_name,
create_date=mets.createdate,
mets_sha256=mets_hash,
size=aip_size,
storage_service_id=storage_service_id,
storage_location_id=storage_location_id,
fetch_job_id=fetch_job_id,
origin_pipeline_id=origin_pipeline_id,
import_from_mets(
download_file,
aip_size,
package_uuid,
storage_service_id,
storage_location_id,
fetch_job_id,
origin_pipeline_id,
tasklogger,
delete_file=True
)

database_helpers.process_aip_data(aip, mets)

# Delete downloaded METS file.
try:
os.remove(download_file)
except OSError as err:
tasklogger.warning("Unable to delete METS file: {}".format(err))


@celery.task()
def delete_fetch_job(fetch_job_id):
Expand Down
9 changes: 6 additions & 3 deletions tools/app/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ def create_app_instance(configuration, db):
return app


def log_and_raise_click_error(logger, message):
logger.critical(message)

def raise_click_error(message):
    """Raise a ``click.ClickException`` for ``message`` with exit code 1."""
    error = click.ClickException(message)
    error.exit_code = 1
    raise error


def log_and_raise_click_error(logger, message):
    """Log ``message`` at critical level, then raise it as a click error."""
    logger.critical(message)
    raise_click_error(message)

0 comments on commit 287b4da

Please sign in to comment.