Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write ASpace XLSX by default #44

Merged
merged 3 commits into from
Aug 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@

Analyze disk images and/or create ready-to-ingest SIPs from a directory of disk images and related files.

Version: 1.1.1
Version: 1.2.0

## Breaking Changes

Starting in v1.2.0, diskimageprocessor.py and the Processing mode of the GUI populate an ArchivesSpace description import XLSX instead of the previous ISAD-based CSV.

To have Disk Image Processor create the original ISAD-based description CSV instead, use the `-c` or `--csv` option (GUI support not yet added — for now, use version 1.1.0 or earlier from the Releases tab for a GUI that writes the description CSV).

## Usage

Disk Image Processor has two modes: Analysis and Processing. Each mode can be run from the GUI interface or as a separate CLI utility by calling the underlying Python 3 script.
Disk Image Processor has two modes: Analysis and Processing. Each mode can be run from the GUI interface or as a separate CLI utility by calling the underlying Python 3 script.

### Analysis

Expand Down Expand Up @@ -48,7 +54,12 @@ For HFS file systems, files are exported from the disk image using CLI version o

For UDF file systems, files are copied from the mounted disk image and `walk_to_dfxml.py` is used to generate DFXML.

When complete, a "description.csv" spreadsheet is created containing some pre-populated archival description:
When complete, a description spreadsheet will be created containing some pre-populated archival description.

From v1.2.0, Disk Image Processor will write this information into an ArchivesSpace description XLSX spreadsheet.

In previous versions, or if the `-c`/`--csv` option is passed in v1.2.0+, a description.csv file will be created instead, containing the following columns:

* Date statement
* Date begin
* Date end
Expand Down
Binary file added aspace_template/aspace_import_template.xlsx
Binary file not shown.
189 changes: 187 additions & 2 deletions diskimageprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
import datetime
import itertools
import logging
import openpyxl
import os
import shutil
import stat
import subprocess
import sys
import time
Expand Down Expand Up @@ -256,6 +258,174 @@ def create_spreadsheet(args, sips, volumes, logger):
logger.info("Description CSV created.")


def create_aspace_excel_sheet(args, sips, volumes, logger):
    """Create new copy of ASpace XLSX and append rows describing disk images.

    Copies the bundled ArchivesSpace import template into ``args.destination``
    as "description.xlsx", then appends one row per SIP directory found in
    ``sips``, populated with dates, extents, and a scope/content note derived
    from the DFXML and Brunnhilde outputs inside each SIP.

    Args:
        args: Parsed CLI arguments; reads ``args.destination`` and ``args.bagfiles``.
        sips: Path to the directory containing SIP subdirectories.
        volumes: Dict mapping SIP directory name to a list of volume info dicts
            (each with at least a "file_system" key).
        logger: Logger used for progress, warning, and error reporting.
    """
    xlsx_path = os.path.abspath(os.path.join(args.destination, "description.xlsx"))
    template_path = os.path.abspath(
        os.path.join(THIS_DIR, "aspace_template", "aspace_import_template.xlsx")
    )

    try:
        shutil.copyfile(template_path, xlsx_path)
    except OSError as err:
        # Fix: abort early. Previously execution continued and crashed in
        # os.chmod/load_workbook below on a file that was never created.
        logger.error(f"Unable to copy ASpace template to destination: {err}")
        return

    # Set ASpace file permissions (rw-rw-r--)
    try:
        os.chmod(
            xlsx_path,
            stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH,
        )
    except OSError as err:
        logger.error(f"Error setting permissions: {err}")

    workbook = openpyxl.load_workbook(filename=xlsx_path)
    worksheet = workbook["Data"]

    # Column indices (0-based) of the cells we populate in the template's
    # "Data" sheet, and the total column count of a template row. These are
    # loop-invariant, so they are defined once here rather than per SIP.
    NUM_TEMPLATE_COLUMNS = 173
    INDEX_FILENAME = 6
    INDEX_LEVEL_OF_DESCRIPTION = 8
    INDEX_DATE_START = 23
    INDEX_DATE_END = 24
    INDEX_EXTENT_NUMBER = 34
    INDEX_EXTENT_TYPE = 35
    INDEX_SIZE = 36
    INDEX_SCOPE_CONTENTS = 170

    # TODO: Deduplicate with create_spreadsheet
    # Maybe create separate method that creates dict with info, and handle
    # opening/writing csv or xlsx separately
    for item in sorted(os.listdir(sips)):
        sip_path = os.path.join(sips, item)

        if not os.path.isdir(sip_path):
            continue

        disk_volumes = volumes[item]
        number_volumes = len(disk_volumes)

        date_earliest = ""
        date_latest = ""

        # Gather every DFXML file generated for this SIP
        dfxml_files = []
        subdoc_dir = os.path.join(sip_path, "metadata", "submissionDocumentation")
        if args.bagfiles:
            subdoc_dir = os.path.join(
                sip_path, "data", "metadata", "submissionDocumentation"
            )
        for root, _, files in os.walk(subdoc_dir):
            for file in files:
                if file.startswith("dfxml"):
                    dfxml_files.append(os.path.join(root, file))

        dfxml_files_info = []
        for dfxml_file in dfxml_files:
            dfxml_info = _parse_dfxml(dfxml_file, logger)
            if not dfxml_info:
                logger.warning(
                    "No fileobjects in DFXML file {} - possibly file system fiwalk doesn't recognize".format(
                        dfxml_file
                    )
                )
                continue
            dfxml_files_info.append(dfxml_info)

        file_count = sum(dfxml_info["files"] for dfxml_info in dfxml_files_info)
        total_bytes = sum(dfxml_info["bytes"] for dfxml_info in dfxml_files_info)
        # Deduplicate file system names while preserving first-seen order
        file_systems = list(
            dict.fromkeys(volume["file_system"] for volume in disk_volumes)
        )
        file_systems_str = ", ".join(file_systems)

        for dfxml_info in dfxml_files_info:
            if not date_earliest or dfxml_info["date_earliest"] < date_earliest:
                date_earliest = dfxml_info["date_earliest"]
            if not date_latest or dfxml_info["date_latest"] > date_latest:
                date_latest = dfxml_info["date_latest"]

        # One empty cell per template column; selected cells are filled below.
        row_to_write = [""] * NUM_TEMPLATE_COLUMNS

        # Fields that are always constant
        row_to_write[INDEX_FILENAME] = item
        row_to_write[INDEX_LEVEL_OF_DESCRIPTION] = "File"

        if file_count == 0:
            row_to_write[
                INDEX_SCOPE_CONTENTS
            ] = "Error gathering statistics from SIP directory"

            worksheet.append(row_to_write)

            logger.error("Unable to read DFXML files for {}".format(sip_path))
            continue

        # Get the five most common file formats from Brunnhilde's formats.csv
        file_formats = []
        file_format_csv = os.path.join(
            sip_path,
            "metadata",
            "submissionDocumentation",
            "brunnhilde",
            "csv_reports",
            "formats.csv",
        )
        if args.bagfiles:
            file_format_csv = os.path.join(
                sip_path,
                "data",
                "metadata",
                "submissionDocumentation",
                "brunnhilde",
                "csv_reports",
                "formats.csv",
            )

        try:
            with open(file_format_csv, "r") as f:
                reader = csv.reader(f)
                next(reader)  # skip header row
                for row in itertools.islice(reader, 5):
                    file_formats.append(row[0])
        except (OSError, csv.Error, StopIteration, IndexError):
            # Fix: narrowed from a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit. These are the failures the block
            # can produce: missing/unreadable file, malformed CSV, empty file,
            # short row.
            file_formats.append(
                "ERROR! No Brunnhilde formats.csv file to pull formats from."
            )

        # Brunnhilde leaves unidentified formats blank
        file_formats = [element or "Unidentified" for element in file_formats]
        file_formats_str = ", ".join(file_formats)

        if number_volumes > 1:
            scope_content = "Files exported from {} volumes with file systems: {}. File formats: {}".format(
                number_volumes, file_systems_str, file_formats_str
            )
        else:
            scope_content = (
                "Files exported from {} file system volume. File formats: {}".format(
                    disk_volumes[0]["file_system"], file_formats_str
                )
            )

        # Only the year (first 4 chars of the ISO date string) is written
        row_to_write[INDEX_DATE_START] = str(date_earliest[:4])
        row_to_write[INDEX_DATE_END] = str(date_latest[:4])
        row_to_write[INDEX_EXTENT_NUMBER] = str(file_count)
        row_to_write[INDEX_EXTENT_TYPE] = "digital files"
        row_to_write[INDEX_SIZE] = str(human_readable_size(total_bytes))
        row_to_write[INDEX_SCOPE_CONTENTS] = scope_content

        worksheet.append(row_to_write)

        logger.info("Described %s successfully." % (sip_path))

    workbook.save(filename=xlsx_path)
    workbook.close()

    logger.info("ArchivesSpace description XLSX created.")


def _parse_dfxml(dfxml_path, logger, export_all=False):
"""Parse DFXML and return dict of information for spreadsheet."""
volume_info = {
Expand Down Expand Up @@ -423,6 +593,12 @@ def _make_parser():
help="Export AppleDouble resource forks from HFS-formatted disks",
action="store_true",
)
parser.add_argument(
"-c",
"--csv",
help="Write description CSV (old default) instead of ArchivesSpace XLSX",
action="store_true",
)
parser.add_argument("--quiet", action="store_true", help="Write only errors to log")
parser.add_argument(
"source", help="Source directory containing disk images (and related files)"
Expand Down Expand Up @@ -563,8 +739,17 @@ def main():
shell=True,
)

# write description CSV
create_spreadsheet(args, sips, volumes, logger)
# write description
if args.csv:
try:
create_spreadsheet(args, sips, volumes, logger)
except Exception as err:
logger.error(f"Error creating description csv: {err}")
else:
try:
create_aspace_excel_sheet(args, sips, volumes, logger)
except Exception as err:
logger.error(f"Error creating ArchivesSpace description xlsx: {err}")

# print unprocessed list
if unprocessed:
Expand Down
5 changes: 5 additions & 0 deletions install-bc2-ubuntu18.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ sudo cp LICENSE $dip_dir
sudo cp README.md $dip_dir
sudo cp -r disk_image_toolkit/ $dip_dir

# Install the ASpace XLSX import template alongside the application files.
# Fix: quote $dip_dir expansions so paths containing spaces do not word-split.
if [ ! -d "$dip_dir/aspace_template" ]; then
    sudo mkdir "$dip_dir/aspace_template"
fi
sudo cp aspace_template/aspace_import_template.xlsx "$dip_dir/aspace_template"

if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
fi
Expand Down
8 changes: 8 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

### Install script for CCA Disk Image Processor in Bitcurator 4/Ubuntu 22

sudo python3 -m pip install pyqt5
sudo python3 -m pip install -r requirements/base.txt

if [ ! -d /usr/share/ccatools ]; then
sudo mkdir /usr/share/ccatools
fi
Expand Down Expand Up @@ -30,6 +33,11 @@ sudo cp LICENSE $dip_dir
sudo cp README.md $dip_dir
sudo cp -r disk_image_toolkit/ $dip_dir

# Install the ASpace XLSX import template alongside the application files.
# Fix: quote $dip_dir expansions so paths containing spaces do not word-split.
if [ ! -d "$dip_dir/aspace_template" ]; then
    sudo mkdir "$dip_dir/aspace_template"
fi
sudo cp aspace_template/aspace_import_template.xlsx "$dip_dir/aspace_template"

if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
fi
Expand Down
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def about_dialog(self):
QMessageBox.information(
self,
"About",
"Disk Image Processor v1.1.1\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
"Disk Image Processor v1.2.0\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
)

def browse_analysis_source(self):
Expand Down
1 change: 1 addition & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
bagit>=1.7.0
brunnhilde>=1.9.1
openpyxl>=3.1.2
5 changes: 5 additions & 0 deletions test-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ sudo cp disk_image_toolkit/dfxml/dfxml.py /usr/share/ccatools/diskimageprocessor
sudo cp disk_image_toolkit/dfxml/objects.py /usr/share/ccatools/diskimageprocessor
sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py /usr/share/ccatools/diskimageprocessor

# Stage the ASpace XLSX import template used by Processing mode.
aspace_dir=/usr/share/ccatools/diskimageprocessor/aspace_template
if [ ! -d "$aspace_dir" ]; then
    sudo mkdir "$aspace_dir"
fi
sudo cp aspace_template/aspace_import_template.xlsx "$aspace_dir"

sudo cp disk_image_toolkit/dfxml/dfxml.py .
sudo cp disk_image_toolkit/dfxml/objects.py .
sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py .
Loading
Loading