Skip to content

Commit

Permalink
Merge pull request #60 from lyrasis/v3-0-1
Browse files Browse the repository at this point in the history
v3.0.1
  • Loading branch information
kspurgin authored Mar 11, 2024
2 parents c8530d7 + 7986e0e commit d7faec2
Show file tree
Hide file tree
Showing 17 changed files with 121 additions and 52 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ These changes are merged into the `main` branch, but have not been released. Aft

=== Deprecated/Will break in a future version

== 3.0.1 (2024-03-11)

=== Changes

* Speed up caching CSIDs and refnames by pipelining values directly to Redis instead of using individual collectionspace-refcache calls
* `vt add --csv` option now uses `ingest_dir` if configured and only a file name is given (https://github.com/lyrasis/collectionspace_migration_tools/issues/53[#53])

=== Bugfixes

* Fix for https://github.com/lyrasis/collectionspace_migration_tools/issues/55[#55] - `thor batch mtprep` command failing when all missing terms are vocabulary terms (not authority terms)
* When batch config's `batch_mode` is "date details", autocache vocabularies for the batch (https://github.com/lyrasis/collectionspace_migration_tools/issues/58[#58])

== 3.0.0 (2024-03-01)
=== Breaking

Expand Down
8 changes: 8 additions & 0 deletions lib/collectionspace_migration_tools.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ def csid_cache
exit
end

# Resolves a user-supplied CSV argument to a usable path.
#
# If the client config exposes an `ingest_dir` and `csv` is a bare file
# name, the result is that name joined under `ingest_dir`. Values already
# anchored to home ("~") or root ("/") are returned untouched, as is any
# value when no `ingest_dir` is configured.
#
# @param csv [String] file name or path as given on the command line
# @return [String] path to the CSV to ingest
def get_csv_path(csv)
  config = CMT.config.client
  return csv unless config.respond_to?(:ingest_dir)
  # String#start_with? accepts multiple prefixes; no need for #any?
  return csv if csv.start_with?("~", "/")

  File.join(config.ingest_dir, csv)
end

def domain
@domain ||= client.domain
end
Expand Down
4 changes: 2 additions & 2 deletions lib/collectionspace_migration_tools/batch.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def map(id, autocache, clearcache)
end

def prep_missing_terms(id)
_split = yield(CMT::Batch::MissingTerms::ReportSplitter.call(batch_id: id))
batches = yield(CMT::Batch::MissingTerms::BatchCreator.call(batch_id: id))
_split = yield CMT::Batch::MissingTerms::ReportSplitter.call(batch_id: id)
batches = yield CMT::Batch::MissingTerms::BatchCreator.call(batch_id: id)

Success(batches)
end
Expand Down
10 changes: 1 addition & 9 deletions lib/collectionspace_migration_tools/batch/add.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def initialize

def call(id:, csv:, rectype:, action:)
valid_id = yield(validate_id(id))
csvpath = get_csv_path(csv)
csvpath = CMT.get_csv_path(csv)
valid_csv = yield(validate_csv(csvpath))
valid_rectype = yield(CMT::RecordTypes.valid_mappable?(rectype))
valid_action = yield(validate_action(action))
Expand All @@ -50,14 +50,6 @@ def call(id:, csv:, rectype:, action:)

attr_reader :path, :headers, :ids

# NOTE(review): duplicate of the module-level CMT.get_csv_path helper in
# lib/collectionspace_migration_tools.rb — prefer calling that (as `call`
# does via CMT.get_csv_path) so path resolution lives in one place.
# Prepends the configured ingest_dir to a bare file name; passes through
# home-anchored ("~") or absolute ("/") paths, and any value when the
# client config has no ingest_dir.
def get_csv_path(csv)
  config = CMT.config.client
  return csv unless config.respond_to?(:ingest_dir)
  return csv if ["~", "/"].any? { |char| csv.start_with?(char) }

  File.join(config.ingest_dir, csv)
end

# Whitelist of batch actions accepted for the `action` argument.
def allowed_actions
  ["create", "update", "delete"]
end
Expand Down
4 changes: 4 additions & 0 deletions lib/collectionspace_migration_tools/batch/autocache_runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def call
headers: first_row.headers,
mapper: mapper
)
batch_config = yield CMT::Parse::BatchConfig.call
if batch_config.dig("batch_mode") == "date details"
rn_deps << "vocabularies"
end

plan = yield CMT::Batch::CachingPlanner.call(
refname: rn_deps,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ module CollectionspaceMigrationTools
module Batch
module MissingTerms
# Splits missing_terms.csv into separate CSVs for each type/subtype.
# @todo optimize this for huge batches so that it reads/writes in chunks/parallel rather than holding
# all terms in memory. However, for initial go, we will assume there are not prohibitively large
# @TODO optimize this for huge batches so that it reads/writes in
# chunks/parallel rather than holding all terms in memory. However, for
# initial go, we will assume there are not prohibitively large
# missing term files to process
class ReportSplitter
include Dry::Monads[:result]
Expand All @@ -30,18 +31,20 @@ def initialize(
end

def call
batch = yield(CMT::Batch.find(batch_id))
term_ct = yield(batch.get("missing_terms"))
batch = yield CMT::Batch.find(batch_id)
term_ct = yield batch.get("missing_terms")
return Success("No missing terms to split") if term_ct == 0

batches_dir = CMT.config.client.batch_dir
batch_dir = yield(batch.get("dir"))
source = "#{batches_dir}/#{batch_dir}/missing_terms.csv"
prephash = yield(prepare_by_authority(source))
written = yield(write(prephash))
paths = yield(check_write(written))
vocabs_paths = yield(rewrite_vocab_terms(source))
_vocabs_final = yield(swap_vocab_file(vocabs_paths))
batch_dir = yield batch.get("dir")
source = File.join(batches_dir, batch_dir, "missing_terms.csv")

prephash = yield prepare_by_authority(source)
written = yield write(prephash)
paths = yield check_write(written)

vocab_dir = CMT.config.client.ingest_dir || batch_dir
_vocabs_paths = yield rewrite_vocab_terms(source, vocab_dir)
Success(paths)
end

Expand Down Expand Up @@ -74,19 +77,17 @@ def prepare_by_authority(source)
Success(by_auth)
end

def rewrite_vocab_terms(source)
target = "#{source}.tmp"
headers = CMT::Csv::BatchTermReporter.headers.first(4)
def rewrite_vocab_terms(source, vocab_dir)
vocab_terms = CSV.readlines(source)
.select { |arr| arr.first == "vocabularies" }
return Success("No vocab terms") if vocab_terms.empty?

target = File.join(vocab_dir, "#{batch_id}_vocabulary_terms.csv")
headers = %w[vocab term]

CSV.open(target, "wb") do |csv|
csv << headers
SmarterCSV.process(source) do |rowarr|
row = rowarr[0]
vocab = row[:vocabulary]
next unless vocab.start_with?("vocabularies-")

csv << row.values
end
vocab_terms.each { |arr| csv << [arr[1], arr[3]] }
end
rescue => err
msg = "#{err.message} IN #{err.backtrace[0]}"
Expand All @@ -96,16 +97,6 @@ def rewrite_vocab_terms(source)
Success([target, source])
end

# Moves the temporary vocab-terms file over the original source file.
#
# @param vocabs_paths [Array(String, String)] [tmp_path, final_path]
# @return [Dry::Monads::Success] on successful move
# @return [Dry::Monads::Failure<CMT::Failure>] wrapping any raised error
def swap_vocab_file(vocabs_paths)
  tmp_path, final_path = vocabs_paths
  FileUtils.mv(tmp_path, final_path)
rescue => err
  Failure(
    CMT::Failure.new(
      context: "#{self.class.name}.#{__callee__}",
      message: "#{err.message} IN #{err.backtrace[0]}"
    )
  )
else
  Success()
end

def write(hash)
result = hash.map do |vocab, terms|
write_file(vocab, terms)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ def command
# Positional args for the per-row cache write: type, subtype, term, then
# the cached value (refname or csid, per cache_type).
def signature(row)
  fields = ["type", "subtype", "term"].map { |field| row[field] }
  fields << row[cache_type.to_s]
end

# [redis_key, value] pair for a pipelined SET of this row's cached value.
# Reuses the refcache's private :term_key builder so keys match lookups.
def key_val(row)
  key = cache.send(:term_key, row["type"], row["subtype"], row["term"])
  [key, row[cache_type.to_s]]
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ def command
# Positional args for the per-row cache write: id plus the cached value
# (refname or csid, per cache_type).
def signature(row)
  value = row[cache_type.to_s]
  [row["id"], value]
end

# [redis_key, value] pair for a pipelined SET; key built via the
# refcache's private :object_key so it matches keys used on lookup.
def key_val(row)
  key = cache.send(:object_key, row["id"])
  [key, row[cache_type.to_s]]
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,19 @@ def command
end

def signature(row)
type = CMT::RecordTypes.mappable_type_to_service_path_mapping[row["type"]]
type = CMT::RecordTypes.mappable_type_to_service_path_mapping[
row["type"]
]
[type, row["id"], row[cache_type.to_s]]
end

# [redis_key, value] pair for a pipelined SET. The mappable rectype is
# translated to its service path before building the key, mirroring
# #signature above.
def key_val(row)
  service_type =
    CMT::RecordTypes.mappable_type_to_service_path_mapping[row["type"]]
  [cache.send(:procedure_key, service_type, row["id"]), row[cache_type.to_s]]
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ def signature(row)
row[cache_type.to_s]]
end

# [redis_key, value] pair for a pipelined SET of a relation row, keyed on
# relation type plus subject/object CSIDs via the refcache's private
# :relation_key builder.
def key_val(row)
  key = cache.send(
    :relation_key, reltype(row), row["subjectcsid"], row["objectcsid"]
  )
  [key, row[cache_type.to_s]]
end

private

def reltype(row)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ def command
# Positional args for the per-row cache write: vocab, term, then the
# cached value (refname or csid, per cache_type).
def signature(row)
  ["vocab", "term"].map { |field| row[field] }.push(row[cache_type.to_s])
end

# [redis_key, value] pair for a pipelined SET; key built with the
# refcache's private :vocab_term_key so it matches keys used on lookup.
def key_val(row)
  key = cache.send(:vocab_term_key, row["vocab"], row["term"])
  [key, row[cache_type.to_s]]
end
end
end
end
Expand Down
8 changes: 6 additions & 2 deletions lib/collectionspace_migration_tools/cache/populator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def call(cache_type:, rec_type:, data:)
def initialize(cache_type:, rec_type:)
@cache_type = cache_type
@cache = CMT.send("#{cache_type}_cache".to_sym)
@redis = cache.instance_variable_get(:@cache)
.instance_variable_get(:@c)
@cache_name = cache_type.upcase
@rec_type = rec_type
extend record_type_mixin
Expand All @@ -37,7 +39,7 @@ def call(data)

private

attr_reader :cache_type, :cache, :cache_name, :rec_type
attr_reader :cache_type, :cache, :redis, :cache_name, :rec_type

def before_report(data)
puts "Populating #{cache_name} cache (current size: #{cache.size}) "\
Expand All @@ -50,7 +52,9 @@ def after_report
end

def do_population(data)
data.each { |row| cache.send(command, *signature(row)) }
redis.pipelined do |pipeline|
data.each { |row| pipeline.set(*key_val(row)) }
end
rescue => err
Failure(
CMT::Failure.new(context: "#{name}.#{__callee__}",
Expand Down
12 changes: 10 additions & 2 deletions lib/collectionspace_migration_tools/cli_helpers/pop.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,24 @@ def relations
end

def query_and_populate(rectypes, cache_type = nil)
starttime = Time.now
rectypes.each do |rectype|
meth = cache_type.nil? ? :populate_both_caches : "populate_#{cache_type}_cache".to_sym
meth = if cache_type.nil?
:populate_both_caches
else
"populate_#{cache_type}_cache".to_sym
end

rectype.send(meth).either(
->(success) { puts "Done" },
->(failure) {
puts "QUERY/POPULATE FAILED FOR #{rectype.to_s.upcase}\n#{failure}"
puts "QUERY/POPULATE FAILED FOR #{rectype.to_s.upcase}\n"\
"#{failure}"
}
)
end
duration = Time.now - starttime
puts "Elapsed caching time: #{duration}"
end

def db_disconnect
Expand Down
2 changes: 1 addition & 1 deletion lib/collectionspace_migration_tools/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module CollectionspaceMigrationTools
VERSION = "3.0.0"
VERSION = "3.0.1"
end
2 changes: 1 addition & 1 deletion lib/collectionspace_migration_tools/vocabulary_terms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ module VocabularyTerms

def add(csv_path)
processor = yield CMT::VocabularyTerms::TermsProcessorPreparer.call(
csv_path: csv_path
csv_path: CMT.get_csv_path(csv_path)
)
_result = yield processor.call

Expand Down
12 changes: 10 additions & 2 deletions lib/tasks/batch.thor
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,19 @@ class Batch < Thor
end

desc "mtprep BATCHID",
"Splits missing term report into term source specific CSVs and creates batches to add terms"
"Splits missing term report into term source specific CSVs and creates "\
"batches to add terms"
def mtprep(id)
CMT::Batch.prep_missing_terms(id).either(
->(success) {
puts "Created batches: #{success.join(", ")}"
if success == "No missing terms batches to create"
puts "No missing authority terms. Check ingest or batch directory "\
"for vocabulary terms to load"
else
puts "Created batches: #{success.join(", ")}"
puts "Check ingest or batch directory for missing vocabulary terms "\
"to load"
end
exit(0)
},
->(failure) {
Expand Down
5 changes: 4 additions & 1 deletion lib/tasks/vocabulary_terms.thor
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ class VocabularyTerms < Thor
namespace :vt

desc "add", "add new vocabulary terms from given CSV"
option :csv, required: true, type: :string
option :csv, required: true, type: :string,
desc: "File containing terms to load. If you have an `ingest_dir` "\
"configured, you may just give the filename. Otherwise, give the full "\
"path to the file."
def add
CMT::VocabularyTerms.add(options[:csv]).either(
->(success) { exit(0) },
Expand Down

0 comments on commit d7faec2

Please sign in to comment.