Skip to content

Commit

Permalink
Merge pull request #60 from lyrasis/v3-0-1
Browse files Browse the repository at this point in the history
v3.0.1
  • Loading branch information
kspurgin authored Mar 11, 2024
2 parents c8530d7 + 7986e0e commit d7faec2
Show file tree
Hide file tree
Showing 17 changed files with 121 additions and 52 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ These changes are merged into the `main` branch, but have not been released. Aft

=== Deprecated/Will break in a future version

== 3.0.1 (2024-03-11)

=== Changes

* Speed up caching CSIDs and refnames by pipelining values directly to Redis instead of using individual collectionspace-refcache calls
* `vt add --csv` option now uses `ingest_dir` if configured and only a file name is given (https://github.com/lyrasis/collectionspace_migration_tools/issues/53[#53])

=== Bugfixes

* Fix for https://github.com/lyrasis/collectionspace_migration_tools/issues/55[#55] - `thor batch mtprep` command failing when all missing terms are vocabulary terms (not authority terms)
* When batch config's `batch_mode` is "date details", autocache vocabularies for the batch (https://github.com/lyrasis/collectionspace_migration_tools/issues/58[#58])

== 3.0.0 (2024-03-01)
=== Breaking

Expand Down
8 changes: 8 additions & 0 deletions lib/collectionspace_migration_tools.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ def csid_cache
exit
end

# Resolves a user-supplied CSV argument to a usable path.
#
# If the client config exposes an `ingest_dir` and `csv` is a bare file
# name, the result is that name joined under `ingest_dir`. Values already
# anchored to home ("~") or root ("/") are returned untouched, as is any
# value when no `ingest_dir` is configured.
#
# @param csv [String] file name or path as given on the command line
# @return [String] path to the CSV to ingest
def get_csv_path(csv)
  config = CMT.config.client
  return csv unless config.respond_to?(:ingest_dir)
  # String#start_with? accepts multiple prefixes; no need for #any?
  return csv if csv.start_with?("~", "/")

  File.join(config.ingest_dir, csv)
end

def domain
@domain ||= client.domain
end
Expand Down
4 changes: 2 additions & 2 deletions lib/collectionspace_migration_tools/batch.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def map(id, autocache, clearcache)
end

def prep_missing_terms(id)
_split = yield(CMT::Batch::MissingTerms::ReportSplitter.call(batch_id: id))
batches = yield(CMT::Batch::MissingTerms::BatchCreator.call(batch_id: id))
_split = yield CMT::Batch::MissingTerms::ReportSplitter.call(batch_id: id)
batches = yield CMT::Batch::MissingTerms::BatchCreator.call(batch_id: id)

Success(batches)
end
Expand Down
10 changes: 1 addition & 9 deletions lib/collectionspace_migration_tools/batch/add.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def initialize

def call(id:, csv:, rectype:, action:)
valid_id = yield(validate_id(id))
csvpath = get_csv_path(csv)
csvpath = CMT.get_csv_path(csv)
valid_csv = yield(validate_csv(csvpath))
valid_rectype = yield(CMT::RecordTypes.valid_mappable?(rectype))
valid_action = yield(validate_action(action))
Expand All @@ -50,14 +50,6 @@ def call(id:, csv:, rectype:, action:)

attr_reader :path, :headers, :ids

# NOTE(review): duplicate of the module-level CMT.get_csv_path helper in
# lib/collectionspace_migration_tools.rb — prefer calling that (as `call`
# does via CMT.get_csv_path) so path resolution lives in one place.
# Prepends the configured ingest_dir to a bare file name; passes through
# home-anchored ("~") or absolute ("/") paths, and any value when the
# client config has no ingest_dir.
def get_csv_path(csv)
  config = CMT.config.client
  return csv unless config.respond_to?(:ingest_dir)
  return csv if ["~", "/"].any? { |char| csv.start_with?(char) }

  File.join(config.ingest_dir, csv)
end

# Whitelist of batch actions accepted for the `action` argument.
def allowed_actions
  ["create", "update", "delete"]
end
Expand Down
4 changes: 4 additions & 0 deletions lib/collectionspace_migration_tools/batch/autocache_runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def call
headers: first_row.headers,
mapper: mapper
)
batch_config = yield CMT::Parse::BatchConfig.call
if batch_config.dig("batch_mode") == "date details"
rn_deps << "vocabularies"
end

plan = yield CMT::Batch::CachingPlanner.call(
refname: rn_deps,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ module CollectionspaceMigrationTools
module Batch
module MissingTerms
# Splits missing_terms.csv into separate CSVs for each type/subtype.
# @todo optimize this for huge batches so that it reads/writes in chunks/parallel rather than holding
# all terms in memory. However, for initial go, we will assume there are not prohibitively large
# @TODO optimize this for huge batches so that it reads/writes in
# chunks/parallel rather than holding all terms in memory. However, for
# initial go, we will assume there are not prohibitively large
# missing term files to process
class ReportSplitter
include Dry::Monads[:result]
Expand All @@ -30,18 +31,20 @@ def initialize(
end

def call
batch = yield(CMT::Batch.find(batch_id))
term_ct = yield(batch.get("missing_terms"))
batch = yield CMT::Batch.find(batch_id)
term_ct = yield batch.get("missing_terms")
return Success("No missing terms to split") if term_ct == 0

batches_dir = CMT.config.client.batch_dir
batch_dir = yield(batch.get("dir"))
source = "#{batches_dir}/#{batch_dir}/missing_terms.csv"
prephash = yield(prepare_by_authority(source))
written = yield(write(prephash))
paths = yield(check_write(written))
vocabs_paths = yield(rewrite_vocab_terms(source))
_vocabs_final = yield(swap_vocab_file(vocabs_paths))
batch_dir = yield batch.get("dir")
source = File.join(batches_dir, batch_dir, "missing_terms.csv")

prephash = yield prepare_by_authority(source)
written = yield write(prephash)
paths = yield check_write(written)

vocab_dir = CMT.config.client.ingest_dir || batch_dir
_vocabs_paths = yield rewrite_vocab_terms(source, vocab_dir)
Success(paths)
end

Expand Down Expand Up @@ -74,19 +77,17 @@ def prepare_by_authority(source)
Success(by_auth)
end

def rewrite_vocab_terms(source)
target = "#{source}.tmp"
headers = CMT::Csv::BatchTermReporter.headers.first(4)
def rewrite_vocab_terms(source, vocab_dir)
vocab_terms = CSV.readlines(source)
.select { |arr| arr.first == "vocabularies" }
return Success("No vocab terms") if vocab_terms.empty?

target = File.join(vocab_dir, "#{batch_id}_vocabulary_terms.csv")
headers = %w[vocab term]

CSV.open(target, "wb") do |csv|
csv << headers
SmarterCSV.process(source) do |rowarr|
row = rowarr[0]
vocab = row[:vocabulary]
next unless vocab.start_with?("vocabularies-")

csv << row.values
end
vocab_terms.each { |arr| csv << [arr[1], arr[3]] }
end
rescue => err
msg = "#{err.message} IN #{err.backtrace[0]}"
Expand All @@ -96,16 +97,6 @@ def rewrite_vocab_terms(source)
Success([target, source])
end

# Moves the temporary vocab-terms file over the original source file.
#
# @param vocabs_paths [Array(String, String)] [tmp_path, final_path]
# @return [Dry::Monads::Success] on successful move
# @return [Dry::Monads::Failure<CMT::Failure>] wrapping any raised error
def swap_vocab_file(vocabs_paths)
  tmp_path, final_path = vocabs_paths
  FileUtils.mv(tmp_path, final_path)
rescue => err
  Failure(
    CMT::Failure.new(
      context: "#{self.class.name}.#{__callee__}",
      message: "#{err.message} IN #{err.backtrace[0]}"
    )
  )
else
  Success()
end

def write(hash)
result = hash.map do |vocab, terms|
write_file(vocab, terms)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ def command
# Positional args for the per-row cache write: type, subtype, term, then
# the cached value (refname or csid, per cache_type).
def signature(row)
  fields = ["type", "subtype", "term"].map { |field| row[field] }
  fields << row[cache_type.to_s]
end

# [redis_key, value] pair for a pipelined SET of this row's cached value.
# Reuses the refcache's private :term_key builder so keys match lookups.
def key_val(row)
  key = cache.send(:term_key, row["type"], row["subtype"], row["term"])
  [key, row[cache_type.to_s]]
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ def command
# Positional args for the per-row cache write: id plus the cached value
# (refname or csid, per cache_type).
def signature(row)
  value = row[cache_type.to_s]
  [row["id"], value]
end

# [redis_key, value] pair for a pipelined SET; key built via the
# refcache's private :object_key so it matches keys used on lookup.
def key_val(row)
  key = cache.send(:object_key, row["id"])
  [key, row[cache_type.to_s]]
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,19 @@ def command
end

def signature(row)
type = CMT::RecordTypes.mappable_type_to_service_path_mapping[row["type"]]
type = CMT::RecordTypes.mappable_type_to_service_path_mapping[
row["type"]
]
[type, row["id"], row[cache_type.to_s]]
end

# [redis_key, value] pair for a pipelined SET. The mappable rectype is
# translated to its service path before building the key, mirroring
# #signature above.
def key_val(row)
  service_type =
    CMT::RecordTypes.mappable_type_to_service_path_mapping[row["type"]]
  [cache.send(:procedure_key, service_type, row["id"]), row[cache_type.to_s]]
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ def signature(row)
row[cache_type.to_s]]
end

# [redis_key, value] pair for a pipelined SET of a relation row, keyed on
# relation type plus subject/object CSIDs via the refcache's private
# :relation_key builder.
def key_val(row)
  key = cache.send(
    :relation_key, reltype(row), row["subjectcsid"], row["objectcsid"]
  )
  [key, row[cache_type.to_s]]
end

private

def reltype(row)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ def command
# Positional args for the per-row cache write: vocab, term, then the
# cached value (refname or csid, per cache_type).
def signature(row)
  ["vocab", "term"].map { |field| row[field] }.push(row[cache_type.to_s])
end

# [redis_key, value] pair for a pipelined SET; key built with the
# refcache's private :vocab_term_key so it matches keys used on lookup.
def key_val(row)
  key = cache.send(:vocab_term_key, row["vocab"], row["term"])
  [key, row[cache_type.to_s]]
end
end
end
end
Expand Down
8 changes: 6 additions & 2 deletions lib/collectionspace_migration_tools/cache/populator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def call(cache_type:, rec_type:, data:)
def initialize(cache_type:, rec_type:)
@cache_type = cache_type
@cache = CMT.send("#{cache_type}_cache".to_sym)
@redis = cache.instance_variable_get(:@cache)
.instance_variable_get(:@c)
@cache_name = cache_type.upcase
@rec_type = rec_type
extend record_type_mixin
Expand All @@ -37,7 +39,7 @@ def call(data)

private

attr_reader :cache_type, :cache, :cache_name, :rec_type
attr_reader :cache_type, :cache, :redis, :cache_name, :rec_type

def before_report(data)
puts "Populating #{cache_name} cache (current size: #{cache.size}) "\
Expand All @@ -50,7 +52,9 @@ def after_report
end

def do_population(data)
data.each { |row| cache.send(command, *signature(row)) }
redis.pipelined do |pipeline|
data.each { |row| pipeline.set(*key_val(row)) }
end
rescue => err
Failure(
CMT::Failure.new(context: "#{name}.#{__callee__}",
Expand Down
12 changes: 10 additions & 2 deletions lib/collectionspace_migration_tools/cli_helpers/pop.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,24 @@ def relations
end

def query_and_populate(rectypes, cache_type = nil)
starttime = Time.now
rectypes.each do |rectype|
meth = cache_type.nil? ? :populate_both_caches : "populate_#{cache_type}_cache".to_sym
meth = if cache_type.nil?
:populate_both_caches
else
"populate_#{cache_type}_cache".to_sym
end

rectype.send(meth).either(
->(success) { puts "Done" },
->(failure) {
puts "QUERY/POPULATE FAILED FOR #{rectype.to_s.upcase}\n#{failure}"
puts "QUERY/POPULATE FAILED FOR #{rectype.to_s.upcase}\n"\
"#{failure}"
}
)
end
duration = Time.now - starttime
puts "Elapsed caching time: #{duration}"
end

def db_disconnect
Expand Down
2 changes: 1 addition & 1 deletion lib/collectionspace_migration_tools/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module CollectionspaceMigrationTools
VERSION = "3.0.0"
VERSION = "3.0.1"
end
2 changes: 1 addition & 1 deletion lib/collectionspace_migration_tools/vocabulary_terms.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ module VocabularyTerms

def add(csv_path)
processor = yield CMT::VocabularyTerms::TermsProcessorPreparer.call(
csv_path: csv_path
csv_path: CMT.get_csv_path(csv_path)
)
_result = yield processor.call

Expand Down
12 changes: 10 additions & 2 deletions lib/tasks/batch.thor
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,19 @@ class Batch < Thor
end

desc "mtprep BATCHID",
"Splits missing term report into term source specific CSVs and creates batches to add terms"
"Splits missing term report into term source specific CSVs and creates "\
"batches to add terms"
def mtprep(id)
CMT::Batch.prep_missing_terms(id).either(
->(success) {
puts "Created batches: #{success.join(", ")}"
if success == "No missing terms batches to create"
puts "No missing authority terms. Check ingest or batch directory "\
"for vocabulary terms to load"
else
puts "Created batches: #{success.join(", ")}"
puts "Check ingest or batch directory for missing vocabulary terms "\
"to load"
end
exit(0)
},
->(failure) {
Expand Down
5 changes: 4 additions & 1 deletion lib/tasks/vocabulary_terms.thor
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ class VocabularyTerms < Thor
namespace :vt

desc "add", "add new vocabulary terms from given CSV"
option :csv, required: true, type: :string
option :csv, required: true, type: :string,
desc: "File containing terms to load. If you have an `ingest_dir` "\
"configured, you may just give the filename. Otherwise, give the full "\
"path to the file."
def add
CMT::VocabularyTerms.add(options[:csv]).either(
->(success) { exit(0) },
Expand Down

0 comments on commit d7faec2

Please sign in to comment.