diff --git a/.rubocop.yml b/.rubocop.yml index d16abbe..bba2c39 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -24,3 +24,6 @@ Metrics/MethodLength: Metrics/AbcSize: Exclude: - "app/services/guide_card_loading_service.rb" + +Style/HashSyntax: + EnforcedShorthandSyntax: never diff --git a/Gemfile b/Gemfile index d450d59..d368327 100644 --- a/Gemfile +++ b/Gemfile @@ -8,6 +8,7 @@ ruby '3.2.0' # Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main" gem 'rails', '~> 7.0.4' +gem 'async' gem 'bcrypt_pbkdf' gem 'ed25519' diff --git a/Gemfile.lock b/Gemfile.lock index c2182d2..ff991f3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -71,6 +71,11 @@ GEM airbrussh (1.4.1) sshkit (>= 1.6.1, != 1.7.0) ast (2.4.2) + async (2.6.4) + console (~> 1.10) + fiber-annotation + io-event (~> 1.1) + timers (~> 4.1) bcrypt_pbkdf (1.1.0) bindex (0.8.1) bootsnap (1.16.0) @@ -103,6 +108,9 @@ GEM xpath (~> 3.2) coderay (1.1.3) concurrent-ruby (1.2.2) + console (1.23.2) + fiber-annotation + fiber-local coveralls_reborn (0.27.0) simplecov (~> 0.22.0) term-ansicolor (~> 1.7) @@ -118,6 +126,8 @@ GEM dry-cli (1.0.0) ed25519 (1.3.0) erubi (1.12.0) + fiber-annotation (0.2.0) + fiber-local (1.0.0) globalid (1.1.0) activesupport (>= 5.0) i18n (1.12.0) @@ -126,6 +136,7 @@ GEM actionpack (>= 6.0.0) railties (>= 6.0.0) io-console (0.6.0) + io-event (1.3.2) irb (1.6.3) reline (>= 0.3.0) jbuilder (2.11.5) @@ -295,6 +306,7 @@ GEM tins (~> 1.0) thor (1.2.1) timeout (0.3.2) + timers (4.3.5) tins (1.32.1) sync turbo-rails (1.4.0) @@ -330,6 +342,7 @@ PLATFORMS x86_64-linux DEPENDENCIES + async bcrypt_pbkdf bootsnap capistrano (~> 3.10) diff --git a/README.md b/README.md index 2fc7455..9ec1e40 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ To list all import services for the application: `rake -T | grep import` To import the GuideCard records (takes about 3 minutes): `rake import:import_guide_cards` To import the SubGuideCard records (takes about 2 minutes): `rake import:import_sub_guide_cards` -The CardImage records are the images that are included in the GuideCard and SubGuideCard records. There are 5,786,727 images. These take about 12 days to import. +The CardImage records are the images that are included in the GuideCard and SubGuideCard records. There are 5,786,727 images. These are estimated to take about 1 day to import. To import the CardImage records: `rake import:import_card_images` diff --git a/app/services/card_image_loading_service.rb b/app/services/card_image_loading_service.rb index 46a2988..d751acd 100644 --- a/app/services/card_image_loading_service.rb +++ b/app/services/card_image_loading_service.rb @@ -1,74 +1,80 @@ # frozen_string_literal: true require 'ruby-progressbar' +require 'ruby-progressbar/outputs/null' +require 'async/semaphore' +require 'async/barrier' # Class for card image loading service class CardImageLoadingService - attr_reader :progressbar + attr_reader :logger, :suppress_progress - def initialize(progressbar: nil) - @progressbar = progressbar || ProgressBar.create(format: "\e[1;35m%t: |%B|\e[0m") + def initialize(logger: nil, suppress_progress: false) + @logger = logger || Logger.new($stdout) + @suppress_progress = suppress_progress end def import - @progressbar = ProgressBar.create(format: "\e[1;35m%t: |%B|\e[0m") - import_guide_card_images - @progressbar = ProgressBar.create(format: "\e[1;35m%t: |%B|\e[0m") - import_sub_guide_images - end - - # For each SubGuideCard, take its path and query s3 to get all of the image names - # for that path. For each image file, create a CardImage object with the path and - # image name. + barrier = Async::Barrier.new + Sync do + semaphore = Async::Semaphore.new(22, parent: barrier) - def import_sub_guide_images - @progressbar.total = SubGuideCard.all.count - SubGuideCard.all.each_with_index do |sgc, index| - progress_bar_random_color if (index % 100).zero? - @progressbar.increment - image_array(sgc.path).each do |file_name| - create_card_image(sgc, file_name) - end + (1..22).map do |disk| + semaphore.async do + import_disk(disk) + end + end.map(&:wait) + ensure + barrier.stop end end - def import_guide_card_images - @progressbar.total = GuideCard.all.count - GuideCard.all.each_with_index do |gc, index| - progress_bar_random_color if (index % 100).zero? - @progressbar.increment - image_array(gc.path).each do |file_name| - create_card_image(gc, file_name) - end + def import_disk(disk) + logger.info("Fetching disk #{disk} file list") + filenames = disk_array(disk) + progress_bar.total += filenames.count + Sync do + semaphore = Async::Semaphore.new(10_000) + filenames.map do |file_name| + semaphore.async do + progress_bar.increment + find_or_create_card_image(file_name) + end + end.map(&:wait) end end + private + # returns something like # ["imagecat-disk9-0091-A3037-1358.0110.tif", "imagecat-disk9-0091-A3037-1358.0111.tif"] - def image_array(path) - s3_image_list(path).split("\n").map(&:split).map(&:last) + def disk_array(disk) + s3_disk_list(disk).split("\n").map(&:split).map(&:last) end # returns something like # "2023-07-19 14:39:38 3422 imagecat-disk9-0091-A3037-1358.0110.tif\n2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3037-1358.0111.tif\n" - def s3_image_list(path) - s3_query = "aws s3 ls s3://puliiif-production/imagecat-disk#{path.tr('/', '-')}" - `#{s3_query}` + def s3_disk_list(disk) + `aws s3 ls s3://puliiif-production/imagecat-disk#{disk}-` end - private - - def create_card_image(sgc, file_name) - ci = CardImage.find_by(path: sgc.path, image_name: file_name) + def find_or_create_card_image(file_name) + path = file_name.gsub('imagecat-disk', '').split('-')[0..-2].join('/') + ci = CardImage.find_by(path: path, image_name: file_name) return if ci - ci = CardImage.new - ci.path = sgc.path - ci.image_name = file_name - ci.save + CardImage.create(path: path, image_name: file_name) + end + + def progress_bar + @progress_bar ||= ProgressBar.create(format: '%a %e %P% Loading: %c from %C', output: progress_output, total: 0, title: 'Image import') + end + + def progress_bar_old(total) + ProgressBar.create(format: '%a %e %P% Loading: %c from %C', total: total, output: progress_output) end - def progress_bar_random_color - @progressbar.format = "%t: |\e[#{rand(91..97)}m%B\e[0m|" + def progress_output + ProgressBar::Outputs::Null if suppress_progress end end diff --git a/spec/services/card_image_loading_service_spec.rb b/spec/services/card_image_loading_service_spec.rb index c0aa10c..dbbba27 100644 --- a/spec/services/card_image_loading_service_spec.rb +++ b/spec/services/card_image_loading_service_spec.rb @@ -1,75 +1,62 @@ # frozen_string_literal: true require 'rails_helper' -require 'ruby-progressbar/outputs/null' describe CardImageLoadingService do - let(:cils) { described_class.new(progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null)) } - let(:gcls) do - GuideCardLoadingService.new(csv_location: Rails.root.join('spec', 'fixtures', 'guide_card_fixture.csv'), - progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null)) + let(:card_image_loader) do + described_class.new( + logger: Logger.new(nil), + suppress_progress: true + ) + end + let(:guide_card_loader) do + GuideCardLoadingService.new( + csv_location: Rails.root.join('spec', 'fixtures', 'guide_card_fixture.csv'), + progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null) + ) + end + let(:sub_guide_card_loader) do + SubGuideLoadingService.new( + csv_location: Rails.root.join('spec', 'fixtures', 'subguide_card_fixture.csv'), + progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null) + ) + end + + let(:s3_response_disk9) do + <<~HERE + 2023-07-19 14:39:38 3422 imagecat-disk9-0091-A3037-1358.0110.tif + 2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3037-1358.0111.tif + 2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3038-0000.0001.tif + 2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3038-0000.0002.tif + 2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3038-0000.0003.tif + 2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3067-0000.0048.tif + HERE + end + let(:s3_response_disk14) do + <<~HERE + 2023-07-19 14:39:38 3422 imagecat-disk14-0001-A1007-1358.0110.tif + 2023-07-19 14:39:38 7010 imagecat-disk14-0001-A1007-1358.0111.tif + 2023-07-19 14:39:38 7010 imagecat-disk14-0002-A1008-0000.0001.tif + HERE end - let(:sgls) do - SubGuideLoadingService.new(csv_location: Rails.root.join('spec', 'fixtures', 'subguide_card_fixture.csv'), - progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null)) - end - let(:s3_response) do - "2023-07-19 14:39:38 3422 imagecat-disk9-0091-A3037-1358.0110.tif\n2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3037-1358.0111.tif\n" - end - before do - allow(cils).to receive(:s3_image_list).with(anything).and_return(s3_response) - # Any card with a path of "sub" will not have any images - allow(cils).to receive(:s3_image_list).with('sub').and_return('') - end - - it 'can instantiate' do - expect(cils).to be_instance_of described_class - end - - it 'imports all SubGuideCard images' do - sgls.import - expect(CardImage.count).to eq 0 - cils.import_sub_guide_images - expect(CardImage.count).to eq 8 - images = CardImage.where(path: '9/0091/A3037') - expect(images.map(&:image_name)).to contain_exactly('imagecat-disk9-0091-A3037-1358.0110.tif', 'imagecat-disk9-0091-A3037-1358.0111.tif') - end - - it 'imports all GuideCard images' do - expect(GuideCard.count).to eq 0 - gcls.import - expect(GuideCard.count).to eq 12 - expect(CardImage.count).to eq 0 - cils.import_guide_card_images - expect(CardImage.count).to eq 22 - images = CardImage.where(path: '14/0001/A1003') - expect(images.map(&:image_name)).to contain_exactly('imagecat-disk9-0091-A3037-1358.0110.tif', 'imagecat-disk9-0091-A3037-1358.0111.tif') - end - - it 'gets a list of images from s3' do - image_list = cils.s3_image_list('9/0091/A3037') - expect(image_list.split("\n").count).to eq 2 - end - - it 'displays ruby-progress bar during import' do - expect(cils.progressbar.to_h['percentage']).to eq 0 - cils.import_sub_guide_images - expect(cils.progressbar.to_h['percentage']).to eq 100 - end - - it 'imports both GuideCard and SubGuideCard images' do - expect(GuideCard.count).to eq 0 - expect(SubGuideCard.count).to eq 0 - gcls.import - expect(GuideCard.count).to eq 12 - sgls.import - expect(SubGuideCard.count).to eq 7 - expect(CardImage.count).to eq 0 - cils.import - expect(CardImage.count).to eq 30 + allow(card_image_loader).to receive(:s3_disk_list).with(anything).and_return('') + allow(card_image_loader).to receive(:s3_disk_list).with(9).and_return(s3_response_disk9) + allow(card_image_loader).to receive(:s3_disk_list).with(14).and_return(s3_response_disk14) + end + + it 'creates CardImage database entries based on the s3 listing' do + card_image_loader.import + expect(CardImage.find_by(image_name: 'imagecat-disk14-0002-A1008-0000.0001.tif').path).to eq '14/0002/A1008' + expect(CardImage.count).to eq 9 + guide_card_loader.import + sub_guide_card_loader.import + gc = GuideCard.find_by(path: '14/0001/A1007') + expect(CardImage.where(path: gc.path).count).to eq 2 + sgc = SubGuideCard.find_by(path: '9/0091/A3038') + expect(CardImage.where(path: sgc.path).count).to eq 3 # If you reimport, it doesn't create duplicate rows - cils.import - expect(CardImage.count).to eq 30 + card_image_loader.import + expect(CardImage.count).to eq 9 end end