Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster image load #189

Merged
merged 10 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,6 @@ Metrics/MethodLength:
Metrics/AbcSize:
Exclude:
- "app/services/guide_card_loading_service.rb"

Style/HashSyntax:
EnforcedShorthandSyntax: never
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ ruby '3.2.0'
# Bundle edge Rails instead: gem "rails", github: "rails/rails", branch: "main"
gem 'rails', '~> 7.0.4'

gem 'async'
gem 'bcrypt_pbkdf'
gem 'ed25519'

Expand Down
13 changes: 13 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ GEM
airbrussh (1.4.1)
sshkit (>= 1.6.1, != 1.7.0)
ast (2.4.2)
async (2.6.4)
console (~> 1.10)
fiber-annotation
io-event (~> 1.1)
timers (~> 4.1)
bcrypt_pbkdf (1.1.0)
bindex (0.8.1)
bootsnap (1.16.0)
Expand Down Expand Up @@ -103,6 +108,9 @@ GEM
xpath (~> 3.2)
coderay (1.1.3)
concurrent-ruby (1.2.2)
console (1.23.2)
fiber-annotation
fiber-local
coveralls_reborn (0.27.0)
simplecov (~> 0.22.0)
term-ansicolor (~> 1.7)
Expand All @@ -118,6 +126,8 @@ GEM
dry-cli (1.0.0)
ed25519 (1.3.0)
erubi (1.12.0)
fiber-annotation (0.2.0)
fiber-local (1.0.0)
globalid (1.1.0)
activesupport (>= 5.0)
i18n (1.12.0)
Expand All @@ -126,6 +136,7 @@ GEM
actionpack (>= 6.0.0)
railties (>= 6.0.0)
io-console (0.6.0)
io-event (1.3.2)
irb (1.6.3)
reline (>= 0.3.0)
jbuilder (2.11.5)
Expand Down Expand Up @@ -295,6 +306,7 @@ GEM
tins (~> 1.0)
thor (1.2.1)
timeout (0.3.2)
timers (4.3.5)
tins (1.32.1)
sync
turbo-rails (1.4.0)
Expand Down Expand Up @@ -330,6 +342,7 @@ PLATFORMS
x86_64-linux

DEPENDENCIES
async
bcrypt_pbkdf
bootsnap
capistrano (~> 3.10)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ To list all import services for the application: `rake -T | grep import`
To import the GuideCard records (takes about 3 minutes): `rake import:import_guide_cards`
To import the SubGuideCard records (takes about 2 minutes): `rake import:import_sub_guide_cards`

The CardImage records are the images that are included in the GuideCard and SubGuideCard records. There are 5,786,727 images. These take about 12 days to import.
The CardImage records are the images that are included in the GuideCard and SubGuideCard records. There are 5,786,727 images. These are estimated to take about 1 day to import.

To import the CardImage records: `rake import:import_card_images`

Expand Down
91 changes: 48 additions & 43 deletions app/services/card_image_loading_service.rb
Original file line number Diff line number Diff line change
@@ -1,74 +1,79 @@
# frozen_string_literal: true

require 'ruby-progressbar'
require 'ruby-progressbar/outputs/null'
require 'async/semaphore'
require 'async/barrier'

# Class for card image loading service
class CardImageLoadingService
attr_reader :progressbar
attr_reader :logger, :suppress_progress

def initialize(progressbar: nil)
@progressbar = progressbar || ProgressBar.create(format: "\e[1;35m%t: |%B|\e[0m")
def initialize(logger: nil, suppress_progress: false)
@logger = logger || Logger.new($stdout)
@suppress_progress = suppress_progress
end

def import
@progressbar = ProgressBar.create(format: "\e[1;35m%t: |%B|\e[0m")
import_guide_card_images
@progressbar = ProgressBar.create(format: "\e[1;35m%t: |%B|\e[0m")
import_sub_guide_images
end

# For each SubGuideCard, take its path and query s3 to get all of the image names
# for that path. For each image file, create a CardImage object with the path and
# image name.
barrier = Async::Barrier.new
Sync do
semaphore = Async::Semaphore.new(10, parent: barrier)

def import_sub_guide_images
@progressbar.total = SubGuideCard.all.count
SubGuideCard.all.each_with_index do |sgc, index|
progress_bar_random_color if (index % 100).zero?
@progressbar.increment
image_array(sgc.path).each do |file_name|
create_card_image(sgc, file_name)
end
(1..22).map do |disk|
semaphore.async do
import_disk(disk)
end
end.map(&:wait)
ensure
barrier.stop
end
end

def import_guide_card_images
@progressbar.total = GuideCard.all.count
GuideCard.all.each_with_index do |gc, index|
progress_bar_random_color if (index % 100).zero?
@progressbar.increment
image_array(gc.path).each do |file_name|
create_card_image(gc, file_name)
end
def import_disk(disk)
logger.info("Fetching disk #{disk} file list")
filenames = disk_array(disk)
progress_bar.total += filenames.count
Sync do |task|
filenames.map do |file_name|
task.async do
progress_bar.increment
find_or_create_card_image(file_name)
end
end.map(&:wait)
end
end

private

# returns something like
# ["imagecat-disk9-0091-A3037-1358.0110.tif", "imagecat-disk9-0091-A3037-1358.0111.tif"]
def image_array(path)
s3_image_list(path).split("\n").map(&:split).map(&:last)
def disk_array(disk)
s3_disk_list(disk).split("\n").map(&:split).map(&:last)
end

# returns something like
# "2023-07-19 14:39:38 3422 imagecat-disk9-0091-A3037-1358.0110.tif\n2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3037-1358.0111.tif\n"
def s3_image_list(path)
s3_query = "aws s3 ls s3://puliiif-production/imagecat-disk#{path.tr('/', '-')}"
`#{s3_query}`
def s3_disk_list(disk)
`aws s3 ls s3://puliiif-production/imagecat-disk#{disk}-`
end

private

def create_card_image(sgc, file_name)
ci = CardImage.find_by(path: sgc.path, image_name: file_name)
def find_or_create_card_image(file_name)
path = file_name.gsub('imagecat-disk', '').split('-')[0..-2].join('/')
ci = CardImage.find_by(path: path, image_name: file_name)
return if ci

ci = CardImage.new
ci.path = sgc.path
ci.image_name = file_name
ci.save
CardImage.create(path: path, image_name: file_name)
end

def progress_bar
@progress_bar ||= ProgressBar.create(format: '%a %e %P% Loading: %c from %C', output: progress_output, total: 0, title: 'Image import')
end

def progress_bar_old(total)
ProgressBar.create(format: '%a %e %P% Loading: %c from %C', total: total, output: progress_output)
end

def progress_bar_random_color
@progressbar.format = "%t: |\e[#{rand(91..97)}m%B\e[0m|"
def progress_output
ProgressBar::Outputs::Null if suppress_progress
end
end
117 changes: 52 additions & 65 deletions spec/services/card_image_loading_service_spec.rb
Original file line number Diff line number Diff line change
@@ -1,75 +1,62 @@
# frozen_string_literal: true

require 'rails_helper'
require 'ruby-progressbar/outputs/null'

describe CardImageLoadingService do
let(:cils) { described_class.new(progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null)) }
let(:gcls) do
GuideCardLoadingService.new(csv_location: Rails.root.join('spec', 'fixtures', 'guide_card_fixture.csv'),
progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null))
let(:card_image_loader) do
described_class.new(
logger: Logger.new(nil),
suppress_progress: true
)
end
let(:guide_card_loader) do
GuideCardLoadingService.new(
csv_location: Rails.root.join('spec', 'fixtures', 'guide_card_fixture.csv'),
progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null)
)
end
let(:sub_guide_card_loader) do
SubGuideLoadingService.new(
csv_location: Rails.root.join('spec', 'fixtures', 'subguide_card_fixture.csv'),
progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null)
)
end

let(:s3_response_disk9) do
<<~HERE
2023-07-19 14:39:38 3422 imagecat-disk9-0091-A3037-1358.0110.tif
2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3037-1358.0111.tif
2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3038-0000.0001.tif
2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3038-0000.0002.tif
2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3038-0000.0003.tif
2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3067-0000.0048.tif
HERE
end
let(:s3_response_disk14) do
<<~HERE
2023-07-19 14:39:38 3422 imagecat-disk14-0001-A1007-1358.0110.tif
2023-07-19 14:39:38 7010 imagecat-disk14-0001-A1007-1358.0111.tif
2023-07-19 14:39:38 7010 imagecat-disk14-0002-A1008-0000.0001.tif
HERE
end
let(:sgls) do
SubGuideLoadingService.new(csv_location: Rails.root.join('spec', 'fixtures', 'subguide_card_fixture.csv'),
progressbar: ProgressBar.create(output: ProgressBar::Outputs::Null))
end
let(:s3_response) do
"2023-07-19 14:39:38 3422 imagecat-disk9-0091-A3037-1358.0110.tif\n2023-07-19 14:39:38 7010 imagecat-disk9-0091-A3037-1358.0111.tif\n"
end

before do
allow(cils).to receive(:s3_image_list).with(anything).and_return(s3_response)
# Any card with a path of "sub" will not have any images
allow(cils).to receive(:s3_image_list).with('sub').and_return('')
end

it 'can instantiate' do
expect(cils).to be_instance_of described_class
end

it 'imports all SubGuideCard images' do
sgls.import
expect(CardImage.count).to eq 0
cils.import_sub_guide_images
expect(CardImage.count).to eq 8
images = CardImage.where(path: '9/0091/A3037')
expect(images.map(&:image_name)).to contain_exactly('imagecat-disk9-0091-A3037-1358.0110.tif', 'imagecat-disk9-0091-A3037-1358.0111.tif')
end

it 'imports all GuideCard images' do
expect(GuideCard.count).to eq 0
gcls.import
expect(GuideCard.count).to eq 12
expect(CardImage.count).to eq 0
cils.import_guide_card_images
expect(CardImage.count).to eq 22
images = CardImage.where(path: '14/0001/A1003')
expect(images.map(&:image_name)).to contain_exactly('imagecat-disk9-0091-A3037-1358.0110.tif', 'imagecat-disk9-0091-A3037-1358.0111.tif')
end

it 'gets a list of images from s3' do
image_list = cils.s3_image_list('9/0091/A3037')
expect(image_list.split("\n").count).to eq 2
end

it 'displays ruby-progress bar during import' do
expect(cils.progressbar.to_h['percentage']).to eq 0
cils.import_sub_guide_images
expect(cils.progressbar.to_h['percentage']).to eq 100
end

it 'imports both GuideCard and SubGuideCard images' do
expect(GuideCard.count).to eq 0
expect(SubGuideCard.count).to eq 0
gcls.import
expect(GuideCard.count).to eq 12
sgls.import
expect(SubGuideCard.count).to eq 7
expect(CardImage.count).to eq 0
cils.import
expect(CardImage.count).to eq 30
allow(card_image_loader).to receive(:s3_disk_list).with(anything).and_return('')
allow(card_image_loader).to receive(:s3_disk_list).with(9).and_return(s3_response_disk9)
allow(card_image_loader).to receive(:s3_disk_list).with(14).and_return(s3_response_disk14)
end

it 'creates CardImage database entries based on the s3 listing' do
card_image_loader.import
expect(CardImage.find_by(image_name: 'imagecat-disk14-0002-A1008-0000.0001.tif').path).to eq '14/0002/A1008'
expect(CardImage.count).to eq 9
guide_card_loader.import
sub_guide_card_loader.import
gc = GuideCard.find_by(path: '14/0001/A1007')
expect(CardImage.where(path: gc.path).count).to eq 2
sgc = SubGuideCard.find_by(path: '9/0091/A3038')
expect(CardImage.where(path: sgc.path).count).to eq 3
# If you reimport, it doesn't create duplicate rows
cils.import
expect(CardImage.count).to eq 30
card_image_loader.import
expect(CardImage.count).to eq 9
end
end