From 32cc330a3861422d5d12af8378a6a86bde0b1621 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Thu, 2 Jan 2020 11:07:29 -0800 Subject: [PATCH 01/12] Adding migration and model changes in order to track downloads (and current download state) --- .../app/models/stash_engine/download_history.rb | 7 +++++++ .../app/models/stash_engine/file_upload.rb | 2 ++ .../app/models/stash_engine/resource.rb | 1 + ...2121_create_stash_engine_download_histories.rb | 15 +++++++++++++++ 4 files changed, 25 insertions(+) create mode 100644 stash/stash_engine/app/models/stash_engine/download_history.rb create mode 100644 stash/stash_engine/db/migrate/20200102182121_create_stash_engine_download_histories.rb diff --git a/stash/stash_engine/app/models/stash_engine/download_history.rb b/stash/stash_engine/app/models/stash_engine/download_history.rb new file mode 100644 index 0000000000..a5d4401544 --- /dev/null +++ b/stash/stash_engine/app/models/stash_engine/download_history.rb @@ -0,0 +1,7 @@ +module StashEngine + class DownloadHistory < ActiveRecord::Base + belongs_to :resource, class_name: 'StashEngine::Resource' + belongs_to :file_upload, class_name: 'StashEngine::FileUpload' + enum state: %w[downloading finished].map { |i| [i.to_sym, i] }.to_h + end +end diff --git a/stash/stash_engine/app/models/stash_engine/file_upload.rb b/stash/stash_engine/app/models/stash_engine/file_upload.rb index f4445afbe5..35f26bc593 100644 --- a/stash/stash_engine/app/models/stash_engine/file_upload.rb +++ b/stash/stash_engine/app/models/stash_engine/file_upload.rb @@ -3,6 +3,8 @@ module StashEngine class FileUpload < ActiveRecord::Base belongs_to :resource, class_name: 'StashEngine::Resource' + has_many :download_histories, class_name: 'StashEngine::DownloadHistory', dependent: :destroy +' include StashEngine::Concerns::ResourceUpdated # mount_uploader :uploader, FileUploader # it seems like maybe I don't need this since I'm doing so much manually diff --git a/stash/stash_engine/app/models/stash_engine/resource.rb b/stash/stash_engine/app/models/stash_engine/resource.rb index 33a10f3f24..35a375d66d 100644 --- a/stash/stash_engine/app/models/stash_engine/resource.rb +++ b/stash/stash_engine/app/models/stash_engine/resource.rb @@ -24,6 +24,7 @@ class Resource < ActiveRecord::Base # rubocop:disable Metrics/ClassLength has_many :edit_histories, class_name: 'StashEngine::EditHistory', dependent: :destroy has_many :curation_activities, -> { order(id: :asc) }, class_name: 'StashEngine::CurationActivity', dependent: :destroy has_many :repo_queue_states, class_name: 'StashEngine::RepoQueueState', dependent: :destroy + has_many :download_histories, class_name: 'StashEngine::DownloadHistory', dependent: :destroy accepts_nested_attributes_for :curation_activities diff --git a/stash/stash_engine/db/migrate/20200102182121_create_stash_engine_download_histories.rb b/stash/stash_engine/db/migrate/20200102182121_create_stash_engine_download_histories.rb new file mode 100644 index 0000000000..8572c6a631 --- /dev/null +++ b/stash/stash_engine/db/migrate/20200102182121_create_stash_engine_download_histories.rb @@ -0,0 +1,15 @@ +class CreateStashEngineDownloadHistories < ActiveRecord::Migration + def change + create_table :stash_engine_download_histories do |t| + t.string :ip_address, index: true + t.text :user_agent + t.references :resource, index: true + t.references :file_upload, index: true + t.column :state, "ENUM('downloading', 'finished') DEFAULT 'downloading'" + t.timestamps null: false + + t.index :user_agent, length: { user_agent: 100 } + t.index :state + end + end +end From 4fc5c37619f3eda30c00d5dc5d2ea98bd8213dea Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Thu, 2 Jan 2020 11:14:32 -0800 Subject: [PATCH 02/12] Fixing rubocop complaints --- stash/stash_engine/app/models/stash_engine/file_upload.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/stash/stash_engine/app/models/stash_engine/file_upload.rb b/stash/stash_engine/app/models/stash_engine/file_upload.rb index 35f26bc593..0cffd69897 100644 --- a/stash/stash_engine/app/models/stash_engine/file_upload.rb +++ b/stash/stash_engine/app/models/stash_engine/file_upload.rb @@ -1,10 +1,12 @@ require 'zaru' require 'cgi' + +# rubocop:disable Metrics/ClassLength module StashEngine class FileUpload < ActiveRecord::Base belongs_to :resource, class_name: 'StashEngine::Resource' has_many :download_histories, class_name: 'StashEngine::DownloadHistory', dependent: :destroy -' + include StashEngine::Concerns::ResourceUpdated # mount_uploader :uploader, FileUploader # it seems like maybe I don't need this since I'm doing so much manually @@ -145,3 +147,4 @@ def self.sanitize_file_name(name) end end end +# rubocop:enable Metrics/ClassLength From b48d91b11f23ab88239dbd5f5a785e4fe653d7f0 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Thu, 2 Jan 2020 15:24:19 -0800 Subject: [PATCH 03/12] Finding the right place to log download start and end and setting up info --- .../stash_engine/downloads_controller.rb | 17 +++++++---------- .../app/models/stash_engine/download_history.rb | 11 +++++++++++ stash/stash_engine/lib/stash/download/base.rb | 5 +++++ stash/stash_engine/lib/stash/download/file.rb | 4 +++- .../stash_engine/lib/stash/download/version.rb | 4 ++++ 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb index dc488afac4..9734b29f02 100644 --- a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb +++ b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb @@ -12,13 +12,15 @@ def setup_streaming @version_streamer = Stash::Download::Version.new(controller_context: self) @file_streamer = Stash::Download::File.new(controller_context: self) - # This is pretty hacky, but is a temporary fix since we are getting more bad actors hitting expensive downloads. + # This reads a text file with one line and a regular expression in it and blocks if the user-agent matches the regexp + agent_path = Rails.root.join('uploads', 'blacklist_agents.txt').to_s + if File.exist?(agent_path) && request.user_agent[Regexp.new(File.read(agent_path))] + head(429) + return + end + # This looks for uploads/blacklist.txt and if it's there matches IP addresses that start with things in the file-- # one IP address (or beginning of IP Address) per line. - # - # Can do something better than this when we have more time. This is a stopgap to stop excessive usage just before - # a holiday when we don't have other good ways to block yet without assistance from others. - block_path = Rails.root.join('uploads', 'blacklist.txt').to_s return unless File.exist?(block_path) @@ -120,11 +122,6 @@ def redirect_to_public ) end - def stream_download - CounterLogger.version_download_hit(request: request, resource: @resource) - Stash::Download::Version.stream_response(url: @resource.merritt_producer_download_uri, tenant: @resource.tenant) - end - def flash_download_unavailable flash[:notice] = [ 'This dataset was recently submitted and downloads are not yet available.', diff --git a/stash/stash_engine/app/models/stash_engine/download_history.rb b/stash/stash_engine/app/models/stash_engine/download_history.rb index a5d4401544..fcd9897762 100644 --- a/stash/stash_engine/app/models/stash_engine/download_history.rb +++ b/stash/stash_engine/app/models/stash_engine/download_history.rb @@ -3,5 +3,16 @@ class DownloadHistory < ActiveRecord::Base belongs_to :resource, class_name: 'StashEngine::Resource' belongs_to :file_upload, class_name: 'StashEngine::FileUpload' enum state: %w[downloading finished].map { |i| [i.to_sym, i] }.to_h + + def self.mark_start(ip:, user_agent:, resource_id:, file_id: nil) + create(ip_address: ip, user_agent: user_agent, resource_id: resource_id, file_upload_id: file_id, state: 'downloading') + end + + # this just changes the status so it is marked as finished + def self.mark_end(download_history: nil) + return if download_history.nil? + + download_history.update(state: 'finished') + end end end diff --git a/stash/stash_engine/lib/stash/download/base.rb b/stash/stash_engine/lib/stash/download/base.rb index 55581a8a58..5e9cf0950d 100644 --- a/stash/stash_engine/lib/stash/download/base.rb +++ b/stash/stash_engine/lib/stash/download/base.rb @@ -20,6 +20,7 @@ class Base def initialize(controller_context:) @cc = controller_context + @download_history = nil end # this is a method that should be overridden @@ -86,6 +87,9 @@ def send_stream(user_stream:, merritt_stream:) read_file = ::File.open(write_file, 'r') write_thread = Thread.new do + # tracking downloads needs to happen in the threads + @download_history = StashEngine::DownloadHistory.mark_start(ip: cc.request.remote_ip, user_agent: cc.request.user_agent, + resource_id: @resource_id, file_id: @file_id) # this only modifies the write file with contents of merritt stream save_to_file(merritt_stream: merritt_stream, write_file: write_file) end @@ -107,6 +111,7 @@ def send_stream(user_stream:, merritt_stream:) read_file&.close unless read_file&.closed? write_file&.close unless write_file&.closed? ::File.unlink(write_file&.path) if ::File.exist?(write_file&.path) + StashEngine::DownloadHistory.mark_end(download_history: @download_history) unless @download_history.nil? end # rubocop:enable Metrics/AbcSize, Metrics/MethodLength diff --git a/stash/stash_engine/lib/stash/download/file.rb b/stash/stash_engine/lib/stash/download/file.rb index 63024d998e..1d6f42f775 100644 --- a/stash/stash_engine/lib/stash/download/file.rb +++ b/stash/stash_engine/lib/stash/download/file.rb @@ -12,7 +12,9 @@ class File < Base # this downloads a file as a stream from Merritt express def download(file:) @file = file - # StashEngine::CounterLogger.version_download_hit(request: cc.request, resource: resource) + # needs to be set for tracking downloads + @resource_id = @file&.resource&.id + @file_id = @file.id stream_response(url: file.merritt_express_url, tenant: file.resource.tenant, filename: disposition_filename) end diff --git a/stash/stash_engine/lib/stash/download/version.rb b/stash/stash_engine/lib/stash/download/version.rb index 2d5f99f041..0ea471a838 100644 --- a/stash/stash_engine/lib/stash/download/version.rb +++ b/stash/stash_engine/lib/stash/download/version.rb @@ -33,6 +33,10 @@ def download(resource:) if @async_download yield else + # needs to be set for tracking downloads + @resource_id = resource.id + @file_id = nil + StashEngine::CounterLogger.version_download_hit(request: cc.request, resource: resource) # longer read timeout since Merritt may need to assemble zip file before starting download stream_response(url: resource.merritt_producer_download_uri, tenant: resource.tenant, filename: disposition_filename, read_timeout: 180) From f29fa195fcfaeaabc5c7f644ec36139be3ae30eb Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Thu, 2 Jan 2020 16:15:16 -0800 Subject: [PATCH 04/12] Getting previous tests to pass again. --- .../stash_engine/downloads_controller.rb | 21 ++++++++++--------- stash/stash_engine/lib/stash/download/base.rb | 6 +++--- .../spec/unit/stash/download/base_spec.rb | 4 ++++ .../spec/unit/stash/download/file_spec.rb | 7 +++++++ 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb index 9734b29f02..f78a1f8ecd 100644 --- a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb +++ b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb @@ -5,20 +5,15 @@ # rubocop:disable Metrics/ClassLength module StashEngine class DownloadsController < ApplicationController - before_action :setup_streaming - - # set up the Merritt file & version objects so they have access to the controller context before continuing - def setup_streaming - @version_streamer = Stash::Download::Version.new(controller_context: self) - @file_streamer = Stash::Download::File.new(controller_context: self) + before_action :check_user_agent, :check_ip, :setup_streaming + def check_user_agent # This reads a text file with one line and a regular expression in it and blocks if the user-agent matches the regexp agent_path = Rails.root.join('uploads', 'blacklist_agents.txt').to_s - if File.exist?(agent_path) && request.user_agent[Regexp.new(File.read(agent_path))] - head(429) - return - end + head(429) if File.exist?(agent_path) && request.user_agent[Regexp.new(File.read(agent_path))] + end + def check_ip # This looks for uploads/blacklist.txt and if it's there matches IP addresses that start with things in the file-- # one IP address (or beginning of IP Address) per line. block_path = Rails.root.join('uploads', 'blacklist.txt').to_s @@ -34,6 +29,12 @@ def setup_streaming end end + # set up the Merritt file & version objects so they have access to the controller context before continuing + def setup_streaming + @version_streamer = Stash::Download::Version.new(controller_context: self) + @file_streamer = Stash::Download::File.new(controller_context: self) + end + # for downloading the full version def download_resource @resource = Resource.find(params[:resource_id]) diff --git a/stash/stash_engine/lib/stash/download/base.rb b/stash/stash_engine/lib/stash/download/base.rb index 5e9cf0950d..67d9cc2095 100644 --- a/stash/stash_engine/lib/stash/download/base.rb +++ b/stash/stash_engine/lib/stash/download/base.rb @@ -74,7 +74,7 @@ def send_headers(stream:, header_obj:, filename:) raise ex end - # rubocop:disable Metrics/AbcSize, Metrics/MethodLength + # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity def send_stream(user_stream:, merritt_stream:) # use this file to write contents of the stream FileUtils.mkdir_p(Rails.root.join('uploads')) # ensures this file is created if it doesn't exist, needed mostly for tests @@ -89,7 +89,7 @@ def send_stream(user_stream:, merritt_stream:) write_thread = Thread.new do # tracking downloads needs to happen in the threads @download_history = StashEngine::DownloadHistory.mark_start(ip: cc.request.remote_ip, user_agent: cc.request.user_agent, - resource_id: @resource_id, file_id: @file_id) + resource_id: @resource_id, file_id: @file_id) # this only modifies the write file with contents of merritt stream save_to_file(merritt_stream: merritt_stream, write_file: write_file) end @@ -113,7 +113,7 @@ def send_stream(user_stream:, merritt_stream:) ::File.unlink(write_file&.path) if ::File.exist?(write_file&.path) StashEngine::DownloadHistory.mark_end(download_history: @download_history) unless @download_history.nil? end - # rubocop:enable Metrics/AbcSize, Metrics/MethodLength + # rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity def save_to_file(merritt_stream:, write_file:) chunk_size = 1024 * 512 # 512k diff --git a/stash/stash_engine/spec/unit/stash/download/base_spec.rb b/stash/stash_engine/spec/unit/stash/download/base_spec.rb index 6efe273b13..70a5aa50da 100644 --- a/stash/stash_engine/spec/unit/stash/download/base_spec.rb +++ b/stash/stash_engine/spec/unit/stash/download/base_spec.rb @@ -19,8 +19,12 @@ module Download # need to hack in Rails.root because our test framework setup sucks rails_root = Dir.mktmpdir('rails_root') allow(Rails).to receive(:root).and_return(Pathname.new(rails_root)) + @my_request = 'requesty' + allow(@my_request).to receive(:remote_ip).and_return('127.0.0.1') + allow(@my_request).to receive(:user_agent).and_return('HorseBrowser 1.1') @controller_context = 'blah' + allow(@controller_context).to receive(:request).and_return(@my_request) @logger = ActiveSupport::Logger.new(STDOUT) allow(@controller_context).to receive(:logger).and_return(@logger) end diff --git a/stash/stash_engine/spec/unit/stash/download/file_spec.rb b/stash/stash_engine/spec/unit/stash/download/file_spec.rb index c639617d03..6ac59e0e14 100644 --- a/stash/stash_engine/spec/unit/stash/download/file_spec.rb +++ b/stash/stash_engine/spec/unit/stash/download/file_spec.rb @@ -1,6 +1,8 @@ +require 'db_spec_helper' require 'spec_helper' require 'stash/download/file' require 'ostruct' +require_relative '../../../../../spec_helpers/factory_helper' # a base class for version and file downloads, providing some basic functions module Stash @@ -10,11 +12,16 @@ module Download describe '#download' do before(:each) do + # there is no db connection here so it blows up using real models + @identifier = create(:identifier) + @resource = create(:resource, identifier_id: @identifier.id) + @file = File.new(controller_context: OpenStruct.new(response_body: '', response: OpenStruct.new(headers: {}))) @file_upload = create(:file_upload) allow(@file_upload).to receive(:merritt_express_url).and_return('http://grah.example.com') allow(@file_upload).to receive_message_chain('resource.tenant') { 'hi, not really used' } + allow(@file_upload).to receive_message_chain('resource.id') { 22 } end it 'sets the @file automatically before downloading' do From 41e4012657b284804ffa6f6e7f2f3da16cbb41c2 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Fri, 3 Jan 2020 11:44:53 -0800 Subject: [PATCH 05/12] Adding before filter for concurrent download limit --- .../stash_engine/downloads_controller.rb | 10 +++++++++- .../models/stash_engine/download_history.rb | 3 +++ .../downloads/download_limit.html.erb | 20 +++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 stash/stash_engine/app/views/stash_engine/downloads/download_limit.html.erb diff --git a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb index f78a1f8ecd..2b5b6ca2b5 100644 --- a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb +++ b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb @@ -5,7 +5,10 @@ # rubocop:disable Metrics/ClassLength module StashEngine class DownloadsController < ApplicationController - before_action :check_user_agent, :check_ip, :setup_streaming + + CONCURRENT_DOWNLOAD_LIMIT = 2 + + before_action :check_user_agent, :check_ip, :stop_download_hogs, :setup_streaming def check_user_agent # This reads a text file with one line and a regular expression in it and blocks if the user-agent matches the regexp @@ -29,6 +32,11 @@ def check_ip end end + def stop_download_hogs + dl_count = DownloadHistory.where(ip_address: request&.remote_ip).downloading.count + render 'download_limit', status: 429 if dl_count >= CONCURRENT_DOWNLOAD_LIMIT + end + # set up the Merritt file & version objects so they have access to the controller context before continuing def setup_streaming @version_streamer = Stash::Download::Version.new(controller_context: self) diff --git a/stash/stash_engine/app/models/stash_engine/download_history.rb b/stash/stash_engine/app/models/stash_engine/download_history.rb index fcd9897762..73153b599d 100644 --- a/stash/stash_engine/app/models/stash_engine/download_history.rb +++ b/stash/stash_engine/app/models/stash_engine/download_history.rb @@ -4,6 +4,8 @@ class DownloadHistory < ActiveRecord::Base belongs_to :file_upload, class_name: 'StashEngine::FileUpload' enum state: %w[downloading finished].map { |i| [i.to_sym, i] }.to_h + scope :downloading, -> { where(state: 'downloading') } + def self.mark_start(ip:, user_agent:, resource_id:, file_id: nil) create(ip_address: ip, user_agent: user_agent, resource_id: resource_id, file_upload_id: file_id, state: 'downloading') end @@ -14,5 +16,6 @@ def self.mark_end(download_history: nil) download_history.update(state: 'finished') end + end end diff --git a/stash/stash_engine/app/views/stash_engine/downloads/download_limit.html.erb b/stash/stash_engine/app/views/stash_engine/downloads/download_limit.html.erb new file mode 100644 index 0000000000..2dc521f7a8 --- /dev/null +++ b/stash/stash_engine/app/views/stash_engine/downloads/download_limit.html.erb @@ -0,0 +1,20 @@ +<% @page_title = "Concurrent Download Limits" %> +

Concurrent download limits

+ +

+ In order to maintain a responsive service for all users, we limit concurrent downloads to + <%= StashEngine::DownloadsController::CONCURRENT_DOWNLOAD_LIMIT %> at a time. When an in-progress download + completes, you may start another. +

+ +

+ If you need to download all or most files in a dataset, please use the Download dataset button since it will give you + all the files in a single zip package and will usually also reduce your download time and size. +

+ +

+ If you are getting this message and you're using an automated robot or crawler, please reduce your usage to fewer + concurrent downloads at the same time. You should also be aware of the + robots.txt standard which is a voluntary + standard honored by most well-behaved crawlers. +

From c7dca191c2ef609e11b60d5bc624f162c57c5a40 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Fri, 3 Jan 2020 15:02:49 -0800 Subject: [PATCH 06/12] Adding rake task that can be run daily for cleaning up old download history to comply with privacy policy. --- spec/support/tasks.rb | 41 +++++++++++++++++++ spec/tasks/download_tracking_spec.rb | 33 +++++++++++++++ .../lib/tasks/download_tracking.rake | 16 ++++++++ 3 files changed, 90 insertions(+) create mode 100644 spec/support/tasks.rb create mode 100644 spec/tasks/download_tracking_spec.rb create mode 100644 stash/stash_engine/lib/tasks/download_tracking.rake diff --git a/spec/support/tasks.rb b/spec/support/tasks.rb new file mode 100644 index 0000000000..a74a68bf60 --- /dev/null +++ b/spec/support/tasks.rb @@ -0,0 +1,41 @@ +# see https://raw.githubusercontent.com/eliotsykes/rails-testing-toolbox/master/tasks.rb +# and https://www.eliotsykes.com/test-rails-rake-tasks-with-rspec +# For testing rake tasks +require 'rake' + +# Task names should be used in the top-level describe, with an optional +# "rake "-prefix for better documentation. Both of these will work: +# +# 1) describe 'foo:bar' do ... end +# +# 2) describe 'rake foo:bar' do ... end +# +# Favor including 'rake '-prefix as in the 2nd example above as it produces +# doc output that makes it clear a rake task is under test and how it is +# invoked. +module TaskExampleGroup + extend ActiveSupport::Concern + + included do + let(:task_name) { self.class.top_level_description.sub(/\Arake /, '') } + let(:tasks) { Rake::Task } + subject(:task) { tasks[task_name] } + + after(:each) do + task.all_prerequisite_tasks.each { |prerequisite| tasks[prerequisite].reenable } + task.reenable + end + end +end + +RSpec.configure do |config| + config.define_derived_metadata(file_path: %r{/spec/tasks/}) do |metadata| + metadata[:type] = :task + end + + config.include TaskExampleGroup, type: :task + + config.before(:suite) do + Rails.application.load_tasks + end +end diff --git a/spec/tasks/download_tracking_spec.rb b/spec/tasks/download_tracking_spec.rb new file mode 100644 index 0000000000..55aad0c67a --- /dev/null +++ b/spec/tasks/download_tracking_spec.rb @@ -0,0 +1,33 @@ +require 'rails_helper' +require 'byebug' + +describe 'download_tracking:cleanup', type: :task do + + it 'preloads the Rails environment' do + expect(task.prerequisites).to include 'environment' + end + + it 'logs to stdout' do + expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout + end + + it 'cleans up old entries more than 60 days old' do + StashEngine::DownloadHistory.create(ip_address: '127.0.0.1', created_at: 65.days.ago) + StashEngine::DownloadHistory.create(ip_address: '127.0.0.1') + expect(StashEngine::DownloadHistory.all.count).to eq(2) + expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout + # it removed the old one + expect(StashEngine::DownloadHistory.all.count).to eq(1) + end + + it 'fixes download status for non-updated items more than one day old' do + StashEngine::DownloadHistory.create(ip_address: '127.0.0.1', created_at: 3.days.ago, state: 'downloading') + dlh = StashEngine::DownloadHistory.all + expect(dlh.count).to eq(1) + item = dlh.first + expect(item.state).to eq('downloading') + expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout + item.reload + expect(item.state).to eq('finished') + end +end diff --git a/stash/stash_engine/lib/tasks/download_tracking.rake b/stash/stash_engine/lib/tasks/download_tracking.rake new file mode 100644 index 0000000000..b075a9083c --- /dev/null +++ b/stash/stash_engine/lib/tasks/download_tracking.rake @@ -0,0 +1,16 @@ +namespace :download_tracking do + + # because of our privacy policy we don't keep most logs or log-like info for more than 60 days as I understand it + desc 'cleanup download history' + task cleanup: :environment do + # Reset downloads from table that show longer than 24 hours in progress, since most likely the status is wrong. + # If the server is shut down in a disorderly way, the 'ensure' clause may not run to close out download info. + # It seems to happen quite rarely (also leaves stray download files in our EFS mount in the rare cases). + StashEngine::DownloadHistory.downloading.where('created_at < ?', 1.day.ago).update_all(state: 'finished') + + # clean up information older than 60 days to comply with privacy and no callbacks + StashEngine::DownloadHistory.where('created_at < ?', 60.days.ago).delete_all + puts "#{Time.new.iso8601} Finished DownloadHistory cleanup" + end + +end From bd26a06226e41a9b7974ff00e9b8bf6fecb11921 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Fri, 3 Jan 2020 16:19:29 -0800 Subject: [PATCH 07/12] Tests for DownloadHistory methods. --- .../stash_engine/download_history_spec.rb | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 spec/models/stash_engine/download_history_spec.rb diff --git a/spec/models/stash_engine/download_history_spec.rb b/spec/models/stash_engine/download_history_spec.rb new file mode 100644 index 0000000000..ed1016879f --- /dev/null +++ b/spec/models/stash_engine/download_history_spec.rb @@ -0,0 +1,35 @@ +# Trying to add new models to this area instead of inside 'stash' subdirectories since it's much easier to maintain tests +# inside of here than in the engines. Also the loading of tests isn't hacked together in some unmaintainable and non-standard +# way like they are inside the engine tests. + +require 'rails_helper' + +module StashEngine + RSpec.describe DownloadHistory, type: :model do + + describe :mark_event do + before(:each) do + DownloadHistory.mark_start(ip: '168.10.0.1', user_agent: 'HorseStomping Browser', resource_id: 37, file_id: 88) + end + + it 'adds started downloads to the database' do + expect(DownloadHistory.all.count).to eq(1) + dlh = DownloadHistory.all.first + expect(dlh.ip_address).to eq('168.10.0.1') + expect(dlh.user_agent).to eq('HorseStomping Browser') + expect(dlh.state).to eq('downloading') + end + + it 'modifies finished download to change state and updated_at timestamp' do + expect(DownloadHistory.all.count).to eq(1) + dlh = DownloadHistory.all.first + updated_at_before = dlh.updated_at + sleep 1 # just to be sure timestamps are different seconds + DownloadHistory.mark_end(download_history: dlh) + dlh.reload + expect(dlh.updated_at).not_to eq(updated_at_before) + expect(dlh.state).to eq('finished') + end + end + end +end \ No newline at end of file From 6efa188d7967eea9f678294cfafda26b15c2890b Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Fri, 3 Jan 2020 17:04:18 -0800 Subject: [PATCH 08/12] I think this will terminate downloads from Merritt if a user terminates the download early. --- stash/stash_engine/lib/stash/download/base.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/stash/stash_engine/lib/stash/download/base.rb b/stash/stash_engine/lib/stash/download/base.rb index 67d9cc2095..2bb44dc404 100644 --- a/stash/stash_engine/lib/stash/download/base.rb +++ b/stash/stash_engine/lib/stash/download/base.rb @@ -86,18 +86,20 @@ def send_stream(user_stream:, merritt_stream:) # read past the end of what has been written so far read_file = ::File.open(write_file, 'r') + download_canceled = false + write_thread = Thread.new do # tracking downloads needs to happen in the threads @download_history = StashEngine::DownloadHistory.mark_start(ip: cc.request.remote_ip, user_agent: cc.request.user_agent, resource_id: @resource_id, file_id: @file_id) # this only modifies the write file with contents of merritt stream - save_to_file(merritt_stream: merritt_stream, write_file: write_file) + save_to_file(merritt_stream: merritt_stream, write_file: write_file, download_canceled: download_canceled) end read_thread = Thread.new do # this only modifies the user stream based on the contents of read file. The other other # objects are only read or have state checked. Ensures it doesn't read past the end of content. - stream_from_file(read_file: read_file, write_file: write_file, user_stream: user_stream) + stream_from_file(read_file: read_file, write_file: write_file, user_stream: user_stream, download_canceled: download_canceled) end write_thread.join @@ -115,11 +117,12 @@ def send_stream(user_stream:, merritt_stream:) end # rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity - def save_to_file(merritt_stream:, write_file:) + def save_to_file(merritt_stream:, write_file:, download_canceled:) chunk_size = 1024 * 512 # 512k while (chunk = merritt_stream.readpartial(chunk_size)) write_file.write(chunk) + break if download_canceled end rescue EOFError => ex # I believe Ruby has this error with certain kinds of IO objects such as StringIO in testing, but seems to have written @@ -132,7 +135,7 @@ def save_to_file(merritt_stream:, write_file:) end # rubocop:disable Metrics/CyclomaticComplexity - def stream_from_file(read_file:, write_file:, user_stream:) + def stream_from_file(read_file:, write_file:, user_stream:, download_canceled:) read_chunk_size = 1024 * 16 # 16k until read_file.closed? @@ -151,6 +154,7 @@ def stream_from_file(read_file:, write_file:, user_stream:) ensure read_file.close unless read_file.closed? user_stream.close unless user_stream.closed? + download_canceled = true # set user download canceled (finished) to true in shared state to notify other thread to terminate its download end # rubocop:enable Metrics/CyclomaticComplexity end From be876b740e90b81a6453b646757f081d7f3c49b3 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Fri, 3 Jan 2020 17:13:58 -0800 Subject: [PATCH 09/12] Trying to fix concurrency issue with download being canceled so it doesn't continue reading all of file from Merritt on a download cancel by user --- stash/stash_engine/lib/stash/download/base.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/stash/stash_engine/lib/stash/download/base.rb b/stash/stash_engine/lib/stash/download/base.rb index 2bb44dc404..81e1c70365 100644 --- a/stash/stash_engine/lib/stash/download/base.rb +++ b/stash/stash_engine/lib/stash/download/base.rb @@ -86,20 +86,20 @@ def send_stream(user_stream:, merritt_stream:) # read past the end of what has been written so far read_file = ::File.open(write_file, 'r') - download_canceled = false + @download_canceled = false write_thread = Thread.new do # tracking downloads needs to happen in the threads @download_history = StashEngine::DownloadHistory.mark_start(ip: cc.request.remote_ip, user_agent: cc.request.user_agent, resource_id: @resource_id, file_id: @file_id) # this only modifies the write file with contents of merritt stream - save_to_file(merritt_stream: merritt_stream, write_file: write_file, download_canceled: download_canceled) + save_to_file(merritt_stream: merritt_stream, write_file: write_file) end read_thread = Thread.new do # this only modifies the user stream based on the contents of read file. The other other # objects are only read or have state checked. Ensures it doesn't read past the end of content. - stream_from_file(read_file: read_file, write_file: write_file, user_stream: user_stream, download_canceled: download_canceled) + stream_from_file(read_file: read_file, write_file: write_file, user_stream: user_stream) end write_thread.join @@ -117,12 +117,12 @@ def send_stream(user_stream:, merritt_stream:) end # rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity - def save_to_file(merritt_stream:, write_file:, download_canceled:) + def save_to_file(merritt_stream:, write_file:) chunk_size = 1024 * 512 # 512k while (chunk = merritt_stream.readpartial(chunk_size)) write_file.write(chunk) - break if download_canceled + break if @download_canceled end rescue EOFError => ex # I believe Ruby has this error with certain kinds of IO objects such as StringIO in testing, but seems to have written @@ -135,7 +135,7 @@ def save_to_file(merritt_stream:, write_file:, download_canceled:) end # rubocop:disable Metrics/CyclomaticComplexity - def stream_from_file(read_file:, write_file:, user_stream:, download_canceled:) + def stream_from_file(read_file:, write_file:, user_stream:) read_chunk_size = 1024 * 16 # 16k until read_file.closed? @@ -154,7 +154,7 @@ def stream_from_file(read_file:, write_file:, user_stream:, download_canceled:) ensure read_file.close unless read_file.closed? user_stream.close unless user_stream.closed? - download_canceled = true # set user download canceled (finished) to true in shared state to notify other thread to terminate its download + @download_canceled = true # set user download canceled (finished) to true in shared state to notify other thread to terminate its download end # rubocop:enable Metrics/CyclomaticComplexity end From b607541867966badea9ff143a35407242dff3f36 Mon Sep 17 00:00:00 2001 From: Ryan Scherle Date: Mon, 6 Jan 2020 16:41:36 +0000 Subject: [PATCH 10/12] rubocop --- spec/models/stash_engine/download_history_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/models/stash_engine/download_history_spec.rb b/spec/models/stash_engine/download_history_spec.rb index ed1016879f..8aaa0a9c99 100644 --- a/spec/models/stash_engine/download_history_spec.rb +++ b/spec/models/stash_engine/download_history_spec.rb @@ -32,4 +32,4 @@ module StashEngine end end end -end \ No newline at end of file +end From ee13d64e0e2b48120552396711b30404e2963ef2 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Mon, 6 Jan 2020 11:42:19 -0800 Subject: [PATCH 11/12] Fix some weirdness with the year changing with %G formatting of year vs %Y. It was producing 2020-52 instead of 2019-52 for the last week of last year. --- stash/stash_engine/app/models/stash_engine/counter_stat.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stash/stash_engine/app/models/stash_engine/counter_stat.rb b/stash/stash_engine/app/models/stash_engine/counter_stat.rb index 36bd906119..0b085cd20e 100644 --- a/stash/stash_engine/app/models/stash_engine/counter_stat.rb +++ b/stash/stash_engine/app/models/stash_engine/counter_stat.rb @@ -53,9 +53,9 @@ def update_citation_count! def calendar_week(time) # %W calculates weeks based on starting Monday and not Sunday, %U is Sunday and %V is ???. # This produces year-week string. - return (Time.new - 30.days).strftime('%G-%W') if time.nil? || !time.is_a?(Time) + return (30.days.ago).strftime('%Y-%W') if time.nil? || !time.is_a?(Time) - time.strftime('%G-%W') + time.strftime('%Y-%W') end end From 9e15338f3b8a239b602be4612b7ca87b6f73d904 Mon Sep 17 00:00:00 2001 From: Scott Fisher Date: Mon, 6 Jan 2020 11:54:46 -0800 Subject: [PATCH 12/12] rubocop fix --- stash/stash_engine/app/models/stash_engine/counter_stat.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stash/stash_engine/app/models/stash_engine/counter_stat.rb b/stash/stash_engine/app/models/stash_engine/counter_stat.rb index 0b085cd20e..23c82190f2 100644 --- a/stash/stash_engine/app/models/stash_engine/counter_stat.rb +++ b/stash/stash_engine/app/models/stash_engine/counter_stat.rb @@ -53,7 +53,7 @@ def update_citation_count! def calendar_week(time) # %W calculates weeks based on starting Monday and not Sunday, %U is Sunday and %V is ???. # This produces year-week string. - return (30.days.ago).strftime('%Y-%W') if time.nil? || !time.is_a?(Time) + return 30.days.ago.strftime('%Y-%W') if time.nil? || !time.is_a?(Time) time.strftime('%Y-%W') end