diff --git a/spec/models/stash_engine/download_history_spec.rb b/spec/models/stash_engine/download_history_spec.rb new file mode 100644 index 0000000000..8aaa0a9c99 --- /dev/null +++ b/spec/models/stash_engine/download_history_spec.rb @@ -0,0 +1,35 @@ +# Trying to add new models to this area instead of inside 'stash' subdirectories since it's much easier to maintain tests +# inside of here than in the engines. Also the loading of tests isn't hacked together in some unmaintainable and non-standard +# way like they are inside the engine tests. + +require 'rails_helper' + +module StashEngine + RSpec.describe DownloadHistory, type: :model do + + describe :mark_event do + before(:each) do + DownloadHistory.mark_start(ip: '168.10.0.1', user_agent: 'HorseStomping Browser', resource_id: 37, file_id: 88) + end + + it 'adds started downloads to the database' do + expect(DownloadHistory.all.count).to eq(1) + dlh = DownloadHistory.all.first + expect(dlh.ip_address).to eq('168.10.0.1') + expect(dlh.user_agent).to eq('HorseStomping Browser') + expect(dlh.state).to eq('downloading') + end + + it 'modifies finished download to change state and updated_at timestamp' do + expect(DownloadHistory.all.count).to eq(1) + dlh = DownloadHistory.all.first + updated_at_before = dlh.updated_at + sleep 1 # just to be sure timestamps are different seconds + DownloadHistory.mark_end(download_history: dlh) + dlh.reload + expect(dlh.updated_at).not_to eq(updated_at_before) + expect(dlh.state).to eq('finished') + end + end + end +end diff --git a/spec/support/tasks.rb b/spec/support/tasks.rb new file mode 100644 index 0000000000..a74a68bf60 --- /dev/null +++ b/spec/support/tasks.rb @@ -0,0 +1,41 @@ +# see https://raw.githubusercontent.com/eliotsykes/rails-testing-toolbox/master/tasks.rb +# and https://www.eliotsykes.com/test-rails-rake-tasks-with-rspec +# For testing rake tasks +require 'rake' + +# Task names should be used in the top-level describe, with an optional +# "rake "-prefix for better documentation. Both of these will work: +# +# 1) describe 'foo:bar' do ... end +# +# 2) describe 'rake foo:bar' do ... end +# +# Favor including 'rake '-prefix as in the 2nd example above as it produces +# doc output that makes it clear a rake task is under test and how it is +# invoked. +module TaskExampleGroup + extend ActiveSupport::Concern + + included do + let(:task_name) { self.class.top_level_description.sub(/\Arake /, '') } + let(:tasks) { Rake::Task } + subject(:task) { tasks[task_name] } + + after(:each) do + task.all_prerequisite_tasks.each { |prerequisite| tasks[prerequisite].reenable } + task.reenable + end + end +end + +RSpec.configure do |config| + config.define_derived_metadata(file_path: %r{/spec/tasks/}) do |metadata| + metadata[:type] = :task + end + + config.include TaskExampleGroup, type: :task + + config.before(:suite) do + Rails.application.load_tasks + end +end diff --git a/spec/tasks/download_tracking_spec.rb b/spec/tasks/download_tracking_spec.rb new file mode 100644 index 0000000000..55aad0c67a --- /dev/null +++ b/spec/tasks/download_tracking_spec.rb @@ -0,0 +1,33 @@ +require 'rails_helper' +require 'byebug' + +describe 'download_tracking:cleanup', type: :task do + + it 'preloads the Rails environment' do + expect(task.prerequisites).to include 'environment' + end + + it 'logs to stdout' do + expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout + end + + it 'cleans up old entries more than 60 days old' do + StashEngine::DownloadHistory.create(ip_address: '127.0.0.1', created_at: 65.days.ago) + StashEngine::DownloadHistory.create(ip_address: '127.0.0.1') + expect(StashEngine::DownloadHistory.all.count).to eq(2) + expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout + # it removed the old one + expect(StashEngine::DownloadHistory.all.count).to eq(1) + end + + it 'fixes download status for non-updated items more than one day old' do + StashEngine::DownloadHistory.create(ip_address: '127.0.0.1', created_at: 3.days.ago, state: 'downloading') + dlh = StashEngine::DownloadHistory.all + expect(dlh.count).to eq(1) + item = dlh.first + expect(item.state).to eq('downloading') + expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout + item.reload + expect(item.state).to eq('finished') + end +end diff --git a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb index dc488afac4..2b5b6ca2b5 100644 --- a/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb +++ b/stash/stash_engine/app/controllers/stash_engine/downloads_controller.rb @@ -5,20 +5,20 @@ # rubocop:disable Metrics/ClassLength module StashEngine class DownloadsController < ApplicationController - before_action :setup_streaming - # set up the Merritt file & version objects so they have access to the controller context before continuing - def setup_streaming - @version_streamer = Stash::Download::Version.new(controller_context: self) - @file_streamer = Stash::Download::File.new(controller_context: self) + CONCURRENT_DOWNLOAD_LIMIT = 2 + + before_action :check_user_agent, :check_ip, :stop_download_hogs, :setup_streaming - # This is pretty hacky, but is a temporary fix since we are getting more bad actors hitting expensive downloads. + def check_user_agent + # This reads a text file with one line and a regular expression in it and blocks if the user-agent matches the regexp + agent_path = Rails.root.join('uploads', 'blacklist_agents.txt').to_s + head(429) if File.exist?(agent_path) && request.user_agent[Regexp.new(File.read(agent_path))] + end + + def check_ip # This looks for uploads/blacklist.txt and if it's there matches IP addresses that start with things in the file-- # one IP address (or beginning of IP Address) per line. - # - # Can do something better than this when we have more time. This is a stopgap to stop excessive usage just before - # a holiday when we don't have other good ways to block yet without assistance from others. - block_path = Rails.root.join('uploads', 'blacklist.txt').to_s return unless File.exist?(block_path) @@ -32,6 +32,17 @@ def setup_streaming end end + def stop_download_hogs + dl_count = DownloadHistory.where(ip_address: request&.remote_ip).downloading.count + render 'download_limit', status: 429 if dl_count >= CONCURRENT_DOWNLOAD_LIMIT + end + + # set up the Merritt file & version objects so they have access to the controller context before continuing + def setup_streaming + @version_streamer = Stash::Download::Version.new(controller_context: self) + @file_streamer = Stash::Download::File.new(controller_context: self) + end + # for downloading the full version def download_resource @resource = Resource.find(params[:resource_id]) @@ -120,11 +131,6 @@ def redirect_to_public ) end - def stream_download - CounterLogger.version_download_hit(request: request, resource: @resource) - Stash::Download::Version.stream_response(url: @resource.merritt_producer_download_uri, tenant: @resource.tenant) - end - def flash_download_unavailable flash[:notice] = [ 'This dataset was recently submitted and downloads are not yet available.', diff --git a/stash/stash_engine/app/models/stash_engine/counter_stat.rb b/stash/stash_engine/app/models/stash_engine/counter_stat.rb index 36bd906119..23c82190f2 100644 --- a/stash/stash_engine/app/models/stash_engine/counter_stat.rb +++ b/stash/stash_engine/app/models/stash_engine/counter_stat.rb @@ -53,9 +53,9 @@ def update_citation_count! def calendar_week(time) # %W calculates weeks based on starting Monday and not Sunday, %U is Sunday and %V is ???. # This produces year-week string. - return (Time.new - 30.days).strftime('%G-%W') if time.nil? || !time.is_a?(Time) + return 30.days.ago.strftime('%Y-%W') if time.nil? || !time.is_a?(Time) - time.strftime('%G-%W') + time.strftime('%Y-%W') end end diff --git a/stash/stash_engine/app/models/stash_engine/download_history.rb b/stash/stash_engine/app/models/stash_engine/download_history.rb new file mode 100644 index 0000000000..73153b599d --- /dev/null +++ b/stash/stash_engine/app/models/stash_engine/download_history.rb @@ -0,0 +1,21 @@ +module StashEngine + class DownloadHistory < ActiveRecord::Base + belongs_to :resource, class_name: 'StashEngine::Resource' + belongs_to :file_upload, class_name: 'StashEngine::FileUpload' + enum state: %w[downloading finished].map { |i| [i.to_sym, i] }.to_h + + scope :downloading, -> { where(state: 'downloading') } + + def self.mark_start(ip:, user_agent:, resource_id:, file_id: nil) + create(ip_address: ip, user_agent: user_agent, resource_id: resource_id, file_upload_id: file_id, state: 'downloading') + end + + # this just changes the status so it is marked as finished + def self.mark_end(download_history: nil) + return if download_history.nil? + + download_history.update(state: 'finished') + end + + end +end diff --git a/stash/stash_engine/app/models/stash_engine/file_upload.rb b/stash/stash_engine/app/models/stash_engine/file_upload.rb index f4445afbe5..0cffd69897 100644 --- a/stash/stash_engine/app/models/stash_engine/file_upload.rb +++ b/stash/stash_engine/app/models/stash_engine/file_upload.rb @@ -1,8 +1,12 @@ require 'zaru' require 'cgi' + +# rubocop:disable Metrics/ClassLength module StashEngine class FileUpload < ActiveRecord::Base belongs_to :resource, class_name: 'StashEngine::Resource' + has_many :download_histories, class_name: 'StashEngine::DownloadHistory', dependent: :destroy + include StashEngine::Concerns::ResourceUpdated # mount_uploader :uploader, FileUploader # it seems like maybe I don't need this since I'm doing so much manually @@ -143,3 +147,4 @@ def self.sanitize_file_name(name) end end end +# rubocop:enable Metrics/ClassLength diff --git a/stash/stash_engine/app/models/stash_engine/resource.rb b/stash/stash_engine/app/models/stash_engine/resource.rb index 33a10f3f24..35a375d66d 100644 --- a/stash/stash_engine/app/models/stash_engine/resource.rb +++ b/stash/stash_engine/app/models/stash_engine/resource.rb @@ -24,6 +24,7 @@ class Resource < ActiveRecord::Base # rubocop:disable Metrics/ClassLength has_many :edit_histories, class_name: 'StashEngine::EditHistory', dependent: :destroy has_many :curation_activities, -> { order(id: :asc) }, class_name: 'StashEngine::CurationActivity', dependent: :destroy has_many :repo_queue_states, class_name: 'StashEngine::RepoQueueState', dependent: :destroy + has_many :download_histories, class_name: 'StashEngine::DownloadHistory', dependent: :destroy accepts_nested_attributes_for :curation_activities diff --git a/stash/stash_engine/app/views/stash_engine/downloads/download_limit.html.erb b/stash/stash_engine/app/views/stash_engine/downloads/download_limit.html.erb new file mode 100644 index 0000000000..2dc521f7a8 --- /dev/null +++ b/stash/stash_engine/app/views/stash_engine/downloads/download_limit.html.erb @@ -0,0 +1,20 @@ +<% @page_title = "Concurrent Download Limits" %> +
+ In order to maintain a responsive service for all users, we limit concurrent downloads to + <%= StashEngine::DownloadsController::CONCURRENT_DOWNLOAD_LIMIT %> at a time. When an in-progress download + completes, you may start another. +
+ ++ If you need to download all or most files in a dataset, please use the Download dataset button since it will give you + all the files in a single zip package and will usually also reduce your download time and size. +
+ ++ If you are getting this message and you're using an automated robot or crawler, please reduce your usage to fewer + concurrent downloads at the same time. You should also be aware of the + robots.txt standard which is a voluntary + standard honored by most well-behaved crawlers. +
diff --git a/stash/stash_engine/db/migrate/20200102182121_create_stash_engine_download_histories.rb b/stash/stash_engine/db/migrate/20200102182121_create_stash_engine_download_histories.rb new file mode 100644 index 0000000000..8572c6a631 --- /dev/null +++ b/stash/stash_engine/db/migrate/20200102182121_create_stash_engine_download_histories.rb @@ -0,0 +1,15 @@ +class CreateStashEngineDownloadHistories < ActiveRecord::Migration + def change + create_table :stash_engine_download_histories do |t| + t.string :ip_address, index: true + t.text :user_agent + t.references :resource, index: true + t.references :file_upload, index: true + t.column :state, "ENUM('downloading', 'finished') DEFAULT 'downloading'" + t.timestamps null: false + + t.index :user_agent, length: { user_agent: 100 } + t.index :state + end + end +end diff --git a/stash/stash_engine/lib/stash/download/base.rb b/stash/stash_engine/lib/stash/download/base.rb index 55581a8a58..81e1c70365 100644 --- a/stash/stash_engine/lib/stash/download/base.rb +++ b/stash/stash_engine/lib/stash/download/base.rb @@ -20,6 +20,7 @@ class Base def initialize(controller_context:) @cc = controller_context + @download_history = nil end # this is a method that should be overridden @@ -73,7 +74,7 @@ def send_headers(stream:, header_obj:, filename:) raise ex end - # rubocop:disable Metrics/AbcSize, Metrics/MethodLength + # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity def send_stream(user_stream:, merritt_stream:) # use this file to write contents of the stream FileUtils.mkdir_p(Rails.root.join('uploads')) # ensures this file is created if it doesn't exist, needed mostly for tests @@ -85,7 +86,12 @@ def send_stream(user_stream:, merritt_stream:) # read past the end of what has been written so far read_file = ::File.open(write_file, 'r') + @download_canceled = false + write_thread = Thread.new do + # tracking downloads needs to happen in the threads + @download_history = StashEngine::DownloadHistory.mark_start(ip: cc.request.remote_ip, user_agent: cc.request.user_agent, + resource_id: @resource_id, file_id: @file_id) # this only modifies the write file with contents of merritt stream save_to_file(merritt_stream: merritt_stream, write_file: write_file) end @@ -107,14 +113,16 @@ def send_stream(user_stream:, merritt_stream:) read_file&.close unless read_file&.closed? write_file&.close unless write_file&.closed? ::File.unlink(write_file&.path) if ::File.exist?(write_file&.path) + StashEngine::DownloadHistory.mark_end(download_history: @download_history) unless @download_history.nil? end - # rubocop:enable Metrics/AbcSize, Metrics/MethodLength + # rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity def save_to_file(merritt_stream:, write_file:) chunk_size = 1024 * 512 # 512k while (chunk = merritt_stream.readpartial(chunk_size)) write_file.write(chunk) + break if @download_canceled end rescue EOFError => ex # I believe Ruby has this error with certain kinds of IO objects such as StringIO in testing, but seems to have written @@ -146,6 +154,7 @@ def stream_from_file(read_file:, write_file:, user_stream:) ensure read_file.close unless read_file.closed? user_stream.close unless user_stream.closed? + @download_canceled = true # set user download canceled (finished) to true in shared state to notify other thread to terminate its download end # rubocop:enable Metrics/CyclomaticComplexity end diff --git a/stash/stash_engine/lib/stash/download/file.rb b/stash/stash_engine/lib/stash/download/file.rb index 63024d998e..1d6f42f775 100644 --- a/stash/stash_engine/lib/stash/download/file.rb +++ b/stash/stash_engine/lib/stash/download/file.rb @@ -12,7 +12,9 @@ class File < Base # this downloads a file as a stream from Merritt express def download(file:) @file = file - # StashEngine::CounterLogger.version_download_hit(request: cc.request, resource: resource) + # needs to be set for tracking downloads + @resource_id = @file&.resource&.id + @file_id = @file.id stream_response(url: file.merritt_express_url, tenant: file.resource.tenant, filename: disposition_filename) end diff --git a/stash/stash_engine/lib/stash/download/version.rb b/stash/stash_engine/lib/stash/download/version.rb index 2d5f99f041..0ea471a838 100644 --- a/stash/stash_engine/lib/stash/download/version.rb +++ b/stash/stash_engine/lib/stash/download/version.rb @@ -33,6 +33,10 @@ def download(resource:) if @async_download yield else + # needs to be set for tracking downloads + @resource_id = resource.id + @file_id = nil + StashEngine::CounterLogger.version_download_hit(request: cc.request, resource: resource) # longer read timeout since Merritt may need to assemble zip file before starting download stream_response(url: resource.merritt_producer_download_uri, tenant: resource.tenant, filename: disposition_filename, read_timeout: 180) diff --git a/stash/stash_engine/lib/tasks/download_tracking.rake b/stash/stash_engine/lib/tasks/download_tracking.rake new file mode 100644 index 0000000000..b075a9083c --- /dev/null +++ b/stash/stash_engine/lib/tasks/download_tracking.rake @@ -0,0 +1,16 @@ +namespace :download_tracking do + + # because of our privacy policy we don't keep most logs or log-like info for more than 60 days as I understand it + desc 'cleanup download history' + task cleanup: :environment do + # Reset downloads from table that show longer than 24 hours in progress, since most likely the status is wrong. + # If the server is shut down in a disorderly way, the 'ensure' clause may not run to close out download info. + # It seems to happen quite rarely (also leaves stray download files in our EFS mount in the rare cases). + StashEngine::DownloadHistory.downloading.where('created_at < ?', 1.day.ago).update_all(state: 'finished') + + # clean up information older than 60 days to comply with privacy and no callbacks + StashEngine::DownloadHistory.where('created_at < ?', 60.days.ago).delete_all + puts "#{Time.new.iso8601} Finished DownloadHistory cleanup" + end + +end diff --git a/stash/stash_engine/spec/unit/stash/download/base_spec.rb b/stash/stash_engine/spec/unit/stash/download/base_spec.rb index 6efe273b13..70a5aa50da 100644 --- a/stash/stash_engine/spec/unit/stash/download/base_spec.rb +++ b/stash/stash_engine/spec/unit/stash/download/base_spec.rb @@ -19,8 +19,12 @@ module Download # need to hack in Rails.root because our test framework setup sucks rails_root = Dir.mktmpdir('rails_root') allow(Rails).to receive(:root).and_return(Pathname.new(rails_root)) + @my_request = 'requesty' + allow(@my_request).to receive(:remote_ip).and_return('127.0.0.1') + allow(@my_request).to receive(:user_agent).and_return('HorseBrowser 1.1') @controller_context = 'blah' + allow(@controller_context).to receive(:request).and_return(@my_request) @logger = ActiveSupport::Logger.new(STDOUT) allow(@controller_context).to receive(:logger).and_return(@logger) end diff --git a/stash/stash_engine/spec/unit/stash/download/file_spec.rb b/stash/stash_engine/spec/unit/stash/download/file_spec.rb index c639617d03..6ac59e0e14 100644 --- a/stash/stash_engine/spec/unit/stash/download/file_spec.rb +++ b/stash/stash_engine/spec/unit/stash/download/file_spec.rb @@ -1,6 +1,8 @@ +require 'db_spec_helper' require 'spec_helper' require 'stash/download/file' require 'ostruct' +require_relative '../../../../../spec_helpers/factory_helper' # a base class for version and file downloads, providing some basic functions module Stash @@ -10,11 +12,16 @@ module Download describe '#download' do before(:each) do + # there is no db connection here so it blows up using real models + @identifier = create(:identifier) + @resource = create(:resource, identifier_id: @identifier.id) + @file = File.new(controller_context: OpenStruct.new(response_body: '', response: OpenStruct.new(headers: {}))) @file_upload = create(:file_upload) allow(@file_upload).to receive(:merritt_express_url).and_return('http://grah.example.com') allow(@file_upload).to receive_message_chain('resource.tenant') { 'hi, not really used' } + allow(@file_upload).to receive_message_chain('resource.id') { 22 } end it 'sets the @file automatically before downloading' do