Skip to content

Commit

Permalink
Merge pull request #39 from CDL-Dryad/download-limits
Browse files Browse the repository at this point in the history
Limit simultaneous downloads by ip address
  • Loading branch information
ryscher authored Jan 6, 2020
2 parents 2036e2e + 9e15338 commit 3432042
Show file tree
Hide file tree
Showing 16 changed files with 239 additions and 20 deletions.
35 changes: 35 additions & 0 deletions spec/models/stash_engine/download_history_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Trying to add new models to this area instead of inside 'stash' subdirectories since it's much easier to maintain tests
# inside of here than in the engines. Also the loading of tests isn't hacked together in some unmaintainable and non-standard
# way like they are inside the engine tests.

require 'rails_helper'

module StashEngine
RSpec.describe DownloadHistory, type: :model do

describe :mark_event do
before(:each) do
DownloadHistory.mark_start(ip: '168.10.0.1', user_agent: 'HorseStomping Browser', resource_id: 37, file_id: 88)
end

it 'adds started downloads to the database' do
expect(DownloadHistory.all.count).to eq(1)
dlh = DownloadHistory.all.first
expect(dlh.ip_address).to eq('168.10.0.1')
expect(dlh.user_agent).to eq('HorseStomping Browser')
expect(dlh.state).to eq('downloading')
end

it 'modifies finished download to change state and updated_at timestamp' do
expect(DownloadHistory.all.count).to eq(1)
dlh = DownloadHistory.all.first
updated_at_before = dlh.updated_at
sleep 1 # just to be sure timestamps are different seconds
DownloadHistory.mark_end(download_history: dlh)
dlh.reload
expect(dlh.updated_at).not_to eq(updated_at_before)
expect(dlh.state).to eq('finished')
end
end
end
end
41 changes: 41 additions & 0 deletions spec/support/tasks.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# see https://raw.githubusercontent.com/eliotsykes/rails-testing-toolbox/master/tasks.rb
# and https://www.eliotsykes.com/test-rails-rake-tasks-with-rspec
# For testing rake tasks
require 'rake'

# Task names should be used in the top-level describe, with an optional
# "rake "-prefix for better documentation. Both of these will work:
#
# 1) describe 'foo:bar' do ... end
#
# 2) describe 'rake foo:bar' do ... end
#
# Favor including 'rake '-prefix as in the 2nd example above as it produces
# doc output that makes it clear a rake task is under test and how it is
# invoked.
module TaskExampleGroup
extend ActiveSupport::Concern

included do
let(:task_name) { self.class.top_level_description.sub(/\Arake /, '') }
let(:tasks) { Rake::Task }
subject(:task) { tasks[task_name] }

after(:each) do
task.all_prerequisite_tasks.each { |prerequisite| tasks[prerequisite].reenable }
task.reenable
end
end
end

RSpec.configure do |config|
config.define_derived_metadata(file_path: %r{/spec/tasks/}) do |metadata|
metadata[:type] = :task
end

config.include TaskExampleGroup, type: :task

config.before(:suite) do
Rails.application.load_tasks
end
end
33 changes: 33 additions & 0 deletions spec/tasks/download_tracking_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
require 'rails_helper'
require 'byebug'

describe 'download_tracking:cleanup', type: :task do

it 'preloads the Rails environment' do
expect(task.prerequisites).to include 'environment'
end

it 'logs to stdout' do
expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout
end

it 'cleans up old entries more than 60 days old' do
StashEngine::DownloadHistory.create(ip_address: '127.0.0.1', created_at: 65.days.ago)
StashEngine::DownloadHistory.create(ip_address: '127.0.0.1')
expect(StashEngine::DownloadHistory.all.count).to eq(2)
expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout
# it removed the old one
expect(StashEngine::DownloadHistory.all.count).to eq(1)
end

it 'fixes download status for non-updated items more than one day old' do
StashEngine::DownloadHistory.create(ip_address: '127.0.0.1', created_at: 3.days.ago, state: 'downloading')
dlh = StashEngine::DownloadHistory.all
expect(dlh.count).to eq(1)
item = dlh.first
expect(item.state).to eq('downloading')
expect { task.execute }.to output(/Finished DownloadHistory cleanup/).to_stdout
item.reload
expect(item.state).to eq('finished')
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
# rubocop:disable Metrics/ClassLength
module StashEngine
class DownloadsController < ApplicationController
before_action :setup_streaming

# set up the Merritt file & version objects so they have access to the controller context before continuing
def setup_streaming
@version_streamer = Stash::Download::Version.new(controller_context: self)
@file_streamer = Stash::Download::File.new(controller_context: self)
CONCURRENT_DOWNLOAD_LIMIT = 2

before_action :check_user_agent, :check_ip, :stop_download_hogs, :setup_streaming

# This is pretty hacky, but is a temporary fix since we are getting more bad actors hitting expensive downloads.
def check_user_agent
# This reads a text file with one line and a regular expression in it and blocks if the user-agent matches the regexp
agent_path = Rails.root.join('uploads', 'blacklist_agents.txt').to_s
head(429) if File.exist?(agent_path) && request.user_agent[Regexp.new(File.read(agent_path))]
end

def check_ip
# This looks for uploads/blacklist.txt and if it's there matches IP addresses that start with things in the file--
# one IP address (or beginning of IP Address) per line.
#
# Can do something better than this when we have more time. This is a stopgap to stop excessive usage just before
# a holiday when we don't have other good ways to block yet without assistance from others.

block_path = Rails.root.join('uploads', 'blacklist.txt').to_s
return unless File.exist?(block_path)

Expand All @@ -32,6 +32,17 @@ def setup_streaming
end
end

def stop_download_hogs
dl_count = DownloadHistory.where(ip_address: request&.remote_ip).downloading.count
render 'download_limit', status: 429 if dl_count >= CONCURRENT_DOWNLOAD_LIMIT
end

# set up the Merritt file & version objects so they have access to the controller context before continuing
def setup_streaming
@version_streamer = Stash::Download::Version.new(controller_context: self)
@file_streamer = Stash::Download::File.new(controller_context: self)
end

# for downloading the full version
def download_resource
@resource = Resource.find(params[:resource_id])
Expand Down Expand Up @@ -120,11 +131,6 @@ def redirect_to_public
)
end

def stream_download
CounterLogger.version_download_hit(request: request, resource: @resource)
Stash::Download::Version.stream_response(url: @resource.merritt_producer_download_uri, tenant: @resource.tenant)
end

def flash_download_unavailable
flash[:notice] = [
'This dataset was recently submitted and downloads are not yet available.',
Expand Down
4 changes: 2 additions & 2 deletions stash/stash_engine/app/models/stash_engine/counter_stat.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ def update_citation_count!
def calendar_week(time)
# %W calculates weeks based on starting Monday and not Sunday, %U is Sunday and %V is ???.
# This produces year-week string.
return (Time.new - 30.days).strftime('%G-%W') if time.nil? || !time.is_a?(Time)
return 30.days.ago.strftime('%Y-%W') if time.nil? || !time.is_a?(Time)

time.strftime('%G-%W')
time.strftime('%Y-%W')
end

end
Expand Down
21 changes: 21 additions & 0 deletions stash/stash_engine/app/models/stash_engine/download_history.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
module StashEngine
class DownloadHistory < ActiveRecord::Base
belongs_to :resource, class_name: 'StashEngine::Resource'
belongs_to :file_upload, class_name: 'StashEngine::FileUpload'
enum state: %w[downloading finished].map { |i| [i.to_sym, i] }.to_h

scope :downloading, -> { where(state: 'downloading') }

def self.mark_start(ip:, user_agent:, resource_id:, file_id: nil)
create(ip_address: ip, user_agent: user_agent, resource_id: resource_id, file_upload_id: file_id, state: 'downloading')
end

# this just changes the status so it is marked as finished
def self.mark_end(download_history: nil)
return if download_history.nil?

download_history.update(state: 'finished')
end

end
end
5 changes: 5 additions & 0 deletions stash/stash_engine/app/models/stash_engine/file_upload.rb
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
require 'zaru'
require 'cgi'

# rubocop:disable Metrics/ClassLength
module StashEngine
class FileUpload < ActiveRecord::Base
belongs_to :resource, class_name: 'StashEngine::Resource'
has_many :download_histories, class_name: 'StashEngine::DownloadHistory', dependent: :destroy

include StashEngine::Concerns::ResourceUpdated
# mount_uploader :uploader, FileUploader # it seems like maybe I don't need this since I'm doing so much manually

Expand Down Expand Up @@ -143,3 +147,4 @@ def self.sanitize_file_name(name)
end
end
end
# rubocop:enable Metrics/ClassLength
1 change: 1 addition & 0 deletions stash/stash_engine/app/models/stash_engine/resource.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Resource < ActiveRecord::Base # rubocop:disable Metrics/ClassLength
has_many :edit_histories, class_name: 'StashEngine::EditHistory', dependent: :destroy
has_many :curation_activities, -> { order(id: :asc) }, class_name: 'StashEngine::CurationActivity', dependent: :destroy
has_many :repo_queue_states, class_name: 'StashEngine::RepoQueueState', dependent: :destroy
has_many :download_histories, class_name: 'StashEngine::DownloadHistory', dependent: :destroy

accepts_nested_attributes_for :curation_activities

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<% @page_title = "Concurrent Download Limits" %>
<h1 class="o-heading__level1">Concurrent download limits</h1>

<p>
In order to maintain a responsive service for all users, we limit concurrent downloads to
<%= StashEngine::DownloadsController::CONCURRENT_DOWNLOAD_LIMIT %> at a time. When an in-progress download
completes, you may start another.
</p>

<p>
If you need to download all or most files in a dataset, please use the <em>Download dataset</em> button since it will give you
all the files in a single zip package and will usually also reduce your download time and size.
</p>

<p>
If you are getting this message and you're using an automated robot or crawler, please reduce your usage to fewer
concurrent downloads at the same time. You should also be aware of the
<a href="https://en.wikipedia.org/wiki/Robots_exclusion_standard">robots.txt</a> standard which is a voluntary
standard honored by most well-behaved crawlers.
</p>
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
class CreateStashEngineDownloadHistories < ActiveRecord::Migration
def change
create_table :stash_engine_download_histories do |t|
t.string :ip_address, index: true
t.text :user_agent
t.references :resource, index: true
t.references :file_upload, index: true
t.column :state, "ENUM('downloading', 'finished') DEFAULT 'downloading'"
t.timestamps null: false

t.index :user_agent, length: { user_agent: 100 }
t.index :state
end
end
end
13 changes: 11 additions & 2 deletions stash/stash_engine/lib/stash/download/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Base

def initialize(controller_context:)
@cc = controller_context
@download_history = nil
end

# this is a method that should be overridden
Expand Down Expand Up @@ -73,7 +74,7 @@ def send_headers(stream:, header_obj:, filename:)
raise ex
end

# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity
def send_stream(user_stream:, merritt_stream:)
# use this file to write contents of the stream
FileUtils.mkdir_p(Rails.root.join('uploads')) # ensures this file is created if it doesn't exist, needed mostly for tests
Expand All @@ -85,7 +86,12 @@ def send_stream(user_stream:, merritt_stream:)
# read past the end of what has been written so far
read_file = ::File.open(write_file, 'r')

@download_canceled = false

write_thread = Thread.new do
# tracking downloads needs to happen in the threads
@download_history = StashEngine::DownloadHistory.mark_start(ip: cc.request.remote_ip, user_agent: cc.request.user_agent,
resource_id: @resource_id, file_id: @file_id)
# this only modifies the write file with contents of merritt stream
save_to_file(merritt_stream: merritt_stream, write_file: write_file)
end
Expand All @@ -107,14 +113,16 @@ def send_stream(user_stream:, merritt_stream:)
read_file&.close unless read_file&.closed?
write_file&.close unless write_file&.closed?
::File.unlink(write_file&.path) if ::File.exist?(write_file&.path)
StashEngine::DownloadHistory.mark_end(download_history: @download_history) unless @download_history.nil?
end
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity

def save_to_file(merritt_stream:, write_file:)
chunk_size = 1024 * 512 # 512k

while (chunk = merritt_stream.readpartial(chunk_size))
write_file.write(chunk)
break if @download_canceled
end
rescue EOFError => ex
# I believe Ruby has this error with certain kinds of IO objects such as StringIO in testing, but seems to have written
Expand Down Expand Up @@ -146,6 +154,7 @@ def stream_from_file(read_file:, write_file:, user_stream:)
ensure
read_file.close unless read_file.closed?
user_stream.close unless user_stream.closed?
@download_canceled = true # set user download canceled (finished) to true in shared state to notify other thread to terminate its download
end
# rubocop:enable Metrics/CyclomaticComplexity
end
Expand Down
4 changes: 3 additions & 1 deletion stash/stash_engine/lib/stash/download/file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ class File < Base
# this downloads a file as a stream from Merritt express
def download(file:)
@file = file
# StashEngine::CounterLogger.version_download_hit(request: cc.request, resource: resource)
# needs to be set for tracking downloads
@resource_id = @file&.resource&.id
@file_id = @file.id

stream_response(url: file.merritt_express_url, tenant: file.resource.tenant, filename: disposition_filename)
end
Expand Down
4 changes: 4 additions & 0 deletions stash/stash_engine/lib/stash/download/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ def download(resource:)
if @async_download
yield
else
# needs to be set for tracking downloads
@resource_id = resource.id
@file_id = nil

StashEngine::CounterLogger.version_download_hit(request: cc.request, resource: resource)
# longer read timeout since Merritt may need to assemble zip file before starting download
stream_response(url: resource.merritt_producer_download_uri, tenant: resource.tenant, filename: disposition_filename, read_timeout: 180)
Expand Down
16 changes: 16 additions & 0 deletions stash/stash_engine/lib/tasks/download_tracking.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
namespace :download_tracking do

# because of our privacy policy we don't keep most logs or log-like info for more than 60 days as I understand it
desc 'cleanup download history'
task cleanup: :environment do
# Reset downloads from table that show longer than 24 hours in progress, since most likely the status is wrong.
# If the server is shut down in a disorderly way, the 'ensure' clause may not run to close out download info.
# It seems to happen quite rarely (also leaves stray download files in our EFS mount in the rare cases).
StashEngine::DownloadHistory.downloading.where('created_at < ?', 1.day.ago).update_all(state: 'finished')

# clean up information older than 60 days to comply with privacy and no callbacks
StashEngine::DownloadHistory.where('created_at < ?', 60.days.ago).delete_all
puts "#{Time.new.iso8601} Finished DownloadHistory cleanup"
end

end
4 changes: 4 additions & 0 deletions stash/stash_engine/spec/unit/stash/download/base_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ module Download
# need to hack in Rails.root because our test framework setup sucks
rails_root = Dir.mktmpdir('rails_root')
allow(Rails).to receive(:root).and_return(Pathname.new(rails_root))
@my_request = 'requesty'
allow(@my_request).to receive(:remote_ip).and_return('127.0.0.1')
allow(@my_request).to receive(:user_agent).and_return('HorseBrowser 1.1')

@controller_context = 'blah'
allow(@controller_context).to receive(:request).and_return(@my_request)
@logger = ActiveSupport::Logger.new(STDOUT)
allow(@controller_context).to receive(:logger).and_return(@logger)
end
Expand Down
Loading

0 comments on commit 3432042

Please sign in to comment.