Skip to content

Commit

Permalink
updated scripts and added new rake tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
alinvetian committed Dec 17, 2024
1 parent 8707972 commit e71145b
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 21 deletions.
35 changes: 30 additions & 5 deletions app/models/stash_engine/ror_org.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@ class RorOrg < ApplicationRecord

# Search the RorOrgs for the given string. This will search name, acronyms, aliases, etc.
# @return an Array of Hashes { id: 'https://ror.org/12345', name: 'Sample University' }
def self.find_by_ror_name(query, max_results = ROR_MAX_RESULTS)
def self.find_by_ror_name(query)
return [] unless query.present?

query = query.downcase
# First, find matches at the beginning of the name string, and exact matches in the acronyms/aliases
resp = where("LOWER(name) LIKE ? OR JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
"#{query}%", query.to_s, query.to_s).limit(max_results)
"#{query}%", query.to_s, query.to_s).limit(ROR_MAX_RESULTS)
results = resp.map do |r|
{ id: r.ror_id, name: r.name, country: r.country, acronyms: r.acronyms, aliases: r.aliases }
end

# If we don't have enough results, find matches at the beginning of the acronyms/aliases
if results.size < max_results
if results.size < ROR_MAX_RESULTS
resp = where("JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
"#{query}%", "#{query}%").limit(ROR_MAX_RESULTS - results.size)
resp.each do |r|
Expand All @@ -43,8 +43,8 @@ def self.find_by_ror_name(query, max_results = ROR_MAX_RESULTS)
end

# If we don't have enough results, find matches elsewhere in the name string
if results.size < max_results
resp = where('LOWER(name) LIKE ?', "%#{query}%").limit(max_results - results.size)
if results.size < ROR_MAX_RESULTS
resp = where('LOWER(name) LIKE ?', "%#{query}%").limit(ROR_MAX_RESULTS - results.size)
resp.each do |r|
results << { id: r.ror_id, name: r.name, country: r.country, acronyms: r.acronyms, aliases: r.aliases }
end
Expand All @@ -53,6 +53,31 @@ def self.find_by_ror_name(query, max_results = ROR_MAX_RESULTS)
results.flatten.uniq
end

# Search the RorOrgs for the given string, for use by auto-matching scripts
# where no human confirms the match.
#
# Matching is done in two passes, and the second pass only runs when the first
# one finds nothing:
#   1. name starts with the query, or an acronym/alias equals the query
#   2. an acronym/alias starts with the query
# Unlike find_by_ror_name, there is no "anywhere in the name" pass.
#
# @param query the organization name to look up; blank values return []
# @return an Array (at most 10 entries) of Hashes
#   { id: 'https://ror.org/12345', name: 'Sample University', country: ..., acronyms: ..., aliases: ... }
def self.find_by_name_for_auto_matching(query)
  return [] unless query.present?

  max_results = 10
  # Escape LIKE wildcards (% and _) so they are matched literally. JSON_SEARCH
  # treats them the same way as LIKE, so the escaped term is used for every bind.
  term = sanitize_sql_like(query.to_s.downcase)
  to_result = lambda do |r|
    { id: r.ror_id, name: r.name, country: r.country, acronyms: r.acronyms, aliases: r.aliases }
  end

  # First, find matches at the beginning of the name string, and exact matches in the acronyms/aliases
  results = where("LOWER(name) LIKE ? OR JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
                  "#{term}%", term, term).limit(max_results).map(&to_result)
  return results if results.any?

  # Nothing found yet: fall back to matches at the beginning of the acronyms/aliases
  where("JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
        "#{term}%", "#{term}%").limit(max_results).map(&to_result)
end

# Return the first match for the given name
# @return a StashEngine::RorOrg or nil
def self.find_first_by_ror_name(ror_name)
Expand Down
8 changes: 6 additions & 2 deletions cron/monthly.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,9 @@ bundle exec rails link_out:seed_genbank_ids >> /home/ec2-user/deploy/shared/log/
bundle exec rails link_out:publish >> /home/ec2-user/deploy/shared/log/link_out_publish.log 2>&1

# Update ROR organizations
bundle exec rails affiliation_import:update_ror_orgs >>/home/ec2-user/deploy/shared/log/ror_update.log 2>&1
bundle exec rails affiliation_import:update_affiliations_names >>/home/ec2-user/deploy/shared/log/affiliations_name_updates.log 2>&1
bundle exec rails affiliation_import:update_ror_orgs >> /home/ec2-user/deploy/shared/log/ror_update.log 2>&1
bundle exec rails affiliation_import:update_affiliations_names >> /home/ec2-user/deploy/shared/log/affiliations_name_updates.log 2>&1

# Cleanup affiliation/contributor records
bundle exec rails cleanup:affiliations_wo_ror >> /home/ec2-user/deploy/shared/log/affiliations_wo_ror_cleanup.log 2>&1
bundle exec rails cleanup:contributors_wo_ror >> /home/ec2-user/deploy/shared/log/contributors_wo_ror_cleanup.log 2>&1
11 changes: 4 additions & 7 deletions lib/stash/organization/affiliation_ror_matcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def connect_to_ror(item, ror)
rep ||= StashDatacite::Affiliation.from_ror_id(ror_id: ror_id)
to_fix = StashDatacite::Affiliation.where(ror_id: nil, long_name: item.long_name)

update_affiliation_name(rep, ror) if ror[:name] != rep.long_name
update_affiliation_name(rep, ror)

message = 'Replacing affiliations with'
puts " - #{message} name \"#{item.long_name}\" (ids: #{to_fix.ids}) with \"#{ror[:name]}\" (id: #{rep.id || 'new'})"
Expand All @@ -25,11 +25,6 @@ def connect_to_ror(item, ror)
return unless perform_updates

to_fix.each do |aff|
# updating contributors affiliation with new affiliation
aff.contributors.each do |contributor|
contributor.affiliations << rep unless contributor.affiliations.include?(rep)
end

# updating authors affiliation with new affiliation
aff.authors.each do |author|
author.affiliation = rep
Expand All @@ -39,8 +34,10 @@ def connect_to_ror(item, ror)
end

# Renames the representative affiliation to the ROR organization's name and
# records the change in the console output and the CSV report rows.
# Does nothing when the names already match. The rename itself is only
# persisted when perform_updates is truthy; the report entry is written either way.
#
# @param rep the affiliation record to rename (responds to long_name, id, authors, update)
# @param ror a Hash describing the ROR organization; only ror[:name] is read here
def update_affiliation_name(rep, ror)
  return if ror[:name] == rep.long_name

  # Capture the current name before updating: after `update`, rep.long_name
  # already equals ror[:name] and the report would show the new name as the old one.
  previous_name = rep.long_name
  rep.update(long_name: ror[:name]) if perform_updates

  message = 'Updating existing affiliation name'
  puts " - #{message} \"#{previous_name}\" (id: #{rep.id}) with \"#{ror[:name]}\""
  @csv_rows << [rep.id, previous_name, rep.authors.count, message, ror[:name]]
end
Expand Down
15 changes: 8 additions & 7 deletions lib/stash/organization/base_ror_matcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class BaseRorMatcher

attr_reader :perform_updates

def initialize(perform_updates: false, start_id: nil, end_id: nil, start_created_at: nil, end_created_at: nil)
def initialize(perform_updates: true, start_id: nil, end_id: nil, start_created_at: nil, end_created_at: nil)
@perform_updates = perform_updates
@end_id = end_id
@start_id = start_id
Expand Down Expand Up @@ -58,8 +58,8 @@ def end_report
[],
[@text.gsub('Processing', 'From')],
[" - Updated: #{@updates_count} records."],
[" - No ROR found: #{@no_ror_found_count} records."],
[" - Multiple RORs found: #{@multiple_ror_found_count} records."]
[" - Multiple RORs found: #{@multiple_ror_found_count} records."],
[" - No ROR found: #{@no_ror_found_count} records."]
]
update_csv_report(messages)

Expand Down Expand Up @@ -112,13 +112,14 @@ def handle_item(item, item_name, index)
return
end

rors = StashEngine::RorOrg.find_by_ror_name(item_name, 1)
rors = StashEngine::RorOrg.find_by_name_for_auto_matching(item_name)
case rors.count
when 0
@no_ror_found_count += 1
message = 'Could not find ROR'
@csv_rows << [item.id, item_name, message]
puts " - #{message} for \"#{item_name}\""
# Do not add to CSV report, nor log file, as it will increase the file size too much
# message = 'Could not find ROR'
# @csv_rows << [item.id, item_name, message]
# puts " - #{message} for \"#{item_name}\""
when 1
connect_to_ror(item, rors.first)
else
Expand Down
18 changes: 18 additions & 0 deletions lib/tasks/cleanup.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# :nocov:
namespace :cleanup do
  # Usage: RAILS_ENV=development bundle exec rake cleanup:affiliations_wo_ror
  # Background: https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/affiliations.md#cleaning-affiliation-names
  desc 'Match Affiliations with ROR organizations'
  task affiliations_wo_ror: :environment do
    # Built with the matcher's default options.
    matcher = Stash::Organization::AffiliationRorMatcher.new
    matcher.perform
  end

  # Usage: RAILS_ENV=development bundle exec rake cleanup:contributors_wo_ror
  # Background: https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/contributors.md#cleaning-contributor-names
  desc 'Match Contributors with ROR organizations'
  task contributors_wo_ror: :environment do
    # Built with the matcher's default options.
    matcher = Stash::Organization::ContributorRorMatcher.new
    matcher.perform
  end
end
# :nocov:

0 comments on commit e71145b

Please sign in to comment.