Skip to content

Commit

Permalink
add exclude regex filter
Browse files Browse the repository at this point in the history
  • Loading branch information
terrywbrady committed Dec 20, 2023
1 parent f256a94 commit cfe4208
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 13 deletions.
11 changes: 10 additions & 1 deletion coll-health-obj-analysis/config/os_queries.ssm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ options:
start: 0
limit: 10
max_file_per_object: 1000
output: console
output: console
max_file_size: 1000000000
outputs:
console:
class: ConsoleOutput
Expand All @@ -18,6 +19,8 @@ outputs:
class: FilesOutput
fits:
class: FitsOutput
fits-filtered:
class: FitsFilteredOutput

queries:
default:
Expand Down Expand Up @@ -55,3 +58,9 @@ queries:
query:
match:
tests.results.mime-not-found: FAIL
# bundle exec ruby object_health_query.rb --fmt=octet-stream --file_mime_regex=octet-stream --output=fits --max_file_per_object=5
octet-stream:
class: OSFilesFormatter
query:
match:
build.producer.mime_type: octet-stream
52 changes: 44 additions & 8 deletions coll-health-obj-analysis/fits_outputter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ def run_fits(fname)
system("#{fitscmd} -f #{fitscfg} -i '#{fname}' > #{fits_output} 2> #{fits_err}")
end

# Hook that receives each FITS identification status string reported by
# format_fits_output. The base implementation deliberately ignores it;
# derived classes (e.g. FitsFilteredOutput) override it to track statuses.
def record_stat(stat); end

def format_fits_output
return unless File.exist?(fits_output)

Expand All @@ -56,15 +60,16 @@ def format_fits_output
stat = 'NA' if stat.empty?
c = doc.xpath('count(identity)')
count = c > 1 ? "(#{c.to_i} identities)" : ''
puts "\t\tStatus: #{stat} #{count}"
write "\t\tStatus: #{stat} #{count}"
record_stat(stat)
doc.xpath('identity').each do |id|
tools = []
id.xpath('tool').each do |t|
tools.append(t.xpath('@toolname').text)
end
puts "\t\t #{id.xpath('@format')} (#{id.xpath('@mimetype')}): #{tools}"
write "\t\t #{id.xpath('@format')} (#{id.xpath('@mimetype')}): #{tools}"
id.xpath('externalIdentifier').each do |ei|
puts "\t\t #{ei.xpath('@type')}: #{ei.text}"
write "\t\t #{ei.xpath('@type')}: #{ei.text}"
end
end
end
Expand All @@ -79,7 +84,7 @@ def format_fits_output
end
msg = doc.xpath('message[1]').text
msg = "Msg: #{msg}" unless msg.empty?
puts "\t\t#{fswf}. #{fsv}. #{msg}" unless fswf.empty? && fsv.empty? && msg.empty?
write "\t\t#{fswf}. #{fsv}. #{msg}" unless fswf.empty? && fsv.empty? && msg.empty?
end
# xml.xpath('/fits/fileinfo').each do |doc|
# doc.xpath('creatingApplicationName').each do |app|
Expand All @@ -97,16 +102,47 @@ def format_fits_output
end

# Report one object and run FITS against each of its files.
#
# All text goes through `write` (never straight to stdout) and the record is
# finished with `flush`, so subclasses that buffer output -- such as
# FitsFilteredOutput -- can decide whether anything is printed at all.
#
# rec   - Hash with :ark, :producer_count and an optional :files array; each
#         file entry may carry :path, :mime_type, :billable_size, :ext, :url.
# index - position of the record in the result set, used as a line label.
#
# Returns whatever `flush` returns.
def output(rec, index)
  write "#{index}. #{rec[:ark]} (#{rec[:producer_count]} files)"
  rec.fetch(:files, []).each do |f|
    sz = f.fetch(:billable_size, 0)
    write "\t#{f.fetch(:path, '')} (#{f.fetch(:mime_type, '')}) #{ObjectHealthUtil.num_format(sz)}"
    write
    fname = "#{fileid_basename}#{f[:ext]}"
    # Remove the previously downloaded file before fetching the next one.
    cleanup_last_fileid
    download_file_to_identify(fname, f[:url])
    run_fits(fname)
    format_fits_output
    write
  end
  flush
end

# Emit one line of report text on standard output. With no argument an empty
# line is printed. Subclasses may override this to capture the text instead.
def write(str = '')
  $stdout.puts(str)
end

# Called once a record has been fully written. Nothing is buffered at this
# level, so there is nothing to do; buffering subclasses override it.
def flush
end
end

# FITS outputter that stays silent for uninteresting objects.
#
# Every line passed to `write` is held in memory. When `flush` is called the
# held lines are printed only if `record_stat` saw at least one status other
# than 'UNKNOWN' since the previous flush; either way the buffer is cleared.
class FitsFilteredOutput < FitsOutput
  def initialize(merritt_config)
    # State is set up before delegating to the parent constructor.
    @msg = []
    @interesting = false
    super
  end

  # Hold the line instead of printing it immediately.
  def write(str = '')
    @msg << str
  end

  # Print the held lines when they were marked interesting, then reset.
  def flush
    $stdout.puts(@msg) if @interesting
    @msg = []
    @interesting = false
  end

  # Any status other than 'UNKNOWN' marks the pending output as worth printing.
  def record_stat(stat)
    return if stat == 'UNKNOWN'

    @interesting = true
  end
end
9 changes: 8 additions & 1 deletion coll-health-obj-analysis/object_health_query.rb
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,20 @@ def make_options(argv)
opts.on('--file_path_regex=REGEX', 'Regex to filter files to return by pathname') do |n|
options[:file_path_regex] = Regexp.new(n)
end
opts.on('--exclude_file_path_regex=REGEX', 'Exclude Regex to filter files to return by pathname') do |n|
options[:exclude_file_path_regex] = Regexp.new(n)
end
opts.on('--file_mime_regex=REGEX', 'Regex to filter files to return by mime_type') do |n|
options[:file_mime_regex] = Regexp.new(n)
end
end.parse(argv)

# the default extractor does not pull file details... change the formatter if needed
options[:fmt] = :files if options[:fmt] == :default && (options[:output] == :files || options[:output] == :fits)
if options[:fmt] == :default &&
(options[:output] == :files || options[:output] == :fits || options[:output] == :'fits-filtered')
options[:fmt] =
:files
end
options
end

Expand Down
6 changes: 4 additions & 2 deletions coll-health-obj-analysis/os_formatter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@ def file_test(file)
end

# Decide whether a file record passes every configured filter.
#
# Each filter is applied only when its option is present in @options:
#   :max_file_size           - the file's 'billable_size' must be strictly smaller
#   :file_path_regex         - 'pathname' must match
#   :exclude_file_path_regex - 'pathname' must not match
#   :file_mime_regex         - 'mime_type' must match
#
# An unset :max_file_size means "no size limit" and a record without a
# 'billable_size' is treated as size 0, so neither case raises.
#
# file - hash-like file record read via the string keys named above.
#
# Returns true when the file should be kept, false otherwise.
def file_filters(file)
  limit = @options[:max_file_size]
  return false if limit && (file['billable_size'] || 0) >= limit

  path = file['pathname']
  include_re = @options[:file_path_regex]
  exclude_re = @options[:exclude_file_path_regex]
  mime_re = @options[:file_mime_regex]

  # Regexp#match? returns false for nil, so missing fields simply fail a
  # "must match" filter and pass a "must not match" filter.
  return false if include_re && !include_re.match?(path)
  return false if exclude_re&.match?(path)
  return false if mime_re && !mime_re.match?(file['mime_type'])

  true
end
Expand All @@ -126,7 +127,8 @@ def files
path: "#{v}/#{p}",
url: "#{file_url}/#{v}/#{pesc}",
mime_type: f.fetch('mime_type', ''),
ext: f.fetch('ext', '')
ext: f.fetch('ext', ''),
billable_size: f.fetch('billable_size', 0)
}
)
break if rfiles.length >= @options.fetch(:max_file_per_object, 5)
Expand Down
3 changes: 2 additions & 1 deletion coll-health-obj-analysis/outputters.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ class ConsoleOutput < OutputConfig
# Print one object and its files to standard output.
#
# The first line shows the index, ark and producer file count; each file then
# gets exactly one tab-indented line with its path, mime type and formatted
# billable size.
#
# rec   - Hash with :ark, :producer_count and an optional :files array; each
#         file entry may carry :path, :mime_type and :billable_size.
# index - position of the record in the result set, used as a line label.
def output(rec, index)
  puts "#{index}. #{rec[:ark]} (#{rec[:producer_count]} files)"
  rec.fetch(:files, []).each do |f|
    sz = f.fetch(:billable_size, 0)
    puts "\t#{f.fetch(:path, '')} (#{f.fetch(:mime_type, '')}) #{ObjectHealthUtil.num_format(sz)}"
  end
end
end
Expand Down

0 comments on commit cfe4208

Please sign in to comment.