diff --git a/coll-health-obj-analysis/config/os_queries.ssm.yml b/coll-health-obj-analysis/config/os_queries.ssm.yml index 8d269ad..307a55e 100644 --- a/coll-health-obj-analysis/config/os_queries.ssm.yml +++ b/coll-health-obj-analysis/config/os_queries.ssm.yml @@ -8,7 +8,8 @@ options: start: 0 limit: 10 max_file_per_object: 1000 - output: console + output: console, + max_file_size: 1000000000 outputs: console: class: ConsoleOutput @@ -18,6 +19,8 @@ outputs: class: FilesOutput fits: class: FitsOutput + fits-filtered: + class: FitsFilteredOutput queries: default: @@ -55,3 +58,9 @@ queries: query: match: tests.results.mime-not-found: FAIL + # bundle exec ruby object_health_query.rb --fmt=octet-stream --file_mime_regex=octet-stream --output=fits --max_file_per_object=5 + octet-stream: + class: OSFilesFormatter + query: + match: + build.producer.mime_type: octet-stream diff --git a/coll-health-obj-analysis/fits_outputter.rb b/coll-health-obj-analysis/fits_outputter.rb index e3f9892..8657884 100644 --- a/coll-health-obj-analysis/fits_outputter.rb +++ b/coll-health-obj-analysis/fits_outputter.rb @@ -46,6 +46,10 @@ def run_fits(fname) system("#{fitscmd} -f #{fitscfg} -i '#{fname}' > #{fits_output} 2> #{fits_err}") end + def record_stat(stat) + # override in derived class + end + def format_fits_output return unless File.exist?(fits_output) @@ -56,15 +60,16 @@ def format_fits_output stat = 'NA' if stat.empty? c = doc.xpath('count(identity)') count = c > 1 ? "(#{c.to_i} identities)" : '' - puts "\t\tStatus: #{stat} #{count}" + write "\t\tStatus: #{stat} #{count}" + record_stat(stat) doc.xpath('identity').each do |id| tools = [] id.xpath('tool').each do |t| tools.append(t.xpath('@toolname').text) end - puts "\t\t #{id.xpath('@format')} (#{id.xpath('@mimetype')}): #{tools}" + write "\t\t #{id.xpath('@format')} (#{id.xpath('@mimetype')}): #{tools}" id.xpath('externalIdentifier').each do |ei| - puts "\t\t #{ei.xpath('@type')}: #{ei.text}" + write "\t\t #{ei.xpath('@type')}: #{ei.text}" end end end @@ -79,7 +84,7 @@ def format_fits_output end msg = doc.xpath('message[1]').text msg = "Msg: #{msg}" unless msg.empty? - puts "\t\t#{fswf}. #{fsv}. #{msg}" unless fswf.empty? && fsv.empty? && msg.empty? + write "\t\t#{fswf}. #{fsv}. #{msg}" unless fswf.empty? && fsv.empty? && msg.empty? end # xml.xpath('/fits/fileinfo').each do |doc| # doc.xpath('creatingApplicationName').each do |app| @@ -97,16 +102,47 @@ def format_fits_output end def output(rec, index) - puts "#{index}. #{rec[:ark]} (#{rec[:producer_count]} files)" + write "#{index}. #{rec[:ark]} (#{rec[:producer_count]} files)" rec.fetch(:files, []).each do |f| - puts "\t#{f.fetch(:path, '')} (#{f.fetch(:mime_type, '')})" - puts + sz = f.fetch(:billable_size, 0) + write "\t#{f.fetch(:path, '')} (#{f.fetch(:mime_type, '')}) #{ObjectHealthUtil.num_format(sz)}" + write fname = "#{fileid_basename}#{f[:ext]}" cleanup_last_fileid download_file_to_identify(fname, f[:url]) run_fits(fname) format_fits_output - puts + write end + flush + end + + def write(str = '') + puts str + end + + def flush; end +end + +# Invoke FITS, only generate output if at least one file returns a status other than UNKNOWN +class FitsFilteredOutput < FitsOutput + def initialize(merritt_config) + @msg = [] + @interesting = false + super(merritt_config) + end + + def write(str = '') + @msg.append(str) + end + + def flush + puts @msg if @interesting + @msg = [] + @interesting = false + end + + def record_stat(stat) + @interesting = true unless stat == 'UNKNOWN' end end diff --git a/coll-health-obj-analysis/object_health_query.rb b/coll-health-obj-analysis/object_health_query.rb index 46dcb04..f760db2 100644 --- a/coll-health-obj-analysis/object_health_query.rb +++ b/coll-health-obj-analysis/object_health_query.rb @@ -100,13 +100,20 @@ def make_options(argv) opts.on('--file_path_regex=REGEX', 'Regex to filter files to return by pathname') do |n| options[:file_path_regex] = Regexp.new(n) end + opts.on('--exclude_file_path_regex=REGEX', 'Exclude Regex to filter files to return by pathname') do |n| + options[:exclude_file_path_regex] = Regexp.new(n) + end opts.on('--file_mime_regex=REGEX', 'Regex to filter files to return by mime_type') do |n| options[:file_mime_regex] = Regexp.new(n) end end.parse(argv) # the default extractor does not pull file details... change the formatter if needed - options[:fmt] = :files if options[:fmt] == :default && (options[:output] == :files || options[:output] == :fits) + if options[:fmt] == :default && + (options[:output] == :files || options[:output] == :fits || options[:output] == :'fits-filtered') + options[:fmt] = + :files + end options end diff --git a/coll-health-obj-analysis/os_formatter.rb b/coll-health-obj-analysis/os_formatter.rb index 2fbf545..234724b 100644 --- a/coll-health-obj-analysis/os_formatter.rb +++ b/coll-health-obj-analysis/os_formatter.rb @@ -104,8 +104,9 @@ def file_test(file) end def file_filters(file) - b = true + b = file['billable_size'] < @options[:max_file_size] b &= file['pathname'] =~ @options[:file_path_regex] if @options[:file_path_regex] + b &= file['pathname'] !~ @options[:exclude_file_path_regex] if @options[:exclude_file_path_regex] b &= file['mime_type'] =~ @options[:file_mime_regex] if @options[:file_mime_regex] b end @@ -126,7 +127,8 @@ def files path: "#{v}/#{p}", url: "#{file_url}/#{v}/#{pesc}", mime_type: f.fetch('mime_type', ''), - ext: f.fetch('ext', '') + ext: f.fetch('ext', ''), + billable_size: f.fetch('billable_size', 0) } ) break if rfiles.length >= @options.fetch(:max_file_per_object, 5) diff --git a/coll-health-obj-analysis/outputters.rb b/coll-health-obj-analysis/outputters.rb index c360950..4443f26 100644 --- a/coll-health-obj-analysis/outputters.rb +++ b/coll-health-obj-analysis/outputters.rb @@ -12,7 +12,8 @@ class ConsoleOutput < OutputConfig def output(rec, index) puts "#{index}. #{rec[:ark]} (#{rec[:producer_count]} files)" rec.fetch(:files, []).each do |f| - puts "\t#{f.fetch(:path, '')} (#{f.fetch(:mime_type, '')})" + sz = f.fetch(:billable_size, 0) + puts "\t#{f.fetch(:path, '')} (#{f.fetch(:mime_type, '')}) #{ObjectHealthUtil.num_format(sz)}" end end end