From a3e595ed8c98d3b9cbc2a9ef5340b45a4fff441b Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Thu, 10 Oct 2013 20:43:26 +0100 Subject: [PATCH 1/5] delete file of files and reduce mcl ram --- lib/Bio/PanGenome/External/Mcl.pm | 2 +- lib/Bio/PanGenome/PostAnalysis.pm | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Bio/PanGenome/External/Mcl.pm b/lib/Bio/PanGenome/External/Mcl.pm index d43ca0c..166a1b7 100644 --- a/lib/Bio/PanGenome/External/Mcl.pm +++ b/lib/Bio/PanGenome/External/Mcl.pm @@ -37,7 +37,7 @@ sub _build__memory_required_in_mb my ($self) = @_; # Todo: implement this equation for memory estimation if this hardcoded value proves too unstable. # http://micans.org/mcl/man/mcl.html#opt-how-much-ram - return 2000; + return 1000; } diff --git a/lib/Bio/PanGenome/PostAnalysis.pm b/lib/Bio/PanGenome/PostAnalysis.pm index 2e2f912..81c8c4d 100644 --- a/lib/Bio/PanGenome/PostAnalysis.pm +++ b/lib/Bio/PanGenome/PostAnalysis.pm @@ -91,6 +91,8 @@ sub run { unlink( $self->clusters_filename); unlink( $self->clusters_filename . '.clstr' ); unlink( $self->clusters_filename . '.bak.clstr' ); + unlink('_gff_files'); + unlink('_fasta_files'); } From d42cad03a86d30a4e9cf73206c58bb7d0d792871 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Tue, 22 Oct 2013 09:45:26 +0100 Subject: [PATCH 2/5] read in newick tree and output order of leafs --- lib/Bio/PanGenome/SampleOrder.pm | 42 ++++++++++++++++++++++++++++++++ t/Bio/PanGenome/SampleOrder.t | 20 +++++++++++++++ t/data/reorder_isolates.tre | 1 + 3 files changed, 63 insertions(+) create mode 100644 lib/Bio/PanGenome/SampleOrder.pm create mode 100644 t/Bio/PanGenome/SampleOrder.t create mode 100644 t/data/reorder_isolates.tre diff --git a/lib/Bio/PanGenome/SampleOrder.pm b/lib/Bio/PanGenome/SampleOrder.pm new file mode 100644 index 0000000..44ba118 --- /dev/null +++ b/lib/Bio/PanGenome/SampleOrder.pm @@ -0,0 +1,42 @@ +package Bio::PanGenome::SampleOrder; + +# ABSTRACT: Take in a tree file and return an ordering of the samples + +=head1 SYNOPSIS + +Take in a tree file and return an ordering of the samples + use Bio::PanGenome::SampleOrder; + + my $obj = Bio::PanGenome::SampleOrder->new( + tree_file => $tree_file, + ); + $obj->ordered_samples(); + +=cut + +use Moose; +use Bio::TreeIO; + +has 'tree_file' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'tree_format' => ( is => 'ro', isa => 'Str', default => 'newick' ); +has 'ordered_samples' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_samples' ); + +sub _build_ordered_samples { + my ($self) = @_; + my $input = Bio::TreeIO->new( + -file => $self->tree_file, + -format => $self->tree_format + ); + my $tree = $input->next_tree; + my @taxa; + for my $leaf_node ( $tree->get_leaf_nodes ) { + push( @taxa, $leaf_node->id ); + } + return \@taxa; +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; + diff --git a/t/Bio/PanGenome/SampleOrder.t b/t/Bio/PanGenome/SampleOrder.t new file mode 100644 index 0000000..29df1b9 --- /dev/null +++ b/t/Bio/PanGenome/SampleOrder.t @@ -0,0 +1,20 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Data::Dumper; +use File::Slurp; + +BEGIN { unshift( @INC, './lib' ) } + +BEGIN { + use Test::Most; + use_ok('Bio::PanGenome::SampleOrder'); +} + +ok(my $obj = Bio::PanGenome::SampleOrder->new( + tree_file => 't/data/reorder_isolates.tre', + ), 'initialise sample order object'); + +is_deeply($obj->ordered_samples(),['query_1', 'query_3','query_4','query_2'],'order of sample names matches the tree'); + +done_testing(); diff --git a/t/data/reorder_isolates.tre b/t/data/reorder_isolates.tre new file mode 100644 index 0000000..0119bd7 --- /dev/null +++ b/t/data/reorder_isolates.tre @@ -0,0 +1 @@ +(query_1:6.0,(query_3:5.0,query_4:3.0):5.0,query_2:11.0); \ No newline at end of file From 46e94e79dcc59e3760ad6f6532a2d18fa67d22e9 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Tue, 22 Oct 2013 14:41:57 +0100 Subject: [PATCH 3/5] reorder columns and output new spreadsheet --- lib/Bio/PanGenome/ReorderSpreadsheet.pm | 166 ++++++++++++++++++++ t/Bio/PanGenome/ReorderSpreadsheet.t | 37 +++++ t/data/reorder_isolates_expected_output.csv | 8 + t/data/reorder_isolates_input.csv | 8 + 4 files changed, 219 insertions(+) create mode 100644 lib/Bio/PanGenome/ReorderSpreadsheet.pm create mode 100644 t/Bio/PanGenome/ReorderSpreadsheet.t create mode 100644 t/data/reorder_isolates_expected_output.csv create mode 100644 t/data/reorder_isolates_input.csv diff --git a/lib/Bio/PanGenome/ReorderSpreadsheet.pm b/lib/Bio/PanGenome/ReorderSpreadsheet.pm new file mode 100644 index 0000000..f2f43a3 --- /dev/null +++ b/lib/Bio/PanGenome/ReorderSpreadsheet.pm @@ -0,0 +1,166 @@ +package Bio::PanGenome::ReorderSpreadsheet; + +# ABSTRACT: Take in a tree file and a spreadsheet and output a spreadsheet with reordered columns + +=head1 SYNOPSIS + +Take in a tree file and a spreadsheet and output a spreadsheet with reordered columns + use Bio::PanGenome::ReorderSpreadsheet; + + my $obj = Bio::PanGenome::ReorderSpreadsheet->new( + tree_file => $tree_file, + spreadsheet => 'groups.csv' + ); + $obj->reorder_spreadsheet(); + +=cut + +use Moose; +use Text::CSV; +use Bio::PanGenome::SampleOrder; +use Bio::PanGenome::GroupStatistics; + +has 'tree_file' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'tree_format' => ( is => 'ro', isa => 'Str', default => 'newick' ); + +has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reordered_groups_stats.csv' ); +has '_sample_order' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__sample_order' ); +has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); +has '_output_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_spreadsheet_fh' ); +has '_column_mappings' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__column_mappings' ); + +has '_column_offset' => ( is => 'ro', isa => 'Int', default => 6 ); + +has '_fixed_headers' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__fixed_headers' ); +has '_num_fixed_headers' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__num_fixed_headers' ); +has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' ); + +sub BUILD { + my ($self) = @_; + # read the headers first + $self->_column_mappings; +} + + +sub reorder_spreadsheet { + my ($self) = @_; + + # make sure the file handle is at the start + seek($self->_input_spreadsheet_fh ,0,0); + while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) + { + $self->_csv_parser->print($self->_output_spreadsheet_fh, $self->_remap_columns($row)); + } + + close($self->_output_spreadsheet_fh); + close($self->_input_spreadsheet_fh); + return 1; +} + +sub _remap_columns +{ + my ($self, $row) = @_; + + my @output_row; + for(my $output_index = 0; $output_index < @{$self->_column_mappings}; $output_index++) + { + my $input_index = $self->_column_mappings->[$output_index]; + push(@output_row, $row->[$input_index]); + } + return \@output_row; +} + +sub _column_mappings_populate_fixed_headers +{ + my ($self, $column_mappings,$header_row) = @_; + my $column_counter = 0; + for($column_counter = 0; $column_counter < $self->_num_fixed_headers; $column_counter++) + { + push(@{$column_mappings}, $column_counter); + shift(@{$header_row}); + } + return $column_counter; +} + +sub _build__column_mappings +{ + my ($self) = @_; + my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); + + my @column_mappings; + my $column_counter = $self->_column_mappings_populate_fixed_headers(\@column_mappings, $header_row); + + # put the input column names into an array where the key is the name and the value is the order + my %input_sample_order; + for(my $i = 0; $i < @{$header_row}; $i++) + { + $input_sample_order{$header_row->[$i]} = $i + $column_counter; + } + + # Go through the order of the samples from the tree and see if the headers exist + for my $sample_name (@{$self->_sample_order}) + { + if(defined($input_sample_order{$sample_name})) + { + push(@column_mappings, $input_sample_order{$sample_name}); + delete($input_sample_order{$sample_name}); + } + $column_counter++; + } + + # Add any columns not in the tree to the end + for my $sample_name (keys %input_sample_order) + { + push(@column_mappings, $input_sample_order{$sample_name}); + delete($input_sample_order{$sample_name}); + $column_counter++; + } + return \@column_mappings; +} + +sub _build__num_fixed_headers +{ + my ($self) = @_; + return @{$self->_fixed_headers}; +} + +sub _build__fixed_headers +{ + my ($self) = @_; + my @fixed_headers = @{Bio::PanGenome::GroupStatistics->fixed_headers()}; + return \@fixed_headers; +} + +sub _build__csv_parser +{ + my ($self) = @_; + return Text::CSV->new( { binary => 1, always_quote => 1, eol => $/} ); +} + +sub _build__input_spreadsheet_fh { + my ($self) = @_; + open( my $fh, $self->spreadsheet ); + return $fh; +} + +sub _build__output_spreadsheet_fh { + my ($self) = @_; + open( my $fh, '>', $self->output_filename ); + return $fh; +} + +sub _build__sample_order { + my ($self) = @_; + my $obj = Bio::PanGenome::SampleOrder->new( + tree_file => $self->tree_file, + tree_format => $self->tree_format + ); + return $obj->ordered_samples(); +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; + diff --git a/t/Bio/PanGenome/ReorderSpreadsheet.t b/t/Bio/PanGenome/ReorderSpreadsheet.t new file mode 100644 index 0000000..67cd898 --- /dev/null +++ b/t/Bio/PanGenome/ReorderSpreadsheet.t @@ -0,0 +1,37 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Data::Dumper; +use File::Slurp; + +BEGIN { unshift( @INC, './lib' ) } + +BEGIN { + use Test::Most; + use_ok('Bio::PanGenome::ReorderSpreadsheet'); +} + +ok( + my $obj = Bio::PanGenome::ReorderSpreadsheet->new( + tree_file => 't/data/reorder_isolates.tre', + spreadsheet => 't/data/reorder_isolates_input.csv', + output_filename => 'reorder_isolates_output.csv' + ), + 'initialise reordering the spreadsheet' +); + +is_deeply($obj->_column_mappings,[0,1,2,3,4,5,6,8,9,7],'Column mappings with fixed in same order and end columns ordered by tree file'); +ok( $obj->reorder_spreadsheet(), 'run the reorder method' ); +ok( -e $obj->output_filename, 'check the output file exists' ); + + + +is( + read_file('t/data/reorder_isolates_expected_output.csv'), + read_file('reorder_isolates_output.csv'), + 'content of the spreadsheet as expected' +); + +unlink('reorder_isolates_output.csv'); + +done_testing(); diff --git a/t/data/reorder_isolates_expected_output.csv b/t/data/reorder_isolates_expected_output.csv new file mode 100644 index 0000000..51a5b1f --- /dev/null +++ b/t/data/reorder_isolates_expected_output.csv @@ -0,0 +1,8 @@ +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_3","query_4","query_2" +"hly","","Alpha-toxin","4","4","1","P","P","P","P" +"argF","","Ornithine carbamoyltransferase","2","2","1","P","P","","" +"group_4","","","2","2","1","","P","","P" +"speH","","hypothetical protein","2","2","1","P","","","P" +"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","P" +"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","P","","" +"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","","" diff --git a/t/data/reorder_isolates_input.csv b/t/data/reorder_isolates_input.csv new file mode 100644 index 0000000..db4212f --- /dev/null +++ b/t/data/reorder_isolates_input.csv @@ -0,0 +1,8 @@ +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_3","query_4" +"hly","","Alpha-toxin","4","4","1","P","P","P","P" +"argF","","Ornithine carbamoyltransferase","2","2","1","P","","P","" +"group_4","","","2","2","1","","P","P","" +"speH","","hypothetical protein","2","2","1","P","P","","" +"group_7","","Gonococcal growth inhibitor III","1","1","1","","P","","" +"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","","P","" +"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","","" From 0a3b2abf0fd739d41e912e83290c0061e8ebc5ec Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Wed, 23 Oct 2013 10:49:29 +0100 Subject: [PATCH 4/5] script to reorder columns in a spreadsheet according to a newick tree --- bin/pan_genome_reorder_spreadsheet | 19 +++++ .../PanGenomeReorderSpreadsheet.pm | 81 +++++++++++++++++++ lib/Bio/PanGenome/GroupStatistics.pm | 9 ++- lib/Bio/PanGenome/ReorderSpreadsheet.pm | 11 +-- .../CommandLine/PanGenomeReorderSpreadsheet.t | 28 +++++++ 5 files changed, 140 insertions(+), 8 deletions(-) create mode 100755 bin/pan_genome_reorder_spreadsheet create mode 100644 lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm create mode 100644 t/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.t diff --git a/bin/pan_genome_reorder_spreadsheet b/bin/pan_genome_reorder_spreadsheet new file mode 100755 index 0000000..b0842f5 --- /dev/null +++ b/bin/pan_genome_reorder_spreadsheet @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +package Bio::PanGenome::Main::PanGenomeReorderSpreadsheet; + +# ABSTRACT: Take in a tree and a spreadsheet and output a reordered spreadsheet +# PODNAME: pan_genome_reorder_spreadsheet + +=head1 SYNOPSIS + +Take in a tree and a spreadsheet and output a reordered spreadsheet + +=cut + +BEGIN { unshift( @INC, '../lib' ) } +BEGIN { unshift( @INC, './lib' ) } +BEGIN { unshift( @INC, '/software/pathogen/internal/prod/lib/' ) } +use Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet; + +Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet->new(args => \@ARGV, script_name => $0)->run; diff --git a/lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm b/lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm new file mode 100644 index 0000000..60edab4 --- /dev/null +++ b/lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm @@ -0,0 +1,81 @@ +package Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet; + +# ABSTRACT: Take in a tree and a spreadsheet and output a reordered spreadsheet + +=head1 SYNOPSIS + +Take in a tree and a spreadsheet and output a reordered spreadsheet + +=cut + +use Moose; +use Getopt::Long qw(GetOptionsFromArray); +use Bio::PanGenome::ReorderSpreadsheet; + +has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); +has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'help' => ( is => 'rw', isa => 'Bool', default => 0 ); + +has 'tree_file' => ( is => 'rw', isa => 'Str' ); +has 'spreadsheet_filename' => ( is => 'rw', isa => 'Str' ); +has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'reordered_spreadsheet.csv' ); +has 'tree_format' => ( is => 'rw', isa => 'Str', default => 'newick' ); + +sub BUILD { + my ($self) = @_; + + my ( $output_filename, $tree_file, $tree_format, $spreadsheet_filename, $help ); + + GetOptionsFromArray( + $self->args, + 'o|output_filename=s' => \$output_filename, + 't|tree_file=s' => \$tree_file, + 'f|tree_format=s' => \$tree_format, + 's|spreadsheet_filename=s' => \$spreadsheet_filename, + 'h|help' => \$help, + ); + + $self->output_filename($output_filename) if ( defined($output_filename) ); + $self->tree_file($tree_file) if ( defined($tree_file) ); + $self->tree_format($tree_format) if ( defined($tree_format) ); + $self->spreadsheet_filename($spreadsheet_filename) if ( defined($spreadsheet_filename) ); + +} + +sub run { + my ($self) = @_; + + ( ( -e $self->spreadsheet_filename ) && ( -e $self->tree_file ) && ( !$self->help ) ) or die $self->usage_text; + + my $obj = Bio::PanGenome::ReorderSpreadsheet->new( + tree_file => $self->tree_file, + spreadsheet => $self->spreadsheet_filename, + output_filename => $self->output_filename + ); + $obj->reorder_spreadsheet(); + +} + +sub usage_text { + my ($self) = @_; + + return <meta->make_immutable; +no Moose; +1; diff --git a/lib/Bio/PanGenome/GroupStatistics.pm b/lib/Bio/PanGenome/GroupStatistics.pm index f3a87a1..5a86d20 100644 --- a/lib/Bio/PanGenome/GroupStatistics.pm +++ b/lib/Bio/PanGenome/GroupStatistics.pm @@ -45,9 +45,16 @@ sub _build__text_csv_obj { return Text::CSV->new( { binary => 1, always_quote => 1, eol => "\r\n" } ); } +sub fixed_headers +{ + my ($self) = @_; + my @header = ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate'); + return \@header; +} + sub _header { my ($self) = @_; - my @header = ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate'); + my @header = @{$self->fixed_headers}; for my $filename (@{$self->_sorted_file_names}) { diff --git a/lib/Bio/PanGenome/ReorderSpreadsheet.pm b/lib/Bio/PanGenome/ReorderSpreadsheet.pm index f2f43a3..83e5792 100644 --- a/lib/Bio/PanGenome/ReorderSpreadsheet.pm +++ b/lib/Bio/PanGenome/ReorderSpreadsheet.pm @@ -23,18 +23,15 @@ use Bio::PanGenome::GroupStatistics; has 'tree_file' => ( is => 'ro', isa => 'Str', required => 1 ); has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); has 'tree_format' => ( is => 'ro', isa => 'Str', default => 'newick' ); - has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reordered_groups_stats.csv' ); + has '_sample_order' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__sample_order' ); has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); has '_output_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_spreadsheet_fh' ); has '_column_mappings' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__column_mappings' ); - -has '_column_offset' => ( is => 'ro', isa => 'Int', default => 6 ); - -has '_fixed_headers' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__fixed_headers' ); -has '_num_fixed_headers' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__num_fixed_headers' ); -has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' ); +has '_fixed_headers' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__fixed_headers' ); +has '_num_fixed_headers' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__num_fixed_headers' ); +has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' ); sub BUILD { my ($self) = @_; diff --git a/t/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.t b/t/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.t new file mode 100644 index 0000000..38f7e30 --- /dev/null +++ b/t/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.t @@ -0,0 +1,28 @@ +#!/usr/bin/env perl +use Moose; +use Data::Dumper; +use File::Slurp; +use Cwd; + +BEGIN { unshift( @INC, './lib' ) } +BEGIN { unshift( @INC, './t/lib' ) } +with 'TestHelper'; + +BEGIN { + use Test::Most; + use_ok('Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet'); +} +my $script_name = 'Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet'; + +my %scripts_and_expected_files = ( + '-t t/data/reorder_isolates.tre -s t/data/reorder_isolates_input.csv' => + [ 'reordered_spreadsheet.csv', 't/data/reorder_isolates_expected_output.csv' ], + '-t t/data/reorder_isolates.tre -s t/data/reorder_isolates_input.csv -o different_output_name.csv' => + [ 'different_output_name.csv', 't/data/reorder_isolates_expected_output.csv' ], + '-t t/data/reorder_isolates.tre -s t/data/reorder_isolates_input.csv -f newick' => + [ 'reordered_spreadsheet.csv', 't/data/reorder_isolates_expected_output.csv' ], +); + +mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files ); + +done_testing(); From afa562921bc566ae8af0e0de5ab61c9b24ef5cb5 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Wed, 23 Oct 2013 11:30:31 +0100 Subject: [PATCH 5/5] input should guess eol char, output should have rn --- .../PanGenomeReorderSpreadsheet.pm | 2 +- lib/Bio/PanGenome/ReorderSpreadsheet.pm | 11 +- t/Bio/PanGenome/SampleOrder.t | 145 ++++++++++++++++++ t/data/raxml.tre | 1 + t/data/reorder_isolates_expected_output.csv | 16 +- 5 files changed, 164 insertions(+), 11 deletions(-) create mode 100644 t/data/raxml.tre diff --git a/lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm b/lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm index 60edab4..2ebde6b 100644 --- a/lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm +++ b/lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm @@ -45,7 +45,7 @@ sub BUILD { sub run { my ($self) = @_; - ( ( -e $self->spreadsheet_filename ) && ( -e $self->tree_file ) && ( !$self->help ) ) or die $self->usage_text; + ( defined($self->spreadsheet_filename) && defined($self->tree_file) && ( -e $self->spreadsheet_filename ) && ( -e $self->tree_file ) && ( !$self->help ) ) or die $self->usage_text; my $obj = Bio::PanGenome::ReorderSpreadsheet->new( tree_file => $self->tree_file, diff --git a/lib/Bio/PanGenome/ReorderSpreadsheet.pm b/lib/Bio/PanGenome/ReorderSpreadsheet.pm index 83e5792..58b1006 100644 --- a/lib/Bio/PanGenome/ReorderSpreadsheet.pm +++ b/lib/Bio/PanGenome/ReorderSpreadsheet.pm @@ -32,6 +32,7 @@ has '_column_mappings' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, has '_fixed_headers' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__fixed_headers' ); has '_num_fixed_headers' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__num_fixed_headers' ); has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' ); +has '_csv_output' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_output' ); sub BUILD { my ($self) = @_; @@ -47,7 +48,7 @@ sub reorder_spreadsheet { seek($self->_input_spreadsheet_fh ,0,0); while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) { - $self->_csv_parser->print($self->_output_spreadsheet_fh, $self->_remap_columns($row)); + $self->_csv_output->print($self->_output_spreadsheet_fh, $self->_remap_columns($row)); } close($self->_output_spreadsheet_fh); @@ -132,7 +133,13 @@ sub _build__fixed_headers sub _build__csv_parser { my ($self) = @_; - return Text::CSV->new( { binary => 1, always_quote => 1, eol => $/} ); + return Text::CSV->new( { binary => 1, always_quote => 1} ); +} + +sub _build__csv_output +{ + my ($self) = @_; + return Text::CSV->new( { binary => 1, always_quote => 1, eol => "\r\n"} ); } sub _build__input_spreadsheet_fh { diff --git a/t/Bio/PanGenome/SampleOrder.t b/t/Bio/PanGenome/SampleOrder.t index 29df1b9..92404b1 100644 --- a/t/Bio/PanGenome/SampleOrder.t +++ b/t/Bio/PanGenome/SampleOrder.t @@ -17,4 +17,149 @@ ok(my $obj = Bio::PanGenome::SampleOrder->new( is_deeply($obj->ordered_samples(),['query_1', 'query_3','query_4','query_2'],'order of sample names matches the tree'); + +ok( $obj = Bio::PanGenome::SampleOrder->new( + tree_file => 't/data/raxml.tre', + ), 'initialise sample order object with raxml tree'); + +is_deeply($obj->ordered_samples(),[ + 'efgh_7#3', + 'abcd_4#15', + 'abcd_3#9', + 'abcd_4#17', + 'abcd_3#20', + 'abcd_3#96', + 'abcd_3#7', + '6753_5#30', + 'abcd_3#8', + 'abcd_3#12', + 'abcd_4#13', + 'abcd_3#4', + 'abcd_3#47', + 'abcd_4#36', + 'abcd_3#90', + 'abcd_3#45', + 'abcd_3#51', + 'abcd_3#76', + 'abcd_4#22', + 'abcd_3#94', + 'abcd_3#72', + 'abcd_3#18', + 'abcd_3#82', + 'abcd_3#88', + 'abcd_3#87', + 'abcd_3#58', + 'abcd_3#85', + 'abcd_4#24', + 'abcd_3#86', + 'abcd_4#38', + 'abcd_3#70', + 'abcd_3#89', + 'abcd_3#19', + 'abcd_3#84', + 'abcd_3#60', + 'abcd_4#21', + 'abcd_3#35', + 'abcd_3#32', + 'abcd_4#20', + 'abcd_3#11', + 'abcd_4#28', + 'abcd_4#27', + 'abcd_3#54', + 'abcd_3#53', + 'abcd_3#43', + 'abcd_3#50', + 'abcd_4#12', + 'abcd_3#15', + 'abcd_3#21', + 'abcd_3#91', + 'abcd_3#73', + 'abcd_3#61', + 'abcd_4#35', + 'abcd_3#17', + 'abcd_3#67', + 'abcd_3#27', + 'abcd_3#13', + 'abcd_3#24', + 'abcd_3#95', + 'abcd_3#23', + 'abcd_3#29', + 'abcd_3#75', + 'abcd_3#25', + 'abcd_4#16', + 'abcd_4#30', + 'abcd_4#26', + 'abcd_3#36', + 'abcd_4#25', + 'abcd_3#64', + 'abcd_3#44', + 'abcd_3#68', + 'abcd_3#69', + 'efgh_7#12', + 'abcd_3#55', + 'abcd_4#1', + 'abcd_3#56', + 'abcd_3#14', + 'abcd_4#7', + 'abcd_4#8', + 'abcd_3#26', + 'abcd_4#9', + 'abcd_4#40', + 'abcd_4#10', + 'abcd_4#6', + 'abcd_4#5', + 'abcd_3#3', + 'abcd_3#33', + 'abcd_3#28', + 'abcd_3#6', + 'abcd_3#16', + 'abcd_3#79', + 'abcd_3#77', + 'abcd_4#41', + 'abcd_4#34', + 'abcd_3#5', + 'abcd_3#74', + 'abcd_3#34', + 'abcd_3#2', + 'abcd_3#22', + 'abcd_4#32', + 'abcd_3#92', + 'abcd_4#19', + 'abcd_4#23', + 'abcd_4#18', + 'abcd_3#37', + 'abcd_3#59', + 'abcd_3#30', + 'abcd_3#1', + 'abcd_4#42', + 'abcd_3#10', + 'abcd_4#37', + 'abcd_3#81', + 'abcd_3#80', + 'abcd_3#83', + 'abcd_4#33', + 'abcd_4#31', + 'abcd_3#71', + 'abcd_3#78', + 'abcd_4#39', + 'abcd_3#41', + 'abcd_4#29', + 'abcd_4#14', + 'abcd_3#31', + 'abcd_3#93', + 'abcd_3#62', + 'abcd_3#48', + 'abcd_3#42', + 'abcd_3#52', + 'abcd_3#57', + 'abcd_3#49', + 'abcd_4#3', + 'abcd_4#2', + 'abcd_3#39', + 'abcd_3#38', + 'abcd_4#11', + '5749_2#1' + ],'order of sample names matches the raxml tree'); + + done_testing(); diff --git a/t/data/raxml.tre b/t/data/raxml.tre new file mode 100644 index 0000000..ba35f18 --- /dev/null +++ b/t/data/raxml.tre @@ -0,0 +1 @@ +((efgh_7#3:0.02316815548247504186,(((((abcd_4#15:0.00000132226186997362,((abcd_3#9:0.00077142536151366802,abcd_4#17:0.00231509323793084891)66:0.00000132226186997362,abcd_3#20:0.00000132226186997362)69:0.00000132226186997362)100:0.00465923394294907780,abcd_3#96:0.00855943508912884905)97:0.00305706239572176106,abcd_3#7:0.00702776231555758951)100:0.01189241259037208259,((6753_5#30:0.00077274606169234225,((abcd_3#8:0.00232303086183083740,abcd_3#12:0.00000132226186997362)15:0.00000132226186997362,abcd_4#13:0.00077458209519013735)34:0.00000132226186997362)100:0.00543005878845978333,abcd_3#4:0.00627124124511847011)100:0.02250612800623742402)94:0.00229388045702955557,(abcd_3#47:0.01341623695325115141,((abcd_4#36:0.00000132226186997362,abcd_3#90:0.00000132226186997362)100:0.01406707131121531958,(((abcd_3#45:0.00000132226186997362,abcd_3#51:0.00645148529478506948)88:0.00213347922463543761,((abcd_3#76:0.00623728515995487526,(abcd_4#22:0.00627252597618067984,((abcd_3#94:0.00000132226186997362,abcd_3#72:0.00000132226186997362)98:0.00310391698506840095,abcd_3#18:0.00000132226186997362)100:0.00866004226368127794)83:0.00074194767764187573)62:0.00078073745490957029,((abcd_3#82:0.00622652688081153961,(abcd_3#88:0.00000132226186997362,abcd_3#87:0.00000132226186997362)100:0.00857673644197349747)57:0.00072923797818549103,(abcd_3#58:0.00000132226186997362,(abcd_3#85:0.00000132226186997362,(abcd_4#24:0.00076942622569081123,abcd_3#86:0.00154436764242521811)67:0.00000132226186997362)95:0.00153994529775838417)100:0.00622988517781090934)99:0.00391098343731804791)56:0.00000132226186997362)84:0.00155245684874360406,(abcd_4#38:0.00000132226186997362,(((abcd_3#70:0.00000132226186997362,abcd_3#89:0.00077092094232594885)67:0.00077141909097768792,abcd_3#19:0.00000132226186997362)64:0.00077185290304121294,abcd_3#84:0.00077268682225444766)46:0.00000132226186997362)100:0.01884135086335945314)84:0.00000132226186997362)98:0.00231665969313109722)100:0.00869061762598819128)100:0.00712310831326986617)100:0.00000132226186997362,(((((((((abcd_3#60:0.00000132226186997362,(abcd_4#21:0.00000132226186997362,abcd_3#35:0.00000132226186997362)63:0.00077435429455191866)100:0.00067951772289146634,abcd_3#32:0.00631217588430617402)100:0.02994628172895110949,(abcd_4#20:0.00156522339577581807,(abcd_3#11:0.00000132226186997362,(abcd_4#28:0.00000132226186997362,abcd_4#27:0.00000132226186997362)99:0.00233669665377406015)91:0.00077239513539845042)100:0.01606095283607060151)100:0.00462461859626085801,(((abcd_3#54:0.01180235074075351893,abcd_3#53:0.00857612881135592746)67:0.00089112701883803304,abcd_3#43:0.00393186422217672599)100:0.00538409429284307343,(abcd_3#50:0.00622392842077187317,((abcd_4#12:0.00076758051025639250,(abcd_3#15:0.00000132226186997362,abcd_3#21:0.00385987594678770642)83:0.00000132226186997362)100:0.01960407345618833327,((abcd_3#91:0.00000132226186997362,abcd_3#73:0.00000132226186997362)100:0.00620531368446218968,(((abcd_3#61:0.00388805278349425333,(abcd_4#35:0.00778153754178156389,abcd_3#17:0.01089247524165314410)65:0.00000132226186997362)64:0.00077356760974459440,(abcd_3#67:0.00312895358465437121,(abcd_3#27:0.00000132226186997362,(abcd_3#13:0.00077547546351081618,((((abcd_3#24:0.00000132226186997362,abcd_3#95:0.00000132226186997362)50:0.00000132226186997362,(abcd_3#23:0.00000132226186997362,(abcd_3#29:0.00000132226186997362,(abcd_3#75:0.00000132226186997362,abcd_3#25:0.00000132226186997362)8:0.00000132226186997362)4:0.00000132226186997362)3:0.00000132226186997362)14:0.00000132226186997362,abcd_4#16:0.00077326015058246165)18:0.00000132226186997362,(abcd_4#30:0.00000132226186997362,(abcd_4#26:0.00000132226186997362,(abcd_3#36:0.00000132226186997362,abcd_4#25:0.00000132226186997362)32:0.00000132226186997362)29:0.00000132226186997362)83:0.00154603891415671085)42:0.00000132226186997362)65:0.00077376150782699756)100:0.00702342907247293334)64:0.00075456473366204040)12:0.00000132226186997362,((abcd_3#64:0.00000132226186997362,abcd_3#44:0.00000132226186997362)65:0.00077365659659393512,(abcd_3#68:0.00000132226186997362,abcd_3#69:0.00000132226186997362)100:0.00932174039768971646)15:0.00000132226186997362)23:0.00000132226186997362)56:0.00077344029318351904)61:0.00000132226186997362)100:0.00465345295130407000)100:0.01111214870825547735)99:0.00374408351410862681,(efgh_7#12:0.01833223611065578143,(abcd_3#55:0.02324957283752214152,((abcd_4#1:0.00469566203917269956,(abcd_3#56:0.00546936779545076270,abcd_3#14:0.00622833716792515427)100:0.00151084301588890889)100:0.00789157197361813056,(abcd_4#7:0.00233293316007452626,(abcd_4#8:0.00077311598970405320,((abcd_3#26:0.00077550158683622301,abcd_4#9:0.00000132226186997362)39:0.00000132226186997362,(abcd_4#40:0.00231679829332564514,abcd_4#10:0.00000132226186997362)89:0.00154601555763803750)40:0.00000132226186997362)100:0.00780916089311263408)97:0.00307287638606677186)100:0.00953794433170505190)100:0.00869196764035643291)100:0.00977872108322918980)80:0.00156873804149701489,(abcd_4#6:0.00000132226186997362,abcd_4#5:0.00000132226186997362)100:0.02610847709952213158)100:0.00115877341875185490,((abcd_3#3:0.01670319394504033925,(abcd_3#33:0.00000132226186997362,abcd_3#28:0.00000132226186997362)100:0.02055825834093799945)100:0.00631193742166692457,((abcd_3#6:0.00000132226186997362,abcd_3#16:0.00000132226186997362)100:0.01660913514998323473,(((abcd_3#79:0.00000132226186997362,abcd_3#77:0.00000132226186997362)100:0.00854956715195766097,((abcd_4#41:0.00154085601563807195,(abcd_4#34:0.00000132226186997362,((abcd_3#5:0.00000132226186997362,(((abcd_3#74:0.00076882224000264889,abcd_3#34:0.00154339011642515137)3:0.00000132226186997362,(abcd_3#2:0.00076882393169920289,abcd_3#22:0.00076882393169920289)2:0.00000132226186997362)0:0.00000132226186997362,(((abcd_4#32:0.00000132226186997362,abcd_3#92:0.00076869510795057395)60:0.00076875756629854659,abcd_4#19:0.00076905889272679622)6:0.00000132226186997362,(abcd_4#23:0.00153888458435903590,(abcd_4#18:0.00230933031152180277,abcd_3#37:0.00154338804217775882)9:0.00000132226186997362)3:0.00000132226186997362)0:0.00000132226186997362)1:0.00000132226186997362)3:0.00000132226186997362,abcd_3#59:0.00000132226186997362)2:0.00000132226186997362)54:0.00076857718755298703)65:0.00077555059848881977,(abcd_3#30:0.00000132226186997362,abcd_3#1:0.00000132226186997362)99:0.00153650731415773379)100:0.00621766977560912586)89:0.00151241128294433351,((abcd_4#42:0.00000132226186997362,abcd_3#10:0.00000132226186997362)91:0.00154222559231584149,((abcd_4#37:0.00000132226186997362,((abcd_3#81:0.00000132226186997362,(abcd_3#80:0.00000132226186997362,abcd_3#83:0.00000132226186997362)62:0.00000132226186997362)70:0.00077071378050518611,(abcd_4#33:0.00154280535353011465,(abcd_4#31:0.00076943370157726051,((abcd_3#71:0.00000132226186997362,abcd_3#78:0.00000132226186997362)49:0.00000132226186997362,((abcd_4#39:0.00076989789528285400,(abcd_3#41:0.00000132226186997362,(abcd_4#29:0.00000132226186997362,abcd_4#14:0.00000132226186997362)81:0.00000132226186997362)78:0.00076966766619406121)82:0.00154202631463747943,abcd_3#31:0.00153982188992232857)52:0.00000132226186997362)46:0.00000132226186997362)87:0.00154081390428061960)46:0.00000132226186997362)15:0.00000132226186997362)20:0.00000132226186997362,abcd_3#93:0.00154841596303632542)70:0.00000132226186997362)95:0.00234308767561589113)100:0.01268000193507385319)70:0.00000132226186997362)100:0.01948160300069467707)100:0.03080643698088052218,abcd_3#62:0.05958621335563133586)100:0.00000132226186997362,(abcd_3#48:0.01495824524344147380,(abcd_3#42:0.00466602633570538643,(abcd_3#52:0.00390697663604849376,(abcd_3#57:0.00701624022983718919,(abcd_3#49:0.00000132226186997362,(abcd_4#3:0.00077677169701956478,(abcd_4#2:0.00000132226186997362,(abcd_3#39:0.00000132226186997362,(abcd_3#38:0.00077936967227037839,abcd_4#11:0.00077652476200712472)44:0.00000132226186997362)98:0.00233380194325215166)82:0.00155750275978298273)87:0.00077833329157559123)100:0.00389755574822029088)100:0.00000132226186997362)100:0.00552477386573709244)100:0.00983188219808936101)100:0.13376681553139949110)100:0.03578419742883675453,5749_2#1:0.02465538922215411086); diff --git a/t/data/reorder_isolates_expected_output.csv b/t/data/reorder_isolates_expected_output.csv index 51a5b1f..bccdd62 100644 --- a/t/data/reorder_isolates_expected_output.csv +++ b/t/data/reorder_isolates_expected_output.csv @@ -1,8 +1,8 @@ -"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_3","query_4","query_2" -"hly","","Alpha-toxin","4","4","1","P","P","P","P" -"argF","","Ornithine carbamoyltransferase","2","2","1","P","P","","" -"group_4","","","2","2","1","","P","","P" -"speH","","hypothetical protein","2","2","1","P","","","P" -"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","P" -"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","P","","" -"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","","" +"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_3","query_4","query_2" +"hly","","Alpha-toxin","4","4","1","P","P","P","P" +"argF","","Ornithine carbamoyltransferase","2","2","1","P","P","","" +"group_4","","","2","2","1","","P","","P" +"speH","","hypothetical protein","2","2","1","P","","","P" +"group_7","","Gonococcal growth inhibitor III","1","1","1","","","","P" +"yfnB","","Putative HAD-hydrolase yfnB","1","1","1","","P","","" +"group_6","","Gonococcal growth inhibitor III","1","1","1","P","","",""