Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reorder spreadsheet #40

Merged
merged 5 commits into from
Oct 23, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions bin/pan_genome_reorder_spreadsheet
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env perl

package Bio::PanGenome::Main::PanGenomeReorderSpreadsheet;

# ABSTRACT: Take in a tree and a spreadsheet and output a reordered spreadsheet
# PODNAME: pan_genome_reorder_spreadsheet

=head1 SYNOPSIS

Take in a tree and a spreadsheet and output a reordered spreadsheet

=cut

# Prepend the library search paths in a single step. The resulting search
# order is: the production install, then ./lib, then ../lib, followed by
# whatever @INC already contained.
BEGIN {
    unshift( @INC, '/software/pathogen/internal/prod/lib/', './lib', '../lib' );
}
use Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet;

# Hand the raw command-line arguments to the command object and execute it.
my $command = Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet->new(
    args        => \@ARGV,
    script_name => $0,
);
$command->run;
81 changes: 81 additions & 0 deletions lib/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet;

# ABSTRACT: Take in a tree and a spreadsheet and output a reordered spreadsheet

=head1 SYNOPSIS

Take in a tree and a spreadsheet and output a reordered spreadsheet

=cut

use Moose;
use Getopt::Long qw(GetOptionsFromArray);
use Bio::PanGenome::ReorderSpreadsheet;

# Raw command-line arguments (parsed by BUILD) and the name of the invoking script.
has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
# When true, run() dies with the usage text instead of doing any work.
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

# Option values. BUILD overwrites each one when the matching switch is given
# (-t, -s, -o, -f); otherwise the defaults declared here apply.
has 'tree_file' => ( is => 'rw', isa => 'Str' );
has 'spreadsheet_filename' => ( is => 'rw', isa => 'Str' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'reordered_spreadsheet.csv' );
has 'tree_format' => ( is => 'rw', isa => 'Str', default => 'newick' );

# Moose construction hook: parse the command-line arguments held in `args`
# and copy every option that was actually supplied onto the matching
# attribute. Options that were not supplied keep their attribute defaults.
#
# Recognised switches:
#   -o | --output_filename       path of the reordered spreadsheet
#   -t | --tree_file             tree file that defines the sample order
#   -f | --tree_format           format of the tree file
#   -s | --spreadsheet_filename  spreadsheet to reorder
#   -h | --help                  request the usage text
sub BUILD {
    my ($self) = @_;

    my ( $output_filename, $tree_file, $tree_format, $spreadsheet_filename, $help );

    GetOptionsFromArray(
        $self->args,
        'o|output_filename=s'      => \$output_filename,
        't|tree_file=s'            => \$tree_file,
        'f|tree_format=s'          => \$tree_format,
        's|spreadsheet_filename=s' => \$spreadsheet_filename,
        'h|help'                   => \$help,
    );

    $self->output_filename($output_filename)           if defined $output_filename;
    $self->tree_file($tree_file)                       if defined $tree_file;
    $self->tree_format($tree_format)                   if defined $tree_format;
    $self->spreadsheet_filename($spreadsheet_filename) if defined $spreadsheet_filename;

    # The help flag used to be parsed and then thrown away, so "-h" was
    # ignored whenever valid -t/-s arguments were also present.
    $self->help($help) if defined $help;
}

# Validate the parsed options and reorder the spreadsheet.
#
# Dies with the usage text when help was requested, when the tree file or
# spreadsheet was not supplied, or when either file does not exist.
# Otherwise builds a Bio::PanGenome::ReorderSpreadsheet from the options and
# returns the result of its reorder_spreadsheet() call.
sub run {
    my ($self) = @_;

    my $inputs_usable =
         defined( $self->spreadsheet_filename )
      && defined( $self->tree_file )
      && ( -e $self->spreadsheet_filename )
      && ( -e $self->tree_file )
      && ( !$self->help );
    $inputs_usable or die $self->usage_text;

    my $obj = Bio::PanGenome::ReorderSpreadsheet->new(
        tree_file       => $self->tree_file,
        # Forward the requested tree format; it used to be dropped here, so
        # the -f switch had no effect and the tree was always read as newick.
        tree_format     => $self->tree_format,
        spreadsheet     => $self->spreadsheet_filename,
        output_filename => $self->output_filename,
    );
    return $obj->reorder_spreadsheet();
}

# Return the help message for the command-line tool: a short description
# followed by example invocations.
sub usage_text {
    my ($self) = @_;

    my $usage = <<'USAGE';
Usage: pan_genome_reorder_spreadsheet [options]
Take in a tree and a spreadsheet from the pan genome pipeline and output a spreadsheet with the columns ordered by the tree.
By default it expects the tree to be in newick format.

# Reorder the spreadsheet columns to match the order of the samples in the tree
pan_genome_reorder_spreadsheet -t my_tree.tre -s my_spreadsheet.csv

# Specify an output filename
pan_genome_reorder_spreadsheet -t my_tree.tre -s my_spreadsheet.csv -o output_spreadsheet.csv

# This help message
pan_genome_reorder_spreadsheet -h

USAGE
    return $usage;
}

__PACKAGE__->meta->make_immutable;
no Moose;
1;
2 changes: 1 addition & 1 deletion lib/Bio/PanGenome/External/Mcl.pm
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ sub _build__memory_required_in_mb
my ($self) = @_;
# Todo: implement this equation for memory estimation if this hardcoded value proves too unstable.
# http://micans.org/mcl/man/mcl.html#opt-how-much-ram
return 2000;
return 1000;
}


Expand Down
9 changes: 8 additions & 1 deletion lib/Bio/PanGenome/GroupStatistics.pm
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,16 @@ sub _build__text_csv_obj {
return Text::CSV->new( { binary => 1, always_quote => 1, eol => "\r\n" } );
}

# Return a reference to a fresh array holding the names of the leading,
# non-sample spreadsheet columns, in output order.
sub fixed_headers {
    my ($self) = @_;
    return [
        'Gene',
        'Non-unique Gene name',
        'Annotation',
        'No. isolates',
        'No. sequences',
        'Avg sequences per isolate',
    ];
}

sub _header {
my ($self) = @_;
my @header = ( 'Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate');
my @header = @{$self->fixed_headers};

for my $filename (@{$self->_sorted_file_names})
{
Expand Down
2 changes: 2 additions & 0 deletions lib/Bio/PanGenome/PostAnalysis.pm
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ sub run {
unlink( $self->clusters_filename);
unlink( $self->clusters_filename . '.clstr' );
unlink( $self->clusters_filename . '.bak.clstr' );
unlink('_gff_files');
unlink('_fasta_files');

}

Expand Down
170 changes: 170 additions & 0 deletions lib/Bio/PanGenome/ReorderSpreadsheet.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package Bio::PanGenome::ReorderSpreadsheet;

# ABSTRACT: Take in a tree file and a spreadsheet and output a spreadsheet with reordered columns

=head1 SYNOPSIS

Take in a tree file and a spreadsheet and output a spreadsheet with reordered columns
use Bio::PanGenome::ReorderSpreadsheet;

my $obj = Bio::PanGenome::ReorderSpreadsheet->new(
tree_file => $tree_file,
spreadsheet => 'groups.csv'
);
$obj->reorder_spreadsheet();

=cut

use Moose;
use Text::CSV;
use Bio::PanGenome::SampleOrder;
use Bio::PanGenome::GroupStatistics;

# Public inputs: the tree file, the spreadsheet to reorder, the tree format
# (default 'newick') and the output path (default 'reordered_groups_stats.csv').
has 'tree_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 );
has 'tree_format' => ( is => 'ro', isa => 'Str', default => 'newick' );
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reordered_groups_stats.csv' );

# Private state, built lazily on first access by the builder named on each line:
# sample order taken from the tree, input/output file handles, the
# output-position -> input-column index map, the fixed leading headers and
# their count, and the CSV reader and writer objects.
has '_sample_order' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__sample_order' );
has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' );
has '_output_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_spreadsheet_fh' );
has '_column_mappings' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__column_mappings' );
has '_fixed_headers' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__fixed_headers' );
has '_num_fixed_headers' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__num_fixed_headers' );
has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' );
has '_csv_output' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_output' );

# Moose construction hook. Touching the lazy column-mapping attribute here
# makes its builder consume the spreadsheet's header row straight away,
# before any data row is processed.
sub BUILD {
    my $self = shift;
    $self->_column_mappings;
}


# Copy every row of the input spreadsheet, header row included, to the
# output spreadsheet with its columns rearranged by _remap_columns().
#
# Returns 1 on success. Dies if the input cannot be rewound, if a row
# cannot be parsed or written, or if the output handle fails to close.
sub reorder_spreadsheet {
    my ($self) = @_;

    my $input_fh  = $self->_input_spreadsheet_fh;
    my $output_fh = $self->_output_spreadsheet_fh;
    my $parser    = $self->_csv_parser;
    my $writer    = $self->_csv_output;

    # The header row may already have been consumed when the column mapping
    # was built, so start again from the top of the file.
    seek( $input_fh, 0, 0 )
      or die "Couldn't rewind spreadsheet '" . $self->spreadsheet . "': $!";

    while ( my $row = $parser->getline($input_fh) ) {
        $writer->print( $output_fh, $self->_remap_columns($row) )
          or die "Couldn't write to '" . $self->output_filename . "': $!";
    }

    # getline() returns undef both at end of file and on a malformed row;
    # only the former is a clean finish. Without this check a parse error
    # silently truncated the output.
    $parser->eof
      or die "Couldn't parse spreadsheet '" . $self->spreadsheet . "': " . $parser->error_diag;

    # Buffered write errors only surface when the handle is closed.
    close($output_fh)
      or die "Couldn't close '" . $self->output_filename . "': $!";
    close($input_fh);
    return 1;
}

# Build the output version of one parsed row: the value at each output
# position is taken from the input column named by the column mapping.
# Returns a reference to the new row.
sub _remap_columns {
    my ( $self, $row ) = @_;

    my @reordered = map { $row->[$_] } @{ $self->_column_mappings };
    return \@reordered;
}

# Map the fixed leading columns onto themselves (index i -> i) and strip
# their names from the front of the header row, so that only the remaining
# column names are left in it. Both array references are modified in place.
# Returns the number of fixed columns.
sub _column_mappings_populate_fixed_headers {
    my ( $self, $column_mappings, $header_row ) = @_;

    my $fixed_count = $self->_num_fixed_headers;
    push( @{$column_mappings}, 0 .. $fixed_count - 1 );
    splice( @{$header_row}, 0, $fixed_count );
    return $fixed_count;
}

# Builder for _column_mappings. Reads the header row of the input
# spreadsheet and returns an array reference in which element N is the
# input column index that supplies output column N.
#
# Output order: the fixed leading columns first, then the columns whose
# header matches a sample from the tree (in tree order), then every
# remaining column in its original left-to-right order.
#
# Dies if no header row can be read from the spreadsheet.
sub _build__column_mappings {
    my ($self) = @_;

    my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
    defined $header_row
      or die "Couldn't read a header row from spreadsheet '" . $self->spreadsheet . "'";

    my @column_mappings;

    # After this call $header_row holds only the non-fixed column names.
    my $num_fixed =
      $self->_column_mappings_populate_fixed_headers( \@column_mappings, $header_row );

    # Column name -> index of that column in the input spreadsheet.
    my %input_index_for;
    for my $offset ( 0 .. $#{$header_row} ) {
        $input_index_for{ $header_row->[$offset] } = $offset + $num_fixed;
    }

    # Emit the columns named in the tree, in tree order. Deleting each one
    # as it is used keeps a repeated tree sample from being emitted twice.
    for my $sample_name ( @{ $self->_sample_order } ) {
        my $input_index = delete $input_index_for{$sample_name};
        push( @column_mappings, $input_index ) if defined $input_index;
    }

    # Columns not in the tree go at the end. Sort by input position: hash
    # iteration order is randomized per process, which used to make the
    # output column order differ from run to run.
    push( @column_mappings, sort { $a <=> $b } values %input_index_for );

    return \@column_mappings;
}

# Builder for _num_fixed_headers: the number of fixed leading columns.
# The count is forced with scalar() so it does not depend on the calling
# context; returning the bare array gives the header names themselves
# whenever the builder is called in list context.
sub _build__num_fixed_headers {
    my ($self) = @_;
    return scalar @{ $self->_fixed_headers };
}

# Builder for _fixed_headers: a private copy of the header list returned by
# Bio::PanGenome::GroupStatistics->fixed_headers().
sub _build__fixed_headers {
    my ($self) = @_;
    return [ @{ Bio::PanGenome::GroupStatistics->fixed_headers() } ];
}

# Builder for _csv_parser: the Text::CSV object used to read rows from the
# input spreadsheet.
sub _build__csv_parser {
    my ($self) = @_;
    my %parser_options = ( binary => 1, always_quote => 1 );
    return Text::CSV->new( \%parser_options );
}

# Builder for _csv_output: the Text::CSV object used to write rows to the
# output spreadsheet. Every field is quoted and rows end in CRLF.
sub _build__csv_output {
    my ($self) = @_;
    my %writer_options = ( binary => 1, always_quote => 1, eol => "\r\n" );
    return Text::CSV->new( \%writer_options );
}

# Builder for _input_spreadsheet_fh: open the input spreadsheet for reading
# and return the file handle. Dies if the file cannot be opened.
sub _build__input_spreadsheet_fh {
    my ($self) = @_;

    my $filename = $self->spreadsheet;

    # Three-argument open: the two-argument form lets a filename such as
    # ">x" or "cmd |" pick the open mode. Failure is fatal here rather than
    # surfacing later as an error on an unopened handle.
    open( my $fh, '<', $filename )
      or die "Couldn't open spreadsheet '$filename' for reading: $!";
    return $fh;
}

# Builder for _output_spreadsheet_fh: create (or truncate) the output
# spreadsheet and return a handle opened for writing. Dies if the file
# cannot be opened, instead of handing back an unusable handle.
sub _build__output_spreadsheet_fh {
    my ($self) = @_;

    my $filename = $self->output_filename;
    open( my $fh, '>', $filename )
      or die "Couldn't open output spreadsheet '$filename' for writing: $!";
    return $fh;
}

# Builder for _sample_order: ask Bio::PanGenome::SampleOrder for the samples
# of the configured tree, in the order it reports them.
sub _build__sample_order {
    my ($self) = @_;

    my %tree_settings = (
        tree_file   => $self->tree_file,
        tree_format => $self->tree_format,
    );
    return Bio::PanGenome::SampleOrder->new(%tree_settings)->ordered_samples();
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;

42 changes: 42 additions & 0 deletions lib/Bio/PanGenome/SampleOrder.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package Bio::PanGenome::SampleOrder;

# ABSTRACT: Take in a tree file and return an ordering of the samples

=head1 SYNOPSIS

Take in a tree file and return an ordering of the samples
use Bio::PanGenome::SampleOrder;

my $obj = Bio::PanGenome::SampleOrder->new(
tree_file => $tree_file,
);
$obj->ordered_samples();

=cut

use Moose;
use Bio::TreeIO;

# Path of the tree file (required) and its format (default 'newick');
# both are passed to Bio::TreeIO by the builder below.
has 'tree_file' => ( is => 'ro', isa => 'Str', required => 1 );
has 'tree_format' => ( is => 'ro', isa => 'Str', default => 'newick' );
# Leaf ids of the first tree in the file, computed lazily on first access.
has 'ordered_samples' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_samples' );

# Builder for ordered_samples. Reads the first tree from tree_file (in
# tree_format) and returns a reference to an array of its leaf node ids,
# in the order get_leaf_nodes() reports them.
#
# Dies if no tree can be read from the file.
sub _build_ordered_samples {
    my ($self) = @_;

    my $input = Bio::TreeIO->new(
        -file   => $self->tree_file,
        -format => $self->tree_format
    );

    # Report a missing tree with the offending file name, rather than
    # failing with "Can't call method ... on an undefined value" below.
    my $tree = $input->next_tree;
    defined $tree
      or die "Couldn't read a tree from '"
      . $self->tree_file
      . "' (format: "
      . $self->tree_format . ")";

    my @taxa = map { $_->id } $tree->get_leaf_nodes;
    return \@taxa;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;

28 changes: 28 additions & 0 deletions t/Bio/PanGenome/CommandLine/PanGenomeReorderSpreadsheet.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env perl
use Moose;
use Data::Dumper;
use File::Slurp;
use Cwd;

BEGIN { unshift( @INC, './lib' ) }
BEGIN { unshift( @INC, './t/lib' ) }
with 'TestHelper';

BEGIN {
use Test::Most;
use_ok('Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet');
}
# Class under test; it is handed to the helper below together with each
# argument string.
my $script_name = 'Bio::PanGenome::CommandLine::PanGenomeReorderSpreadsheet';

# Command-line argument string => [ output file, expected-content file ].
# The three cases cover the default output name, an explicit -o output name,
# and an explicit -f tree format.
# NOTE(review): the pair is presumably "file the run produces" and "fixture it
# must match" - confirm against TestHelper, which is not visible here.
my %scripts_and_expected_files = (
'-t t/data/reorder_isolates.tre -s t/data/reorder_isolates_input.csv' =>
[ 'reordered_spreadsheet.csv', 't/data/reorder_isolates_expected_output.csv' ],
'-t t/data/reorder_isolates.tre -s t/data/reorder_isolates_input.csv -o different_output_name.csv' =>
[ 'different_output_name.csv', 't/data/reorder_isolates_expected_output.csv' ],
'-t t/data/reorder_isolates.tre -s t/data/reorder_isolates_input.csv -f newick' =>
[ 'reordered_spreadsheet.csv', 't/data/reorder_isolates_expected_output.csv' ],
);

# Not defined in this file; presumably supplied by the TestHelper role.
mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files );

done_testing();
Loading