Skip to content

Commit

Permalink
Merge pull request #46 from andrewjpage/master
Browse files Browse the repository at this point in the history
don't split groups
  • Loading branch information
andrewjpage committed Oct 31, 2013
2 parents 2e89621 + b337c82 commit ef7d78e
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 43 deletions.
5 changes: 0 additions & 5 deletions lib/Bio/PanGenome/AnnotateGroups.pm
Original file line number Diff line number Diff line change
Expand Up @@ -243,11 +243,6 @@ sub _split_groups_with_min_sub_group_size {
# Runs a series of splitting passes with a shrinking minimum sub-group size,
# then hands the output of the two _generate_* methods to the
# _groups_to_consensus_gene_names and _ids_to_groups accessors.
# Takes only the invocant.
sub _split_groups {
my ($self) = @_;

# Split off the largest groups first: the threshold $i starts at
# _number_of_files and is halved after each pass; the loop stops as soon as
# $i falls below _number_of_files / 32 or below 1.
# NOTE(review): unless "use integer" is in effect, "$i /= 2" is floating-point
# division, so odd file counts give fractional thresholds (e.g. 3 -> 1.5) --
# confirm _split_groups_with_min_sub_group_size tolerates non-integer sizes.
for ( my $i = $self->_number_of_files ; $i >= (($self->_number_of_files / 32)) && $i >= 1 ; $i /= 2 ) {
$self->_split_groups_with_min_sub_group_size($i);
}

# After the splitting passes, regenerate both values and pass them to their accessors.
$self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
$self->_ids_to_groups( $self->_generate__ids_to_groups );
}
Expand Down
4 changes: 2 additions & 2 deletions lib/Bio/PanGenome/External/Cdhit.pm
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ has 'exec' => ( is => 'ro', isa => 'Str', default => '
has '_number_of_threads' => ( is => 'ro', isa => 'Int', default => 1 );
has '_max_available_memory_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__max_available_memory_in_mb' );
has '_use_most_similar_clustering' => ( is => 'ro', isa => 'Bool', default => 1 );
has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 1 );
has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 1 );
has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 0.99 );
has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 0.99 );
has '_description_length' => ( is => 'ro', isa => 'Int', default => 256 );
has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' );

Expand Down
14 changes: 13 additions & 1 deletion lib/Bio/PanGenome/Output/NumberOfGroups.pm
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use Moose;
use List::Util qw(shuffle);

has 'group_statistics_obj' => ( is => 'ro', isa => 'Bio::PanGenome::GroupStatistics', required => 1 );
has 'number_of_iterations' => ( is => 'ro', isa => 'Int', default => 100 );
has 'number_of_iterations' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_iterations' );
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'number_of_new_genes.png' );
has 'output_raw_filename_conserved_genes' => ( is => 'ro', isa => 'Str', default => 'number_of_conserved_genes.tab' );
has 'output_raw_filename_unique_genes' => ( is => 'ro', isa => 'Str', default => 'number_of_unique_genes.tab' );
Expand All @@ -29,6 +29,18 @@ has '_unique_genes' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
has '_total_genes' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
has '_new_genes' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );

# Lazy builder for the 'number_of_iterations' attribute.
# Returns 100, or the number of entries in
# group_statistics_obj->_sorted_file_names when that count is larger than 100.
sub _build_number_of_iterations {
    my ($self) = @_;

    my $baseline_iterations = 100;

    # Array dereference in scalar context gives the element count.
    my $file_count = scalar @{ $self->group_statistics_obj->_sorted_file_names };

    return $file_count > $baseline_iterations
        ? $file_count
        : $baseline_iterations;
}

sub create_output_files {
my ($self) = @_;

Expand Down
6 changes: 0 additions & 6 deletions t/Bio/PanGenome/CommandLine/CreatePanGenome.t
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,6 @@ my %scripts_and_expected_files = (
mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files );
cleanup_files();

%scripts_and_expected_files = (
' -j Local --output_multifasta_files t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff ' =>
[ 'pan_genome_sequences/00006-group_1.fa.aln', 't/data/00006-group_1.fa.aln' ],
);
mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files );
cleanup_files();

%scripts_and_expected_files = (
' -j Local --output_multifasta_files t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff ' =>
Expand Down
2 changes: 1 addition & 1 deletion t/Bio/PanGenome/External/Cdhit.t
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ ok($obj = Bio::PanGenome::External::Cdhit->new(
exec => $cwd.'/t/bin/dummy_cd-hit',
),'initialise object');

is($obj->_command_to_run, $cwd.'/t/bin/dummy_cd-hit -i t/data/some_fasta_file.fa -o output -T 1 -M 900 -g 1 -s 1 -d 256 -c 1 2> /dev/null', 'Command constructed as expected');
is($obj->_command_to_run, $cwd.'/t/bin/dummy_cd-hit -i t/data/some_fasta_file.fa -o output -T 1 -M 900 -g 1 -s 0.99 -d 256 -c 0.99 2> /dev/null', 'Command constructed as expected');
ok($obj->run(), 'run dummy command');
unlink('output');
unlink('output.clstr');
Expand Down
24 changes: 0 additions & 24 deletions t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t
Original file line number Diff line number Diff line change
Expand Up @@ -43,28 +43,4 @@ is(read_file('pan_genome_sequences/00001-group_6.fa'), read_file('t/data/pan_gen
is(read_file('pan_genome_sequences/00001-yfnB.fa'), read_file('t/data/pan_genome_sequences/00001-yfnB.fa' ), 'Check multifasta content is correct for 1-yfnB.fa ');
remove_tree('pan_genome_sequences');

my $annotate_groups_all_merged = Bio::PanGenome::AnnotateGroups->new(
gff_files => $gff_files,
groups_filename => 't/data/query_groups_all_merged',
);
$annotate_groups_all_merged->reannotate;

ok(
my $obj_all_merged = Bio::PanGenome::Output::GroupsMultifastasNucleotide->new(
group_names => [ 'group_2', 'group_5' ],
gff_files => $gff_files,
annotate_groups => $annotate_groups_all_merged
),
'All groups are merged into one so it needs to be deconvoluted'
);
ok( $obj_all_merged->create_files(), 'Split out the annotation into separate group files' );


is(read_file('pan_genome_sequences/00006-different.fa'), read_file('t/data/split_pan_genome_sequences/00006-different.fa'), 'Check multifasta content correct for 00006-different.fa' );
is(read_file('pan_genome_sequences/00002-speH.fa'), read_file('t/data/split_pan_genome_sequences/00002-speH.fa'), 'Check multifasta content correct for speH.fa ' );
is(read_file('pan_genome_sequences/00002-hly.fa'), read_file('t/data/split_pan_genome_sequences/00002-hly.fa'), 'Check multifasta content correct for hly.fa ' );
is(read_file('pan_genome_sequences/00002-argF.fa'), read_file('t/data/split_pan_genome_sequences/00002-argF.fa'), 'Check multifasta content correct for argF.fa ' );
is(read_file('reannotated_groups_file'), read_file('t/data/split_pan_genome_sequences/reannotated_groups_file'),'Check multifasta content correct for reannotated_groups_file' );

remove_tree('pan_genome_sequences');
done_testing();
3 changes: 1 addition & 2 deletions t/data/clustered_proteins_pan_genome
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
group_1: abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006
speH: abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006 1_2 2_2 3_2
group_2: abc_00014 abc_00014 abc_00014 1_6 2_7 abc_00015
speH: 1_2 2_2 3_2
group_12: abc_00013 abc_00013 abc_00013
yfnB: abc_00016 abc_00016 3_5
group_5: abc_00003 abc_00003 abc_00003
Expand Down
3 changes: 1 addition & 2 deletions t/data/overall_group_statisics.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_3"
"speH","","hypothetical protein","3","3","1","1_2","2_2","3_2"
"argF","","Ornithine carbamoyltransferase","3","3","1","1_3","2_3","3_3"
"speH","","superantigen-like protein","3","9","3","1_2","2_2","abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006 3_2"
"group_2","","Gonococcal growth inhibitor III","3","6","2","1_6","2_7","abc_00014 abc_00014 abc_00014 abc_00015"
"hly","","Alpha-toxin","3","3","1","1_1","2_1","3_1"
"yfnB","","Putative HAD-hydrolase yfnB","2","3","1.5","","abc_00016 abc_00016","3_5"
"group_12","","","1","3","3","","","abc_00013 abc_00013 abc_00013"
"group_5","","hypothetical protein","1","3","3","","","abc_00003 abc_00003 abc_00003"
"group_1","","superantigen-like protein","1","6","6","","","abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006"
"group_9","","hypothetical protein","1","3","3","","","abc_00010 abc_00010 abc_00010"
"group_8","","","1","3","3","","","abc_01705 abc_01705 abc_01705"
"group_10","","C4-dicarboxylate transporter/malic acid transport protein","1","3","3","","","abc_00011 abc_00011 abc_00011"
Expand Down

0 comments on commit ef7d78e

Please sign in to comment.