From b337c82152554a0ef3969067850762561b11e18e Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Thu, 31 Oct 2013 11:22:05 +0000 Subject: [PATCH] dont split groups --- lib/Bio/PanGenome/AnnotateGroups.pm | 5 ---- lib/Bio/PanGenome/External/Cdhit.pm | 4 ++-- lib/Bio/PanGenome/Output/NumberOfGroups.pm | 14 ++++++++++- t/Bio/PanGenome/CommandLine/CreatePanGenome.t | 6 ----- t/Bio/PanGenome/External/Cdhit.t | 2 +- .../Output/GroupsMultifastasNucleotide.t | 24 ------------------- t/data/clustered_proteins_pan_genome | 3 +-- t/data/overall_group_statisics.csv | 3 +-- 8 files changed, 18 insertions(+), 43 deletions(-) diff --git a/lib/Bio/PanGenome/AnnotateGroups.pm b/lib/Bio/PanGenome/AnnotateGroups.pm index cf36a6a..7f749df 100644 --- a/lib/Bio/PanGenome/AnnotateGroups.pm +++ b/lib/Bio/PanGenome/AnnotateGroups.pm @@ -243,11 +243,6 @@ sub _split_groups_with_min_sub_group_size { sub _split_groups { my ($self) = @_; - # Split off the largest groups first - for ( my $i = $self->_number_of_files ; $i >= (($self->_number_of_files / 32)) && $i >= 1 ; $i /= 2 ) { - $self->_split_groups_with_min_sub_group_size($i); - } - $self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names ); $self->_ids_to_groups( $self->_generate__ids_to_groups ); } diff --git a/lib/Bio/PanGenome/External/Cdhit.pm b/lib/Bio/PanGenome/External/Cdhit.pm index d0034c5..4be80e1 100644 --- a/lib/Bio/PanGenome/External/Cdhit.pm +++ b/lib/Bio/PanGenome/External/Cdhit.pm @@ -25,8 +25,8 @@ has 'exec' => ( is => 'ro', isa => 'Str', default => ' has '_number_of_threads' => ( is => 'ro', isa => 'Int', default => 1 ); has '_max_available_memory_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__max_available_memory_in_mb' ); has '_use_most_similar_clustering' => ( is => 'ro', isa => 'Bool', default => 1 ); -has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 1 ); -has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 1 ); +has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 0.99 ); +has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 0.99 ); has '_description_length' => ( is => 'ro', isa => 'Int', default => 256 ); has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' ); diff --git a/lib/Bio/PanGenome/Output/NumberOfGroups.pm b/lib/Bio/PanGenome/Output/NumberOfGroups.pm index 3f42198..360ee34 100644 --- a/lib/Bio/PanGenome/Output/NumberOfGroups.pm +++ b/lib/Bio/PanGenome/Output/NumberOfGroups.pm @@ -18,7 +18,7 @@ use Moose; use List::Util qw(shuffle); has 'group_statistics_obj' => ( is => 'ro', isa => 'Bio::PanGenome::GroupStatistics', required => 1 ); -has 'number_of_iterations' => ( is => 'ro', isa => 'Int', default => 100 ); +has 'number_of_iterations' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_iterations' ); has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'number_of_new_genes.png' ); has 'output_raw_filename_conserved_genes' => ( is => 'ro', isa => 'Str', default => 'number_of_conserved_genes.tab' ); has 'output_raw_filename_unique_genes' => ( is => 'ro', isa => 'Str', default => 'number_of_unique_genes.tab' ); @@ -29,6 +29,18 @@ has '_unique_genes' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } ); has '_total_genes' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } ); has '_new_genes' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } ); +sub _build_number_of_iterations +{ + my ($self) = @_; + my $number_of_iterations = 100; + my $number_of_files = @{ $self->group_statistics_obj->_sorted_file_names }; + if($number_of_files > $number_of_iterations) + { + $number_of_iterations = $number_of_files; + } + return $number_of_iterations; +} + sub create_output_files { my ($self) = @_; diff --git a/t/Bio/PanGenome/CommandLine/CreatePanGenome.t b/t/Bio/PanGenome/CommandLine/CreatePanGenome.t index 5560054..d73b5c9 100644 --- a/t/Bio/PanGenome/CommandLine/CreatePanGenome.t +++ b/t/Bio/PanGenome/CommandLine/CreatePanGenome.t @@ -28,12 +28,6 @@ my %scripts_and_expected_files = ( mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files ); cleanup_files(); -%scripts_and_expected_files = ( - ' -j Local --output_multifasta_files t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff ' => - [ 'pan_genome_sequences/00006-group_1.fa.aln', 't/data/00006-group_1.fa.aln' ], -); -mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files ); -cleanup_files(); %scripts_and_expected_files = ( ' -j Local --output_multifasta_files t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff ' => diff --git a/t/Bio/PanGenome/External/Cdhit.t b/t/Bio/PanGenome/External/Cdhit.t index cb8c31b..6d7131a 100644 --- a/t/Bio/PanGenome/External/Cdhit.t +++ b/t/Bio/PanGenome/External/Cdhit.t @@ -20,7 +20,7 @@ ok($obj = Bio::PanGenome::External::Cdhit->new( exec => $cwd.'/t/bin/dummy_cd-hit', ),'initialise object'); -is($obj->_command_to_run, $cwd.'/t/bin/dummy_cd-hit -i t/data/some_fasta_file.fa -o output -T 1 -M 900 -g 1 -s 1 -d 256 -c 1 2> /dev/null', 'Command constructed as expected'); +is($obj->_command_to_run, $cwd.'/t/bin/dummy_cd-hit -i t/data/some_fasta_file.fa -o output -T 1 -M 900 -g 1 -s 0.99 -d 256 -c 0.99 2> /dev/null', 'Command constructed as expected'); ok($obj->run(), 'run dummy command'); unlink('output'); unlink('output.clstr'); diff --git a/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t b/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t index 2c87398..335f9f9 100644 --- a/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t +++ b/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t @@ -43,28 +43,4 @@ is(read_file('pan_genome_sequences/00001-group_6.fa'), read_file('t/data/pan_gen is(read_file('pan_genome_sequences/00001-yfnB.fa'), read_file('t/data/pan_genome_sequences/00001-yfnB.fa' ), 'Check multifasta content is correct for 1-yfnB.fa '); remove_tree('pan_genome_sequences'); -my $annotate_groups_all_merged = Bio::PanGenome::AnnotateGroups->new( - gff_files => $gff_files, - groups_filename => 't/data/query_groups_all_merged', -); -$annotate_groups_all_merged->reannotate; - -ok( - my $obj_all_merged = Bio::PanGenome::Output::GroupsMultifastasNucleotide->new( - group_names => [ 'group_2', 'group_5' ], - gff_files => $gff_files, - annotate_groups => $annotate_groups_all_merged - ), - 'All groups are merged into one so it needs to be deconvoluted' -); -ok( $obj_all_merged->create_files(), 'Split out the annotation into separate group files' ); - - -is(read_file('pan_genome_sequences/00006-different.fa'), read_file('t/data/split_pan_genome_sequences/00006-different.fa'), 'Check multifasta content correct for 00006-different.fa' ); -is(read_file('pan_genome_sequences/00002-speH.fa'), read_file('t/data/split_pan_genome_sequences/00002-speH.fa'), 'Check multifasta content correct for speH.fa ' ); -is(read_file('pan_genome_sequences/00002-hly.fa'), read_file('t/data/split_pan_genome_sequences/00002-hly.fa'), 'Check multifasta content correct for hly.fa ' ); -is(read_file('pan_genome_sequences/00002-argF.fa'), read_file('t/data/split_pan_genome_sequences/00002-argF.fa'), 'Check multifasta content correct for argF.fa ' ); -is(read_file('reannotated_groups_file'), read_file('t/data/split_pan_genome_sequences/reannotated_groups_file'),'Check multifasta content correct for reannotated_groups_file' ); - -remove_tree('pan_genome_sequences'); done_testing(); diff --git a/t/data/clustered_proteins_pan_genome b/t/data/clustered_proteins_pan_genome index 7dfd5f7..0df4164 100644 --- a/t/data/clustered_proteins_pan_genome +++ b/t/data/clustered_proteins_pan_genome @@ -1,6 +1,5 @@ -group_1: abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006 +speH: abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006 1_2 2_2 3_2 group_2: abc_00014 abc_00014 abc_00014 1_6 2_7 abc_00015 -speH: 1_2 2_2 3_2 group_12: abc_00013 abc_00013 abc_00013 yfnB: abc_00016 abc_00016 3_5 group_5: abc_00003 abc_00003 abc_00003 diff --git a/t/data/overall_group_statisics.csv b/t/data/overall_group_statisics.csv index 02360a0..ba1dd5b 100644 --- a/t/data/overall_group_statisics.csv +++ b/t/data/overall_group_statisics.csv @@ -1,12 +1,11 @@ "Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_3" -"speH","","hypothetical protein","3","3","1","1_2","2_2","3_2" "argF","","Ornithine carbamoyltransferase","3","3","1","1_3","2_3","3_3" +"speH","","superantigen-like protein","3","9","3","1_2","2_2","abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006 3_2" "group_2","","Gonococcal growth inhibitor III","3","6","2","1_6","2_7","abc_00014 abc_00014 abc_00014 abc_00015" "hly","","Alpha-toxin","3","3","1","1_1","2_1","3_1" "yfnB","","Putative HAD-hydrolase yfnB","2","3","1.5","","abc_00016 abc_00016","3_5" "group_12","","","1","3","3","","","abc_00013 abc_00013 abc_00013" "group_5","","hypothetical protein","1","3","3","","","abc_00003 abc_00003 abc_00003" -"group_1","","superantigen-like protein","1","6","6","","","abc_00004 abc_00004 abc_00004 abc_00006 abc_00006 abc_00006" "group_9","","hypothetical protein","1","3","3","","","abc_00010 abc_00010 abc_00010" "group_8","","","1","3","3","","","abc_01705 abc_01705 abc_01705" "group_10","","C4-dicarboxylate transporter/malic acid transport protein","1","3","3","","","abc_00011 abc_00011 abc_00011"