From b337c82152554a0ef3969067850762561b11e18e Mon Sep 17 00:00:00 2001
From: andrewjpage <andrewjpage@gmail.com>
Date: Thu, 31 Oct 2013 11:22:05 +0000
Subject: [PATCH] don't split groups

---
 lib/Bio/PanGenome/AnnotateGroups.pm           |  5 ----
 lib/Bio/PanGenome/External/Cdhit.pm           |  4 ++--
 lib/Bio/PanGenome/Output/NumberOfGroups.pm    | 14 ++++++++++-
 t/Bio/PanGenome/CommandLine/CreatePanGenome.t |  6 -----
 t/Bio/PanGenome/External/Cdhit.t              |  2 +-
 .../Output/GroupsMultifastasNucleotide.t      | 24 -------------------
 t/data/clustered_proteins_pan_genome          |  3 +--
 t/data/overall_group_statisics.csv            |  3 +--
 8 files changed, 18 insertions(+), 43 deletions(-)

diff --git a/lib/Bio/PanGenome/AnnotateGroups.pm b/lib/Bio/PanGenome/AnnotateGroups.pm
index cf36a6a..7f749df 100644
--- a/lib/Bio/PanGenome/AnnotateGroups.pm
+++ b/lib/Bio/PanGenome/AnnotateGroups.pm
@@ -243,11 +243,6 @@ sub _split_groups_with_min_sub_group_size {
 sub _split_groups {
     my ($self) = @_;
 
-    # Split off the largest groups first
-    for ( my $i = $self->_number_of_files ; $i >= (($self->_number_of_files / 32)) && $i >= 1 ; $i /= 2 ) {
-        $self->_split_groups_with_min_sub_group_size($i);
-    }
-
     $self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
     $self->_ids_to_groups( $self->_generate__ids_to_groups );
 }
diff --git a/lib/Bio/PanGenome/External/Cdhit.pm b/lib/Bio/PanGenome/External/Cdhit.pm
index d0034c5..4be80e1 100644
--- a/lib/Bio/PanGenome/External/Cdhit.pm
+++ b/lib/Bio/PanGenome/External/Cdhit.pm
@@ -25,8 +25,8 @@ has 'exec'                         => ( is => 'ro', isa => 'Str',  default  => '
 has '_number_of_threads'           => ( is => 'ro', isa => 'Int',  default  => 1 );
 has '_max_available_memory_in_mb'  => ( is => 'ro', isa => 'Int',  lazy => 1, builder => '_build__max_available_memory_in_mb' );
 has '_use_most_similar_clustering' => ( is => 'ro', isa => 'Bool', default  => 1 );
-has '_length_difference_cutoff'    => ( is => 'ro', isa => 'Num',  default  => 1 );
-has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num',  default  => 1 );
+has '_length_difference_cutoff'    => ( is => 'ro', isa => 'Num',  default  => 0.99 );
+has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num',  default  => 0.99 );
 has '_description_length'          => ( is => 'ro', isa => 'Int',  default  => 256 );
 has '_logging'          => ( is => 'ro', isa => 'Str', default  => '2> /dev/null' );
 
diff --git a/lib/Bio/PanGenome/Output/NumberOfGroups.pm b/lib/Bio/PanGenome/Output/NumberOfGroups.pm
index 3f42198..360ee34 100644
--- a/lib/Bio/PanGenome/Output/NumberOfGroups.pm
+++ b/lib/Bio/PanGenome/Output/NumberOfGroups.pm
@@ -18,7 +18,7 @@ use Moose;
 use List::Util qw(shuffle);
 
 has 'group_statistics_obj' => ( is => 'ro', isa => 'Bio::PanGenome::GroupStatistics', required => 1 );
-has 'number_of_iterations' => ( is => 'ro', isa => 'Int', default => 100 );
+has 'number_of_iterations' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_iterations' );
 has 'output_filename'                     => ( is => 'ro', isa => 'Str', default => 'number_of_new_genes.png' );
 has 'output_raw_filename_conserved_genes' => ( is => 'ro', isa => 'Str', default => 'number_of_conserved_genes.tab' );
 has 'output_raw_filename_unique_genes'    => ( is => 'ro', isa => 'Str', default => 'number_of_unique_genes.tab' );
@@ -29,6 +29,18 @@ has '_unique_genes' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
 has '_total_genes'  => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
 has '_new_genes'    => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
 
+# Lazy builder for the number_of_iterations attribute.
+# Returns 100, or the number of sorted file names held by the group
+# statistics object when that count is greater than 100.
+sub _build_number_of_iterations
+{
+   my ($self) = @_;
+   my $minimum_iterations = 100;
+   my $file_count = scalar @{ $self->group_statistics_obj->_sorted_file_names };
+
+   return $file_count > $minimum_iterations ? $file_count : $minimum_iterations;
+}
+
 sub create_output_files {
     my ($self) = @_;
 
diff --git a/t/Bio/PanGenome/CommandLine/CreatePanGenome.t b/t/Bio/PanGenome/CommandLine/CreatePanGenome.t
index 5560054..d73b5c9 100644
--- a/t/Bio/PanGenome/CommandLine/CreatePanGenome.t
+++ b/t/Bio/PanGenome/CommandLine/CreatePanGenome.t
@@ -28,12 +28,6 @@ my %scripts_and_expected_files = (
 mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files );
 cleanup_files();
 
-%scripts_and_expected_files = (
-      ' -j Local --output_multifasta_files t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff            ' =>
-          [ 'pan_genome_sequences/00006-group_1.fa.aln', 't/data/00006-group_1.fa.aln' ],
-);
-mock_execute_script_and_check_output( $script_name, \%scripts_and_expected_files );
-cleanup_files();
 
 %scripts_and_expected_files = (
   ' -j Local --output_multifasta_files t/data/query_1.gff t/data/query_2.gff t/data/query_3.gff ' =>
diff --git a/t/Bio/PanGenome/External/Cdhit.t b/t/Bio/PanGenome/External/Cdhit.t
index cb8c31b..6d7131a 100644
--- a/t/Bio/PanGenome/External/Cdhit.t
+++ b/t/Bio/PanGenome/External/Cdhit.t
@@ -20,7 +20,7 @@ ok($obj = Bio::PanGenome::External::Cdhit->new(
   exec         =>  $cwd.'/t/bin/dummy_cd-hit',
 ),'initialise object');
 
-is($obj->_command_to_run, $cwd.'/t/bin/dummy_cd-hit -i t/data/some_fasta_file.fa -o output -T 1 -M 900 -g 1 -s 1 -d 256 -c 1 2> /dev/null', 'Command constructed as expected');
+is($obj->_command_to_run, $cwd.'/t/bin/dummy_cd-hit -i t/data/some_fasta_file.fa -o output -T 1 -M 900 -g 1 -s 0.99 -d 256 -c 0.99 2> /dev/null', 'Command constructed as expected');
 ok($obj->run(), 'run dummy command');
 unlink('output');
 unlink('output.clstr');
diff --git a/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t b/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t
index 2c87398..335f9f9 100644
--- a/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t
+++ b/t/Bio/PanGenome/Output/GroupsMultifastasNucleotide.t
@@ -43,28 +43,4 @@ is(read_file('pan_genome_sequences/00001-group_6.fa'), read_file('t/data/pan_gen
 is(read_file('pan_genome_sequences/00001-yfnB.fa'),    read_file('t/data/pan_genome_sequences/00001-yfnB.fa' ), 'Check multifasta content is correct for 1-yfnB.fa ');
 remove_tree('pan_genome_sequences');
 
-my $annotate_groups_all_merged = Bio::PanGenome::AnnotateGroups->new(
-  gff_files       => $gff_files,
-  groups_filename => 't/data/query_groups_all_merged',
-);
-$annotate_groups_all_merged->reannotate;
-
-ok(
-    my $obj_all_merged = Bio::PanGenome::Output::GroupsMultifastasNucleotide->new(
-        group_names    => [ 'group_2', 'group_5' ],
-        gff_files      => $gff_files,
-        annotate_groups => $annotate_groups_all_merged
-    ),
-    'All groups are merged into one so it needs to be deconvoluted'
-);
-ok( $obj_all_merged->create_files(), 'Split out the annotation into separate group files' );
-
-
-is(read_file('pan_genome_sequences/00006-different.fa'),  read_file('t/data/split_pan_genome_sequences/00006-different.fa'),       'Check multifasta content correct for 00006-different.fa'   );
-is(read_file('pan_genome_sequences/00002-speH.fa'),     read_file('t/data/split_pan_genome_sequences/00002-speH.fa'),          'Check multifasta content correct for speH.fa  '    );
-is(read_file('pan_genome_sequences/00002-hly.fa'),      read_file('t/data/split_pan_genome_sequences/00002-hly.fa'),           'Check multifasta content correct for hly.fa   '    );
-is(read_file('pan_genome_sequences/00002-argF.fa'),     read_file('t/data/split_pan_genome_sequences/00002-argF.fa'),          'Check multifasta content correct for argF.fa '     );
-is(read_file('reannotated_groups_file'),                read_file('t/data/split_pan_genome_sequences/reannotated_groups_file'),'Check multifasta content correct for reannotated_groups_file' );
-
-remove_tree('pan_genome_sequences');
 done_testing();
diff --git a/t/data/clustered_proteins_pan_genome b/t/data/clustered_proteins_pan_genome
index 7dfd5f7..0df4164 100644
--- a/t/data/clustered_proteins_pan_genome
+++ b/t/data/clustered_proteins_pan_genome
@@ -1,6 +1,5 @@
-group_1: abc_00004	abc_00004	abc_00004	abc_00006	abc_00006	abc_00006
+speH: abc_00004	abc_00004	abc_00004	abc_00006	abc_00006	abc_00006	1_2	2_2	3_2
 group_2: abc_00014	abc_00014	abc_00014	1_6	2_7	abc_00015
-speH: 1_2	2_2	3_2
 group_12: abc_00013	abc_00013	abc_00013
 yfnB: abc_00016	abc_00016	3_5
 group_5: abc_00003	abc_00003	abc_00003
diff --git a/t/data/overall_group_statisics.csv b/t/data/overall_group_statisics.csv
index 02360a0..ba1dd5b 100644
--- a/t/data/overall_group_statisics.csv
+++ b/t/data/overall_group_statisics.csv
@@ -1,12 +1,11 @@
 "Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","query_1","query_2","query_3"
-"speH","","hypothetical protein","3","3","1","1_2","2_2","3_2"
 "argF","","Ornithine carbamoyltransferase","3","3","1","1_3","2_3","3_3"
+"speH","","superantigen-like protein","3","9","3","1_2","2_2","abc_00004	abc_00004	abc_00004	abc_00006	abc_00006	abc_00006	3_2"
 "group_2","","Gonococcal growth inhibitor III","3","6","2","1_6","2_7","abc_00014	abc_00014	abc_00014	abc_00015"
 "hly","","Alpha-toxin","3","3","1","1_1","2_1","3_1"
 "yfnB","","Putative HAD-hydrolase yfnB","2","3","1.5","","abc_00016	abc_00016","3_5"
 "group_12","","","1","3","3","","","abc_00013	abc_00013	abc_00013"
 "group_5","","hypothetical protein","1","3","3","","","abc_00003	abc_00003	abc_00003"
-"group_1","","superantigen-like protein","1","6","6","","","abc_00004	abc_00004	abc_00004	abc_00006	abc_00006	abc_00006"
 "group_9","","hypothetical protein","1","3","3","","","abc_00010	abc_00010	abc_00010"
 "group_8","","","1","3","3","","","abc_01705	abc_01705	abc_01705"
 "group_10","","C4-dicarboxylate transporter/malic acid transport protein","1","3","3","","","abc_00011	abc_00011	abc_00011"