Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gnu parallel support #89

Merged
1 change: 1 addition & 0 deletions dist.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ requires = fasta_grep
requires = bedtools
requires = muscle
requires = revtrans.py
requires = parallel


[@Basic]
Expand Down
11 changes: 9 additions & 2 deletions lib/Bio/PanGenome.pm
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ has 'output_filename' => ( is => 'rw', isa => 'Str', default =
has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' );
has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'group_statisics.csv' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 );
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );
has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default => 'mcxdeblast' );
Expand All @@ -40,7 +41,8 @@ has 'perc_identity' => ( is => 'ro', isa => 'Num', default =
has 'dont_delete_files' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );

has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );

Expand Down Expand Up @@ -71,6 +73,7 @@ sub run {
number_of_input_files => $number_of_input_files,
output_filtered_clustered_fasta => $output_filtered_clustered_fasta,
job_runner => $self->job_runner,
cpus => $self->cpus
);

$iterative_cdhit->run();
Expand All @@ -79,6 +82,7 @@ sub run {
fasta_file => $output_cd_hit_filename,
blast_results_file_name => $output_blast_results_filename,
job_runner => $self->job_runner,
cpus => $self->cpus,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec,
perc_identity => $self->perc_identity
Expand All @@ -95,6 +99,7 @@ sub run {
mcxdeblast_exec => $self->mcxdeblast_exec,
mcl_exec => $self->mcl_exec,
job_runner => $self->job_runner,
cpus => $self->cpus,
output_file => $output_mcl_filename
);
$mcl->run();
Expand All @@ -104,6 +109,7 @@ sub run {

my $post_analysis = Bio::PanGenome::External::PostAnalysis->new(
job_runner => $self->job_runner,
cpus => $self->cpus,
fasta_files => $self->fasta_files,
input_files => $self->input_files,
output_filename => $self->output_filename,
Expand All @@ -115,7 +121,8 @@ sub run {
dont_delete_files => $self->dont_delete_files,
dont_create_rplots => $self->dont_create_rplots,
verbose_stats => $self->verbose_stats,
translation_table => $self->translation_table
translation_table => $self->translation_table,
group_limit => $self->group_limit,
);
$post_analysis->run();

Expand Down
32 changes: 24 additions & 8 deletions lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,21 @@ has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' )
has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' );
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 98 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 98 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 1 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );

has '_error_message' => ( is => 'rw', isa => 'Str' );
has 'run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );

sub BUILD {
my ($self) = @_;

my ( $fasta_files, $dont_create_rplots, $dont_delete_files, $perc_identity, $output_filename, $job_runner, $makeblastdb_exec,$mcxdeblast_exec,$mcl_exec, $blastp_exec, $apply_unknowns_filter, $cpus,$output_multifasta_files, $verbose_stats, $translation_table, $run_qc, $help );
my ( $fasta_files, $create_rplots,$group_limit, $max_threads, $dont_delete_files, $perc_identity, $output_filename, $job_runner, $makeblastdb_exec,$mcxdeblast_exec,$mcl_exec, $blastp_exec, $apply_unknowns_filter, $cpus,$output_multifasta_files, $verbose_stats, $translation_table, $run_qc, $help );

GetOptionsFromArray(
$self->args,
Expand All @@ -55,9 +56,10 @@ sub BUILD {
'e|output_multifasta_files' => \$output_multifasta_files,
'i|perc_identity=i' => \$perc_identity,
'dont_delete_files' => \$dont_delete_files,
'dont_create_rplots' => \$dont_create_rplots,
'create_rplots' => \$create_rplots,
'verbose_stats' => \$verbose_stats,
't|translation_table=i' => \$translation_table,
'group_limit=i' => \$group_limit,
'qc|run_qc' => \$run_qc,
'h|help' => \$help,
);
Expand All @@ -78,9 +80,10 @@ sub BUILD {
$self->apply_unknowns_filter($apply_unknowns_filter) if ( defined($apply_unknowns_filter) );
$self->output_multifasta_files($output_multifasta_files) if ( defined($output_multifasta_files) );
$self->dont_delete_files($dont_delete_files) if ( defined($dont_delete_files) );
$self->dont_create_rplots($dont_create_rplots) if (defined($dont_create_rplots) );
$self->dont_create_rplots(0) if (defined($create_rplots) );
$self->verbose_stats($verbose_stats) if ( defined $verbose_stats );
$self->translation_table($translation_table) if (defined($translation_table) );
$self->group_limit($group_limit) if ( defined($group_limit) );
$self->run_qc($run_qc) if ( defined( $run_qc ) );

for my $filename ( @{ $self->args } ) {
Expand All @@ -106,6 +109,7 @@ sub run {
input_files => $self->fasta_files,
job_runner => $self->job_runner,
apply_unknowns_filter => $self->apply_unknowns_filter,
cpus => $self->cpus,
translation_table => $self->translation_table
);

Expand All @@ -122,14 +126,16 @@ sub run {
fasta_files => $prepare_input_files->fasta_files,
output_filename => $self->output_filename,
job_runner => $self->job_runner,
cpus => $self->cpus,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec,
output_multifasta_files => $self->output_multifasta_files,
perc_identity => $self->perc_identity,
dont_delete_files => $self->dont_delete_files,
dont_create_rplots => $self->dont_create_rplots,
verbose_stats => $self->verbose_stats,
translation_table => $self->translation_table
translation_table => $self->translation_table,
group_limit => $self->group_limit
);
$pan_genome_obj->run();
}
Expand Down Expand Up @@ -164,6 +170,16 @@ sub usage_text {

# Include full annotation and inference in group statistics
create_pan_genome --verbose_stats *.gff

# Run sequentially without LSF
create_pan_genome -j Local *.gff

# Run locally with GNU parallel and 4 processors
create_pan_genome -j Parallel -p 4 *.gff

# Increase the groups/clusters limit (default 50,000). If you need to change this, you're
# probably trying to work with data from more than one species (which this script wasn't designed for).
create_pan_genome --group_limit 60000 *.gff

# Generate QC report detailing top genus and species for each assembly
create_pan_genome -qc *.gff
Expand Down
15 changes: 11 additions & 4 deletions lib/Bio/PanGenome/CommandLine/IterativeCdhit.pm
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ has 'output_filtered_clustered_fasta' => ( is => 'rw', isa => 'Str', default =>
has 'lower_bound_percentage' => ( is => 'rw', isa => 'Num', default => 0.98 );
has 'upper_bound_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'step_size_percentage' => ( is => 'rw', isa => 'Num', default => 0.005 );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );


sub BUILD {
my ($self) = @_;

my ( $output_cd_hit_filename,$lower_bound_percentage,$upper_bound_percentage,$step_size_percentage, $output_combined_filename, $number_of_input_files, $output_filtered_clustered_fasta,
my ( $output_cd_hit_filename,$cpus,$lower_bound_percentage,$upper_bound_percentage,$step_size_percentage, $output_combined_filename, $number_of_input_files, $output_filtered_clustered_fasta,
$help );

GetOptionsFromArray(
Expand All @@ -42,6 +43,7 @@ sub BUILD {
'l|lower_bound_percentage=s' => \$lower_bound_percentage,
'u|upper_bound_percentage=s' => \$upper_bound_percentage,
's|step_size_percentage=s' => \$step_size_percentage,
'cpus=i' => \$cpus,
'h|help' => \$help,
);

Expand All @@ -52,6 +54,7 @@ sub BUILD {
$self->output_cd_hit_filename($output_cd_hit_filename) if ( defined($output_cd_hit_filename) );
$self->output_combined_filename($output_combined_filename) if ( defined($output_combined_filename) );
$self->number_of_input_files($number_of_input_files) if ( defined($number_of_input_files) );
$self->cpus($cpus) if ( defined($cpus) );
$self->output_filtered_clustered_fasta($output_filtered_clustered_fasta)
if ( defined($output_filtered_clustered_fasta) );

Expand All @@ -73,7 +76,8 @@ sub run {
output_filtered_clustered_fasta => $self->output_filtered_clustered_fasta,
lower_bound_percentage => $self->lower_bound_percentage,
upper_bound_percentage => $self->upper_bound_percentage,
step_size_percentage => $self->step_size_percentage
step_size_percentage => $self->step_size_percentage,
cpus => $self->cpus

);
$obj->run;
Expand All @@ -86,10 +90,13 @@ sub usage_text {
Usage: iterative_cdhit [options]
Iteratively cluster a set of proteins with CD-hit, lowering the threshold each time, extracting core genes (1 per isolate) to another file, and removing them from the input proteins file.

# Basic usage where you have a single isolate
# Basic usage where you have a single isolate
iterative_cdhit -m proteome_fasta.faa

# Where you have 10 isolates
# Use multiple CPUs
iterative_cdhit -m proteome_fasta.faa --cpus 8

# Where you have 10 isolates
iterative_cdhit -m proteome_fasta.faa -n 10

# Specify the output file name for the cdhit results
Expand Down
35 changes: 23 additions & 12 deletions lib/Bio/PanGenome/CommandLine/PanGenomePostAnalysis.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use Getopt::Long qw(GetOptionsFromArray);
use Bio::PanGenome::PostAnalysis;
use File::Find::Rule;
use Bio::PanGenome::External::ProteinMuscleAlignmentFromNucleotides;

use File::Path qw(remove_tree);

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
Expand All @@ -28,10 +28,12 @@ has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'g
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'clusters_filename' => ( is => 'rw', isa => 'Str' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );


sub BUILD {
Expand All @@ -40,7 +42,7 @@ sub BUILD {
my (
$output_filename, $dont_create_rplots, $dont_delete_files, $output_pan_geneome_filename,
$job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename,
$fasta_files, $input_files, $verbose_stats, $translation_table, $help
$fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit
);


Expand All @@ -57,7 +59,9 @@ sub BUILD {
'dont_delete_files' => \$dont_delete_files,
'dont_create_rplots' => \$dont_create_rplots,
'verbose_stats' => \$verbose_stats,
'processors=i' => \$cpus,
't|translation_table=i' => \$translation_table,
'group_limit=i' => \$group_limit,
'h|help' => \$help,
);

Expand All @@ -74,6 +78,8 @@ sub BUILD {
$self->dont_create_rplots($dont_create_rplots) if (defined($dont_create_rplots) );
$self->verbose_stats($verbose_stats) if (defined($verbose_stats));
$self->translation_table($translation_table) if (defined($translation_table) );
$self->cpus($cpus) if ( defined($cpus) );
$self->group_limit($group_limit) if ( defined($group_limit) );

}

Expand All @@ -97,19 +103,23 @@ sub run {
dont_delete_files => $self->dont_delete_files,
dont_create_rplots => $self->dont_create_rplots,
verbose_stats => $self->verbose_stats,
group_limit => $self->group_limit,
);
$obj->run();


if($self->output_multifasta_files == 1)

my $output_gene_files = $self->_find_input_files;
my $seg = Bio::PanGenome::External::ProteinMuscleAlignmentFromNucleotides->new(
fasta_files => $output_gene_files,
job_runner => $self->job_runner,
translation_table => $self->translation_table,
cpus => $self->cpus
);
$seg->run();

# Cleanup intermediate multifasta files
if($self->output_multifasta_files == 0)
{
my $output_gene_files = $self->_find_input_files;
my $seg = Bio::PanGenome::External::ProteinMuscleAlignmentFromNucleotides->new(
fasta_files => $output_gene_files,
job_runner => $self->job_runner,
translation_table => $self->translation_table
);
$seg->run();
remove_tree('pan_genome_sequences');
}
}

Expand Down Expand Up @@ -151,6 +161,7 @@ sub usage_text {
-c output_clusters_filename
-f file_of_proteins
-i file_of_gffs
--processors number of processors
--verbose_stats

# This help message
Expand Down
9 changes: 8 additions & 1 deletion lib/Bio/PanGenome/CommandLine/ParallelAllAgainstAllBlastp.pm
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'blast_results' );
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );

Expand All @@ -29,14 +30,15 @@ has '_error_message' => ( is => 'rw', isa => 'Str' );
sub BUILD {
my ($self) = @_;

my ( $fasta_files, $output_filename, $job_runner, $makeblastdb_exec, $blastp_exec, $help );
my ( $fasta_files, $output_filename, $job_runner, $makeblastdb_exec, $blastp_exec, $help, $cpus );

GetOptionsFromArray(
$self->args,
'o|output=s' => \$output_filename,
'j|job_runner=s' => \$job_runner,
'm|makeblastdb_exec=s' => \$makeblastdb_exec,
'b|blastp_exec=s' => \$blastp_exec,
'p|processors=i' => \$cpus,
'h|help' => \$help,
);

Expand All @@ -49,6 +51,7 @@ sub BUILD {
$self->job_runner($job_runner) if ( defined($job_runner) );
$self->makeblastdb_exec($makeblastdb_exec) if ( defined($makeblastdb_exec) );
$self->blastp_exec($blastp_exec) if ( defined($blastp_exec) );
$self->cpus($cpus) if ( defined($cpus) );

for my $filename ( @{ $self->args } ) {
if ( !-e $filename ) {
Expand Down Expand Up @@ -94,6 +97,7 @@ sub run {
fasta_file => $output_combined_filename,
blast_results_file_name => $self->output_filename,
job_runner => $self->job_runner,
cpus => $self->cpus,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec
);
Expand All @@ -112,6 +116,9 @@ sub usage_text {

# Provide an output filename
parallel_all_against_all_blastp -o blast_results example.faa

# number of processors to use
parallel_all_against_all_blastp -p 10 example.faa

# This help message
parallel_all_against_all_blastp -h
Expand Down
Loading