diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 1473d06..ea997ec --- a/README.md +++ b/README.md @@ -17,6 +17,43 @@ The categories of virus clusters represent the range of genomes in which this vi - Fasta_files/: intermediary files, including predicted proteins - Tab_files/: intermediary files, including results of the search agasint PFAM and the virus database. +VirSorter results can be imported into [Anvi'o](http://merenlab.org/software/anvio/) by following [these instructions](http://merenlab.org/2018/02/08/importing-virsorter-annotations/). + +# Using a conda virtual environment (tested on Ubuntu and CentOS) +* First install [Anaconda or Miniconda](https://conda.io/docs/user-guide/install/index.html) +* Download the databases required by VirSorter which have been converted to be used with HMMER version 3.1b2. Change to the directory where you want the databases be, and then run the following commands: +``` +wget https://zenodo.org/record/1168727/files/virsorter-data-v2.tar.gz +md5sum virsorter-data-v2.tar.gz +#m5sum should return dd12af7d13da0a85df0a9106e9346b45 +tar -xvzf virsorter-data-v2.tar.gz +``` +* Create and install your conda virtual environment. Change to the directory where you want VirSorter to be installed and run the following commands: +``` +conda create --name virsorter -c bioconda mcl=14.137 muscle blast perl-bioperl perl-file-which hmmer=3.1b2 perl-parallel-forkmanager perl-list-moreutils diamond +git clone https://github.com/simroux/VirSorter.git +cd VirSorter/Scripts +make +``` +* To run VirSorter from any directory, you can make symbolic links to `VirSorter/wrapper_phage_contigs_sorter_iPlant.pl` and `VirSorter/Scripts` and place them in the `bin` folder for your "virsorter" conda environment. An example location of this `bin` folder is `~/miniconda/envs/virsorter/bin`. Substitute this path with the path to the `bin` folder for your newly created "virsorter" environment. +``` +ln -s ~/Applications/VirSorter/wrapper_phage_contigs_sorter_iPlant.pl ~/miniconda/envs/virsorter/bin +ln -s ~/Applications/VirSorter/Scripts ~/miniconda/envs/virsorter/bin +``` +* Finally, you'll need to download MetaGeneAnnotator ([Noguchi et al, 2006](https://doi.org/10.1093/nar/gkl723)). You can put this directly in the "virsorter" environment's `bin` folder alongside the VirSorter symbolic links taht were just created. +``` +cd ~/miniconda/envs/virsorter/bin +wget http://metagene.nig.ac.jp/metagene/mga_x86_64.tar.gz +tar -xvzf metagene/mga_x86_64.tar.gz +``` + +To run VirSorter, type the following: + +``` +source activate virsorter +wrapper_phage_contigs_sorter_iPlant.pl -f assembly.fasta --db 1 --wdir output_directory --ncpu 4 --data-dir /path/to/virsorter-data +``` + # Docker - from DockerHub * Download the databases required by VirSorter, available as a tarball archive on iMicrobe: http://mirrors.iplantcollaborative.org/browse/iplant/home/shared/imicrobe/VirSorter/virsorter-data.tar.gz @@ -43,7 +80,7 @@ Install the following into a "bin" directory: * Metagene Annotator (http://metagene.nig.ac.jp/metagene/download_mga.html) * MUSCLE (http://www.drive5.com/muscle/) * BLAST+ (ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/) - +* DIAMOND (https://github.com/bbuchfink/diamond) ## Data Container diff --git a/Scripts/Sliding_windows_3.c b/Scripts/Sliding_windows_3.c index cc18a06..45ff41d 100644 --- a/Scripts/Sliding_windows_3.c +++ b/Scripts/Sliding_windows_3.c @@ -1,259 +1,284 @@ -#include -#include -#include -#include - -long double factorial(unsigned n){ - long double f=1; - while(n>0){f*=n--;} - return f; -} - -long double combination(unsigned k,unsigned n){ - long double f=(factorial(n) / (factorial(k) * factorial(n-k))); - return f; -} - - -long double combination_eff(unsigned k,unsigned n){ - long double num=1; - if (k<(n/2)){k=n-k;} - int n_2=n; - while (n_2>k){num*=n_2--;} - long double f= num / factorial(n-k); - return f; -} - -long double proba_n(unsigned n,unsigned k, long double proba){ - long double result=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); // New way more efficient to compute combination - return result; -} - - -long double proba_more_than(int n,int k, long double proba){ - long double result=0.0; - while(k<=n) { - result+=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); - k++; - } - return result; -} - - -long double proba_less_than(int n,int k, long double proba){ - long double result=0.0; - while(k>=0) { - result+=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); - k--; - } - return result; -} - -int get_th(int size_window,long double threshold, long double proba){ - int th_nb_gene=size_window+1; - long double p_t=0.0; -// printf("starting at %d / with proba %LE\n",th_nb_gene,proba); - while(p_t<=threshold && th_nb_gene>0){ - th_nb_gene--; - p_t = p_t + proba_n(size_window,th_nb_gene,proba); -// printf("\tp(x>=%d) = %LE\n",th_nb_gene,p_t); - } - return th_nb_gene; -} - - -int get_th_less(int size_window,long double threshold, long double proba){ - int th_nb_gene=-1; - long double p_t=0.0; - while(p_t<=threshold && th_nb_gene=0 && j>=0 && istore[start][size][type]){ - result=0; -// i=start+hood+1;j=size+hood+1; - } - } - } - } - return result; -} - - -long double log10perso(long double x){ - return log(x)/log(10); -} - - -int main(int argc, char *argv[]) -{ -// printf( "I am alive! Beware.\n" ); - FILE *ifp, *reffile; - char* refFilename=argv[1];char* inputFilename=argv[2];char* outputFilename=argv[3]; - reffile=fopen(refFilename,"r"); - int nb_genes=0,phage=0,pfam=0,unch=0,size=0,strand=0,hallmark=0,i=0,noncaudo=0; - float f_size=0.0; - long double p_phage=0.0,p_pfam=0.0,p_unch=0.0,p_strand=0.0,p_noncaudo=0.0; - if (reffile == NULL) { - fprintf(stderr, "Can't open input file %s\n",refFilename); - exit(1); - } - while (fscanf(reffile,"%Lf %Lf %Lf %Lf %f %Lf", &p_phage, &p_pfam, &p_unch, &p_strand, &f_size, &p_noncaudo) == 6) {} - printf("refs => %LE %LE %LE %LE %f %LE\n", p_phage, p_pfam, p_unch, p_strand, f_size, p_noncaudo); - fclose(reffile); - ifp = fopen(inputFilename, "r"); - if (ifp == NULL) { - fprintf(stderr, "Can't open input file %s!\n",inputFilename); - exit(1); - } - if (fscanf(ifp, "%d", &nb_genes) == 1){ -// printf("%d genes\n",nb_genes); - } - // Alloc memory for gene tables - int t_phage[nb_genes],t_pfam[nb_genes],t_unch[nb_genes], t_size[nb_genes],t_strand[nb_genes],t_hallmark[nb_genes],t_noncaudo[nb_genes]; - while (fscanf(ifp,"%d %d %d %d %d %d %d", &phage, &noncaudo, &pfam, &unch, &size, &strand, &hallmark) == 7) { -// printf("gene %d => %d %d %d %d %d %d %d\n", i, phage, noncaudo, pfam, unch, size, strand, hallmark); - t_phage[i]=phage; - t_noncaudo[i]=noncaudo; - t_pfam[i]=pfam; - t_unch[i]=unch; - t_size[i]=size; - t_strand[i]=strand; - t_hallmark[i]=hallmark; - i++; - } - fclose(ifp); - if (nb_genes!=i){ - printf("Houston we got a problem !!!!!! : we had %d genes and we count %d lines\n",nb_genes,i); - exit(1); - } -// // set up sliding windows - int min=10,max=100; - if (min>nb_genes){min=nb_genes;} - if (max>nb_genes){max=nb_genes;} -// // how many sliding windows will we have ? - int k=0,j=0,max_g=0,c_phage=0,c_pfam=0,pred_nb_s_w=0,t=0,th_nb_gene=0; - for (k=min;k<=max;k++){ - pred_nb_s_w+=nb_genes-k+1; - } -// printf("Predicting %d sliding windows\n",pred_nb_s_w); - // computing the threshold for each size of sliding window -// printf("Trying to allocate the memory 1\n"); - long double th=0.01/pred_nb_s_w,p_t=0.0; - // alloc memory for score matrix for the 6 metrics - double ***store=malloc(nb_genes*sizeof(double **)); - if (store==NULL){printf("out of memory\n");exit(1);} - for(i=0; i < nb_genes; i++){ - store[i] = malloc(max * sizeof(double *)); - if(store[i] == NULL){printf("out of memory\n");exit(1);} - for (j=0;j<=max;j++){ - store[i][j] = malloc(6 * sizeof(double )); - if(store[i][j] == NULL){printf("out of memory\n");exit(1);} - for (k=0;k<6;k++){store[i][j][k]=0;} - } - } -// printf("Memory Allocated and Initialized for %d %d 5\n",nb_genes,max); - int store_h[nb_genes][max]; - int n_phage=0,n_pfam=0,n_short=0,n_switch=0,n_unch=0,n_hallmark=0,n_noncaudo=0; - printf("For this contig we'll have %d sliding windows (= nb of comparison)\n",pred_nb_s_w); - for (k=max;k>=min;k--){ - int th_phage=k,th_pfam=k,th_size=k,th_unch=k,th_strand=k,th_noncaudo=k; - // we get all thresholds - th_phage=get_th(k,th,p_phage); -// printf("For window size %d, you will need at least %d phage genes to be significant\n",k,th_phage); - th_pfam=get_th_less(k,th,p_pfam); - th_unch=get_th(k,th,p_unch); -// printf("For window size %d, you will need at least %d uncharacterized genes to be significant\n",k,th_unch); - th_size=get_th(k,th,0.1); - th_strand=get_th_less(k,th,p_strand); - th_noncaudo=get_th(k,th,p_noncaudo); -// printf("For window size %d, you will need at least %d noncaudo genes to be significant\n",k,th_noncaudo); -// printf("////// Sliding window of %d genes -> th %d\n",k,th_phage); - // For all the sliding windows of this size, we count and compute and store the significativity value if > sig - for (i=0;i<(nb_genes-k+1);i++){ - n_phage=0;n_pfam=0;n_unch=0;n_short=0;n_switch=0;n_hallmark=0;n_noncaudo=0; -// // Counting - for (j=i;j<(i+k);j++){ - n_phage+=t_phage[j]; -// printf("Adding %d to the number of phage genes (%d)\n",t_phage[j],j); - n_pfam+=t_pfam[j]; - n_unch+=t_unch[j]; - n_short+=t_size[j]; - n_switch+=t_strand[j]; - n_hallmark+=t_hallmark[j]; - n_noncaudo+=t_noncaudo[j]; - } - unsigned tag=0; -// // If above thresholds - if (n_phage>th_phage){ -// // Calculate and store significativity - store[i][k][0]=-1*log10(proba_more_than(k,n_phage,p_phage)*pred_nb_s_w);tag=1; -// printf("Phage => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 0\n",n_phage,th_phage,store[i][k][0],i,k); - } - if (n_pfam %d is below the threshold %d, so we compute its significativity %E, that we store in %d, %d, 1\n",n_pfam,th_pfam,store[i][k][1],i,k); - } - if (n_unch>th_unch){ -// // Calculate and store significativity - store[i][k][2]=-1*log10(proba_more_than(k,n_unch,p_unch)*pred_nb_s_w);tag=1; -// printf("Unch => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 2\n",n_unch,th_unch,store[i][k][2],i,k); - } - if (n_short>th_size){ -// // Calculate and store significativity - store[i][k][3]=-1*log10(proba_more_than(k,n_short,0.1)*pred_nb_s_w);tag=1; -// printf("Short => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 3\n",n_short,th_size,store[i][k][3],i,k); - } - if (n_switch %d is below the threshold %d, so we compute its significativity %E, that we store in %d, %d, 4\n",n_switch,th_strand,store[i][k][4],i,k); - } - if (n_noncaudo>th_noncaudo){ -// // Calculate and store significativity - store[i][k][5]=-1*log10(proba_more_than(k,n_noncaudo,p_noncaudo)*pred_nb_s_w);tag=1; -// printf("Phage => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 0\n",n_phage,th_phage,store[i][k][0],i,k); - } - if (tag==1){store_h[i][k]=n_hallmark;} - } - } - // We look for local maxima and export the results - FILE *ofp; - ofp = fopen(outputFilename, "w"); - if (ofp == NULL) { - fprintf(stderr, "Can't open output file %s!\n",outputFilename); - exit(1); - } - for (k=max;k>=min;k--){ - for (i=0;i<(nb_genes-k+1);i++){ - for (j=0;j<6;j++){ - if (store[i][k][j] != 0.0){ // the stored value is not null -// printf("potential local maximum %d %d %d %E %d\n",i,k,j,store[i][k][j],store_h[i][k]); - if (is_local_maximum(i,k,j,nb_genes-1,max,store)==1){ // and is a local maxima - // so we print it, with the nb_hallmark (start / window size / type / sig / nb_hallmark) -// printf("local maximum ! %d %d %d %E %d\n",i,k,j,store[i][k][j],store_h[i][k]); - // i - start gene / k - sliding window size / j - proof typ (0 - phage / 1 - pfam / 2 - unch / 3 - size / 4 - strand) - fprintf(ofp, "%d\t%d\t%d\t%.14lf\t%d\n",i,k,j,store[i][k][j],store_h[i][k]); - } - } - } - } - } - fclose(ofp); - printf("done"); - // We export the results - return 0; -} +#include +#include +#include +#include + +long double factorial(unsigned n){ + long double f=1; + while(n>0){f*=n--;} + return f; +} + +long double combination(unsigned k,unsigned n){ + long double f=(factorial(n) / (factorial(k) * factorial(n-k))); + return f; +} + + +long double combination_eff(unsigned k,unsigned n){ + long double num=1; + if (k<(n/2)){k=n-k;} + int n_2=n; + while (n_2>k){num*=n_2--;} + long double f= num / factorial(n-k); + return f; +} + +long double proba_n(unsigned n,unsigned k, long double proba){ + long double result=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); // New way more efficient to compute combination + return result; +} + + +long double proba_more_than(int n,int k, long double proba){ + long double result=0.0; + while(k<=n) { + result+=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); + k++; + } + return result; +} + + +long double proba_less_than(int n,int k, long double proba){ + long double result=0.0; + while(k>=0) { + result+=combination_eff(k,n) * powl(proba,k) * powl((1-proba),(n-k)); + k--; + } + return result; +} + +int get_th(int size_window,long double threshold, long double proba){ + int th_nb_gene=size_window+1; + long double p_t=0.0; +// printf("starting at %d / with proba %LE\n",th_nb_gene,proba); + while(p_t<=threshold && th_nb_gene>0){ + th_nb_gene--; + p_t = p_t + proba_n(size_window,th_nb_gene,proba); +// printf("\tp(x>=%d) = %LE\n",th_nb_gene,p_t); + } + return th_nb_gene; +} + + +int get_th_less(int size_window,long double threshold, long double proba){ + int th_nb_gene=-1; + long double p_t=0.0; + while(p_t<=threshold && th_nb_gene=0 && j>=0 && istore[start][size][type]){ + result=0; +// i=start+hood+1;j=size+hood+1; + } + } + } + } + return result; +} + + +long double log10perso(long double x){ + return log(x)/log(10); +} + + +int main(int argc, char **argv) +{ +// printf( "I am alive! Beware.\n" ); + FILE *ifp, *reffile; +// char* refFilename=argv[1]; + char* inputFilename=argv[2];char* outputFilename=argv[3]; + + char *refFilename; + char *buffer = NULL; + int read; + unsigned long len; + read = getline(&refFilename, &len, stdin); + if (-1 != read){ +// puts(refFilename); + } else { + printf("No line read...\n"); + exit(1); + } + refFilename[strcspn(refFilename, "\n")] = 0; +// fprintf(stderr, "All good for file --%s--\n",refFilename); +// refFilename="/global/dna/projectdirs/MEP/tools/VirSorter/Generic_ref_file.refs"; + reffile=fopen(refFilename,"r"); + int nb_genes=0,phage=0,pfam=0,unch=0,size=0,strand=0,hallmark=0,i=0,noncaudo=0; + float f_size=0.0; + long double p_phage=0.0,p_pfam=0.0,p_unch=0.0,p_strand=0.0,p_noncaudo=0.0; + if (reffile == NULL) { + fprintf(stderr, "Can't open input file %s\n",refFilename); + exit(1); + } + while (fscanf(reffile,"%Lf %Lf %Lf %Lf %f %Lf", &p_phage, &p_pfam, &p_unch, &p_strand, &f_size, &p_noncaudo) == 6) {} +// printf("refs => %LE %LE %LE %LE %f %LE\n", p_phage, p_pfam, p_unch, p_strand, f_size, p_noncaudo); + fclose(reffile); +// ifp = fopen(inputFilename, "r"); +// if (ifp == NULL) { +// fprintf(stderr, "Can't open input file %s!\n",inputFilename); +// exit(1); +// } +// if (fscanf(ifp, "%d", &nb_genes) == 1){ +// printf("%d genes\n",nb_genes); +// } + if (fscanf(stdin, "%d", &nb_genes) == 1){ +// printf("%d genes\n",nb_genes); + } + // Alloc memory for gene tables + int t_phage[nb_genes],t_pfam[nb_genes],t_unch[nb_genes], t_size[nb_genes],t_strand[nb_genes],t_hallmark[nb_genes],t_noncaudo[nb_genes]; +// while (fscanf(ifp,"%d %d %d %d %d %d %d", &phage, &noncaudo, &pfam, &unch, &size, &strand, &hallmark) == 7) { + while (fscanf(stdin,"%d %d %d %d %d %d %d", &phage, &noncaudo, &pfam, &unch, &size, &strand, &hallmark) == 7) { +// printf("gene %d => %d %d %d %d %d %d %d\n", i, phage, noncaudo, pfam, unch, size, strand, hallmark); + t_phage[i]=phage; + t_noncaudo[i]=noncaudo; + t_pfam[i]=pfam; + t_unch[i]=unch; + t_size[i]=size; + t_strand[i]=strand; + t_hallmark[i]=hallmark; + i++; + } +// printf("Remaining lines\n"); + while (getline(&buffer, &len, stdin)!= -1){ + puts(buffer); + } +// fclose(ifp); + if (nb_genes!=i){ + printf("Houston we got a problem !!!!!! : we had %d genes and we count %d lines\n",nb_genes,i); + exit(1); + } +// // set up sliding windows + int min=10,max=100; + if (min>nb_genes){min=nb_genes;} + if (max>nb_genes){max=nb_genes;} +// // how many sliding windows will we have ? + int k=0,j=0,max_g=0,c_phage=0,c_pfam=0,pred_nb_s_w=0,t=0,th_nb_gene=0; + for (k=min;k<=max;k++){ + pred_nb_s_w+=nb_genes-k+1; + } +// printf("Predicting %d sliding windows\n",pred_nb_s_w); + // computing the threshold for each size of sliding window +// printf("Trying to allocate the memory 1\n"); + long double th=0.01/pred_nb_s_w,p_t=0.0; + // alloc memory for score matrix for the 6 metrics + double ***store=malloc(nb_genes*sizeof(double **)); + if (store==NULL){printf("out of memory\n");exit(1);} + for(i=0; i < nb_genes; i++){ + store[i] = malloc(max * sizeof(double *)); + if(store[i] == NULL){printf("out of memory\n");exit(1);} + for (j=0;j<=max;j++){ + store[i][j] = malloc(6 * sizeof(double )); + if(store[i][j] == NULL){printf("out of memory\n");exit(1);} + for (k=0;k<6;k++){store[i][j][k]=0;} + } + } +// printf("Memory Allocated and Initialized for %d %d 5\n",nb_genes,max); + int store_h[nb_genes][max]; + int n_phage=0,n_pfam=0,n_short=0,n_switch=0,n_unch=0,n_hallmark=0,n_noncaudo=0; +// printf("For this contig we'll have %d sliding windows (= nb of comparison)\n",pred_nb_s_w); + for (k=max;k>=min;k--){ + int th_phage=k,th_pfam=k,th_size=k,th_unch=k,th_strand=k,th_noncaudo=k; + // we get all thresholds + th_phage=get_th(k,th,p_phage); +// printf("For window size %d, you will need at least %d phage genes to be significant\n",k,th_phage); + th_pfam=get_th_less(k,th,p_pfam); + th_unch=get_th(k,th,p_unch); +// printf("For window size %d, you will need at least %d uncharacterized genes to be significant\n",k,th_unch); + th_size=get_th(k,th,0.1); + th_strand=get_th_less(k,th,p_strand); + th_noncaudo=get_th(k,th,p_noncaudo); +// printf("For window size %d, you will need at least %d noncaudo genes to be significant\n",k,th_noncaudo); +// printf("////// Sliding window of %d genes -> th %d\n",k,th_phage); + // For all the sliding windows of this size, we count and compute and store the significativity value if > sig + for (i=0;i<(nb_genes-k+1);i++){ + n_phage=0;n_pfam=0;n_unch=0;n_short=0;n_switch=0;n_hallmark=0;n_noncaudo=0; +// // Counting + for (j=i;j<(i+k);j++){ + n_phage+=t_phage[j]; +// printf("Adding %d to the number of phage genes (%d)\n",t_phage[j],j); + n_pfam+=t_pfam[j]; + n_unch+=t_unch[j]; + n_short+=t_size[j]; + n_switch+=t_strand[j]; + n_hallmark+=t_hallmark[j]; + n_noncaudo+=t_noncaudo[j]; + } + unsigned tag=0; +// // If above thresholds + if (n_phage>th_phage){ +// // Calculate and store significativity + store[i][k][0]=-1*log10(proba_more_than(k,n_phage,p_phage)*pred_nb_s_w);tag=1; +// printf("Phage => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 0\n",n_phage,th_phage,store[i][k][0],i,k); + } + if (n_pfam %d is below the threshold %d, so we compute its significativity %E, that we store in %d, %d, 1\n",n_pfam,th_pfam,store[i][k][1],i,k); + } + if (n_unch>th_unch){ +// // Calculate and store significativity + store[i][k][2]=-1*log10(proba_more_than(k,n_unch,p_unch)*pred_nb_s_w);tag=1; +// printf("Unch => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 2\n",n_unch,th_unch,store[i][k][2],i,k); + } + if (n_short>th_size){ +// // Calculate and store significativity + store[i][k][3]=-1*log10(proba_more_than(k,n_short,0.1)*pred_nb_s_w);tag=1; +// printf("Short => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 3\n",n_short,th_size,store[i][k][3],i,k); + } + if (n_switch %d is below the threshold %d, so we compute its significativity %E, that we store in %d, %d, 4\n",n_switch,th_strand,store[i][k][4],i,k); + } + if (n_noncaudo>th_noncaudo){ +// // Calculate and store significativity + store[i][k][5]=-1*log10(proba_more_than(k,n_noncaudo,p_noncaudo)*pred_nb_s_w);tag=1; +// printf("Phage => %d is beyond the threshold %d, so we compute its significativity %E, that we store in %d, %d, 0\n",n_phage,th_phage,store[i][k][0],i,k); + } + if (tag==1){store_h[i][k]=n_hallmark;} + } + } + // We look for local maxima and export the results + FILE *ofp; +// ofp = fopen(outputFilename, "w"); +// if (ofp == NULL) { +// printf(stderr, "Can't open output file %s!\n",outputFilename); +// exit(1); +// } + for (k=max;k>=min;k--){ + for (i=0;i<(nb_genes-k+1);i++){ + for (j=0;j<6;j++){ + if (store[i][k][j] != 0.0){ // the stored value is not null +// printf("potential local maximum %d %d %d %E %d\n",i,k,j,store[i][k][j],store_h[i][k]); + if (is_local_maximum(i,k,j,nb_genes-1,max,store)==1){ // and is a local maxima + // so we print it, with the nb_hallmark (start / window size / type / sig / nb_hallmark) +// printf("local maximum ! %d %d %d %E %d\n",i,k,j,store[i][k][j],store_h[i][k]); + // i - start gene / k - sliding window size / j - proof typ (0 - phage / 1 - pfam / 2 - unch / 3 - size / 4 - strand) +// fprintf(ofp, "%d\t%d\t%d\t%.14lf\t%d\n",i,k,j,store[i][k][j],store_h[i][k]); + printf("%d\t%d\t%d\t%.14lf\t%d\n",i,k,j,store[i][k][j],store_h[i][k]); + } + } + } + } + } +// fclose(ofp); +// printf("done"); + // We export the results + return 0; +} diff --git a/Scripts/Step_0_make_new_clusters.pl b/Scripts/Step_0_make_new_clusters.pl index b595ac3..277a57a 100755 --- a/Scripts/Step_0_make_new_clusters.pl +++ b/Scripts/Step_0_make_new_clusters.pl @@ -5,23 +5,32 @@ use File::Spec::Functions; use File::Path 'mkpath'; use File::Which 'which'; +use Parallel::ForkManager; # Script to generate a new db with putative new clusters # Argument 0 : revision directory # Argument 1 : Fasta file of the predicted proteins # Argument 2 : Fasta file of the unclustered from previous Runs # Argument 3 : Liste of prots to try to cluster -if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[3]))) +# Argument 4 : Number of CPUs to use +# Argument 5 : (optional) specify "diamond" if this script should use diamond instead of blastp +if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[4]))) { print "# Script to generate a new db with putative new clusters # Argument 0 : revision directory # Argument 1 : Fasta file of the predicted proteins # Argument 2 : Fasta file of the unclustered from previous Runs -# Argument 3 : Liste of prots to try to cluster\n"; +# Argument 3 : Liste of prots to try to cluster +# Argument 4 : Number of CPUs to use +# Argument 5 : specify \"diamond\" if this script should use diamond instead of blastp\n"; die "\n"; } - +my $diamond = 0; +if (defined($ARGV[5])) { + $diamond = 1; +} my $path_to_blastp = which("blastp") or die "No blastp\n"; +my $path_to_diamond = ""; my $MCX_LOAD = which("mcxload") or die "No mcxload\n"; my $MCL = which("mcl") or die "No mcl\n"; my $path_to_muscle = which("muscle") or die "No muscle\n"; @@ -29,6 +38,10 @@ my $path_to_hmmpress = which("hmmpress") or die "No hmmpress\n"; my $path_to_makeblastdb = which("makeblastdb") or die "No makeblastdb\n"; +if ($diamond == 1) { + $path_to_diamond = which('diamond') or die "Missing diamond\n"; +} +my $n_cpus=$ARGV[4]; my $r_dir=$ARGV[0]; $r_dir=~/(r_\d*)\/?$/; my $r_number=$1; @@ -73,6 +86,9 @@ my $db= catfile($r_dir, "pool_new_proteins"); my $cmd_format="$path_to_makeblastdb -dbtype prot -in $pool_new -out $db"; +if ($diamond == 1) { + $cmd_format="$path_to_diamond makedb --in $pool_new --db $db"; +} print "$cmd_format\n"; my $out=`$cmd_format`; print "makeblastdb : $out\n"; @@ -82,7 +98,10 @@ print "Cat : $cmd_cat\n"; # BLAST des unclustered et des new contre les new my $out_blast=catfile($r_dir, "pool_unclustered-and-new-proteins-vs-new-proteins.tab"); -my $cmd_blast="$path_to_blastp -query $pool_new -db $db -out $out_blast -outfmt 6 -num_threads 10 -evalue 0.00001"; # On 10 cores to keep a few alive for the rest of the scripts +my $cmd_blast="$path_to_blastp -query $pool_new -db $db -out $out_blast -outfmt 6 -num_threads $n_cpus -evalue 0.00001"; +if ($diamond == 1) { + $cmd_blast="$path_to_diamond blastp --query $pool_new --db $db --out $out_blast --outfmt 6 -b 2 --threads $n_cpus -k 500 --evalue 0.00001 --more-sensitive"; +} print "$cmd_blast\n"; $out=`$cmd_blast`; print "Blast : $out\n"; @@ -116,7 +135,7 @@ $out=`$cmd_mcxload`; print "Mxc Load : $out\n"; my $dump_file=catfile($r_dir, "new_clusters.csv"); -my $cmd_mcl="$MCL $out_mci -I 2 -use-tab $out_tab -o $dump_file"; +my $cmd_mcl="$MCL $out_mci -I 2 -te $n_cpus -use-tab $out_tab -o $dump_file"; print "$cmd_mcl\n"; $out=`$cmd_mcl`; print "Mcl : $out\n"; @@ -192,7 +211,12 @@ close S1; close S2; print "making a blastable db from the new unclustered\n"; -$out=`$path_to_makeblastdb -dbtype prot -in $pool_new_unclustered -out $blastable_new_unclustered`; +my $unclustered_blast_db_cmd = "$path_to_makeblastdb -dbtype prot -in $pool_new_unclustered -out $blastable_new_unclustered"; +if ($diamond == 1) { + $unclustered_blast_db_cmd = "$path_to_diamond makedb --in $pool_new_unclustered --db $blastable_new_unclustered"; +} + +$out=`$unclustered_blast_db_cmd`; # on réduit aussi le fichier blast qu'on ajoute au blast des unclustered open(BL,"<$out_blast") || die "pblm ouverture fichier $out_blast\n"; open(S1,">$blast_unclustered") || die "pblm ouverture fichier $blast_unclustered\n"; @@ -206,9 +230,13 @@ close BL; close S1; -my $tag=0; +#my $tag=0; + +my $pm = new Parallel::ForkManager($n_cpus); #Starts the parent process for parallelizing the next foreach loop, sets max number of parallel processes +$pm->set_waitpid_blocking_sleep(0); foreach(sort keys %clusters){ - $tag=1; + $pm->start and next; #do the fork +# $tag=1; my $ali_id=$_; my $path_to_file= catfile($r_dir, "clusts", $ali_id); my $path_to_fasta=catfile($r_dir, "clusts", $ali_id . ".faa"); @@ -217,27 +245,31 @@ if (-e $path_to_ali){ `rm $path_to_ali $path_to_hmm`; } - my $muscle_out= catfile($r_dir, "log_out_muscle"); - my $muscle_err= catfile($r_dir, "log_err_muscle"); + my $muscle_out= catfile($r_dir, "log_out_muscle_$$"); + my $muscle_err= catfile($r_dir, "log_err_muscle_$$"); `$path_to_muscle -in $path_to_fasta -out $path_to_ali > $muscle_out 2> $muscle_err`; - my $out_stokcholm=$path_to_ali.".stockholm"; - open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n"; - print S1 "# STOCKHOLM 1.0\n"; - open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n"; - while(){ - chomp($_); - if ($_=~/^>(.*)/){ - my $id=$1; - $id=~s/\s/_/g; - print S1 "\n$id "; - - } - else{print S1 "$_";} - } - close FA; - print S1 "\n//\n"; - `$path_to_hmmbuild --amino $path_to_hmm $out_stokcholm`; +# my $out_stokcholm=$path_to_ali.".stockholm"; +# open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n"; +# print S1 "# STOCKHOLM 1.0\n"; +# open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n"; +# while(){ +# chomp($_); +# if ($_=~/^>(.*)/){ +# my $id=$1; +# $id=~s/\s/_/g; +# print S1 "\n$id "; +# +# } +# else{print S1 "$_";} +# } +# close FA; +# print S1 "\n//\n"; +#HMMER can take aligned fasta as input, no need to convert. Commmenting out previous 16 lines. + `$path_to_hmmbuild --amino --cpu 1 --informat afa $path_to_hmm $path_to_ali`; + `rm $muscle_out $muscle_err`; + $pm->finish(0); # do the exit in the child process } +#$pm->wait_all_children; # wait until everything in the above foreach loop is done before moving on my @tab_hmm=<$r_dir/clusts/*.hmm>; if ($#tab_hmm>=0){ diff --git a/Scripts/Step_3_highlight_phage_signal.pl b/Scripts/Step_3_highlight_phage_signal.pl index 6c13a5c..699d171 100755 --- a/Scripts/Step_3_highlight_phage_signal.pl +++ b/Scripts/Step_3_highlight_phage_signal.pl @@ -4,28 +4,33 @@ use autodie; use File::Spec::Functions; use FindBin '$Bin'; +use Parallel::ForkManager; +use List::MoreUtils qw(natatime); # Script to measure metrics on the sliding window # Argument 0 : csv file of the contigs # Argument 1 : summary file of the phage fragments -if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[1]))) +# Argument 2 : number of CPUs to use in parallel processing of sliding window analysis +if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[1])) || (!defined($ARGV[2]))) { print "# Script to measure metrics on the sliding window # Argument 0 : csv file of the contigs # Argument 1 : summary file of the phage fragments -# Argument 2 (optional) : a file with the refs values that we could use instead of estimating them \n"; +# Argument 2 : number of CPUs to use in parallel processing of sliding window analysis +# Argument 3 (optional) : a file with the refs values that we could use instead of estimating them \n"; die "\n"; } $| = 1; my $csv_file = $ARGV[0]; my $out_file = $ARGV[1]; +my $n_cpus = $ARGV[2]; if ( -e $out_file ) { `rm $out_file`; } my $ref_file = $ARGV[0]; $ref_file =~ s/\.csv/.refs/g; my $do_ref_estimation = 0; -if (defined($ARGV[2])){ -# $ref_file=$ARGV[2]; - `cp $ARGV[2] $ref_file`; # That way, the ref file is in the result directory if a use wants to check it +if (defined($ARGV[3])){ +# $ref_file=$ARGV[3]; + `cp $ARGV[3] $ref_file`; # That way, the ref file is in the result directory if a use wants to check it $do_ref_estimation=1; } @@ -162,368 +167,406 @@ open S1, '>', $out_file; close S1; my $i=0; -foreach(@liste_contigs){ - my $contig_c=$_; - my @tab_genes=sort {$infos{$contig_c}{$a}{"order"} <=> $infos{$contig_c}{$b}{"order"}} keys %{$infos{$contig_c}}; - my $total_nb_genes=$#tab_genes+1; - ### Preparing data for C program - my $out_file_c=$ref_file; - $out_file_c=~s/\.refs/.tmp_$i/g; - my $out_file_c2=$ref_file; - $out_file_c2=~s/\.refs/.out_$i/g; - my $out_file_c3=$ref_file; - $out_file_c3=~s/\.refs/.out_$i-sorted/g; -# print "we have $out_file_c $out_file_c2 $out_file_c3\n"; - open MAP_C, '>', $out_file_c; - print MAP_C "$nb_genes{$contig_c}\n"; - my $last_strand="0"; - my $total_hallmark=0; - my $total_noncaudo=0; - foreach(@tab_genes){ - my $gene=$_; - my $tag=""; - # Line : PC / PFAM / UNCH / SIZE / STRAND / HALLMARK - if($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PC/){ - if ($infos{$contig_c}{$gene}{"category"}>=3){$tag="1\t1\t0\t0\t";$total_noncaudo++;} - else{$tag="1\t0\t0\t0\t";} - } - elsif($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PFAM/){$tag="0\t0\t1\t0\t";} - else{$tag="0\t0\t0\t1\t";} - if ($infos{$contig_c}{$gene}{"length"}<$th_gene_size){$tag.="1\t";} - else{$tag.="0\t";} - if (($last_strand eq "0") || ($infos{$contig_c}{$gene}{"strand"} eq $last_strand)){$tag.="0\t";} - else{$tag.="1\t";} - $last_strand=$infos{$contig_c}{$gene}{"strand"}; - if (($infos{$contig_c}{$gene}{"category"}==0) || ($infos{$contig_c}{$gene}{"category"}==3)){ - $tag.="1\t";$total_hallmark++; - print "Gene $contig_c / $gene -> category $infos{$contig_c}{$gene}{category} -> putative hallmark\n"; - } # look at putative hallmarklmark - else{$tag.="0\t";} - print MAP_C "$tag\n"; - } - close MAP_C; - ### Now go execute the C program - my $c_cmd="$path_to_c_script $ref_file $out_file_c $out_file_c2"; -# print "Step 1 - $c_cmd\n"; - my $out=`$c_cmd`; -# print "$out\n"; - $c_cmd="sort -r -n -k 4 $out_file_c2 > $out_file_c3"; -# print "Step 2 - $c_cmd\n"; - $out=`$c_cmd`; -# print "$out\n"; - ### reading the c program output to fill the match hash table / and removing overlap - my %match; - my %check; - my @check_gene; - open OUT_C, '<', $out_file_c3; - while(){ - chomp($_); - my @tab=split("\t",$_); - my $start=$tab[0]; - my $last=$tab[0]+$tab[1]-1; - my $fragment_id=$contig_c."-".$tab_genes[$start]."-".$tab_genes[$last]; - my $tag=0; - # Code : 0 phage / 1 pfam / 2 unch / 3 size / 4 strand switch - if ($tab[2]==0){ - if (overlap($fragment_id,$check{"phage"})==0){ - $match{$fragment_id}{"proof"}{"phage"}=$tab[3]; - $check{"phage"}{$fragment_id}=1; - $tag=1; - for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} - } - } - if ($tab[2]==1){ - if (overlap($fragment_id,$check{"pfam"})==0){ - $match{$fragment_id}{"proof"}{"pfam"}=$tab[3]; - $check{"pfam"}{$fragment_id}=1; - $tag=1; - for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + +my $num_contigs = scalar(@liste_contigs); +my $chunk_len = int($num_contigs/$n_cpus)+1; +#print "$chunk_len\n"; +my @grouped = @liste_contigs; +my $it = natatime $chunk_len, @grouped; + +my $pm = Parallel::ForkManager->new($n_cpus); #Starts the parent process for parallelizing the next foreach loop, sets max number of parallel processes +#$pm->set_waitpid_blocking_sleep(0); +while (my @vals = $it->()){ + $pm->start and next; #do the fork + foreach(@vals){ + my $contig_c=$_; + my @tab_genes=sort {$infos{$contig_c}{$a}{"order"} <=> $infos{$contig_c}{$b}{"order"}} keys %{$infos{$contig_c}}; + my $total_nb_genes=$#tab_genes+1; + ### Preparing data for C program + my $out_file_c=$ref_file; + $out_file_c=~s/\.refs/.tmp_$$/g; #The variable $$ is the PID for the fork + my $out_file_c2=$ref_file; + $out_file_c2=~s/\.refs/.out_$$/g; + my $out_file_c3=$ref_file; + $out_file_c3=~s/\.refs/.out_$$-sorted/g; + # print "we have $out_file_c $out_file_c2 $out_file_c3\n"; + #open MAP_C, '>', $out_file_c; + # # # # # # # # NEW WAY OF GIVING RESULTS TO C FILE -- MODIFICATION START HERE + # open(MAP_C,">$out_file_c") || die "pblm opening file $out_file_c\n"; # We don't create a file anymore + # print MAP_C "$nb_genes{$contig_c}\n"; + my $line_input="$ref_file\n$nb_genes{$contig_c}\n"; # We start a "line" instead, so put everything in a string ($line_input) + my $last_strand="0"; + my $total_hallmark=0; + my $total_noncaudo=0; + foreach(@tab_genes){ + my $gene=$_; + my $tag=""; + # Line : PC / noncaudo / PFAM / UNCH / SIZE / STRAND / HALLMARK + if($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PC/){ + if ($infos{$contig_c}{$gene}{"category"}>=3){$tag="1\t1\t0\t0\t";$total_noncaudo++;} + else{$tag="1\t0\t0\t0\t";} } + elsif($infos{$contig_c}{$gene}{"best_domain_hit"}=~/^PFAM/){$tag="0\t0\t1\t0\t";} + else{$tag="0\t0\t0\t1\t";} + if ($infos{$contig_c}{$gene}{"length"}<$th_gene_size){$tag.="1\t";} + else{$tag.="0\t";} + if (($last_strand eq "0") || ($infos{$contig_c}{$gene}{"strand"} eq $last_strand)){$tag.="0\t";} + else{$tag.="1\t";} + $last_strand=$infos{$contig_c}{$gene}{"strand"}; + if (($infos{$contig_c}{$gene}{"category"}==0) || ($infos{$contig_c}{$gene}{"category"}==3)){ + $tag.="1\t";$total_hallmark++; + print "Gene $contig_c / $gene -> category $infos{$contig_c}{$gene}{category} -> putative hallmark\n"; + } # look at putative hallmarklmark + else{$tag.="0\t";} + # # # # # We remove the PART where we write to the file, and we put the same info in the string instead + # print MAP_C "$tag\n"; + # } + # close MAP_C; + $line_input.=$tag."\n"; } - if ($tab[2]==2){ - if (overlap($fragment_id,$check{"unch"})==0){ - $match{$fragment_id}{"proof"}{"unch"}=$tab[3]; - $check{"unch"}{$fragment_id}=1; - $tag=1; - for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + # # # # # + + + # # # # # The call to the C script get replaced too + # my $c_cmd="$path_to_c_script $ref_file $out_file_c $out_file_c2"; + # print "Step 1 - $c_cmd\n"; + # my $out=`$c_cmd`; + # print "$out\n"; + # $c_cmd="sort -r -n -k 4 $out_file_c2 > $out_file_c3"; + # print "Step 2 - $c_cmd\n"; + # $out=`$c_cmd`; + # print "$out\n"; + ### reading the c program output to fill the match hash table / and removing overlap + ## Now go execute the C program + my $path_to_c_script= catfile($script_dir, "Sliding_windows_3"); + my $c_cmd="printf \"$line_input\" | $path_to_c_script | sort -r -n -k 4 "; + my $out=`$c_cmd`; + print "$out\n"; + # # # # #### SWITCH FILE VS NOFILE + my %match; + my %check; + my @check_gene; + + + # # # # # We used to read the output from the C script, now we simply have the same information but in a string ($out) + # open OUT_C, '<', $out_file_c3; + # while(){ + # chomp($_); + # my @tab=split("\t",$_); + my @tab_lines=split("\n",$out); + foreach my $line (@tab_lines){ + my @tab=split("\t",$line); + # # # # # + + ### Starting from there, it should be the same thing as the current version on your fork, there is only the "close OUT_C;" at the bottom of the loop to remove (l. 142 in this file) + my $start=$tab[0]; + my $last=$tab[0]+$tab[1]-1; + my $fragment_id=$contig_c."-".$tab_genes[$start]."-".$tab_genes[$last]; + my $tag=0; + # Code : 0 phage / 1 pfam / 2 unch / 3 size / 4 strand switch + if ($tab[2]==0){ + if (overlap($fragment_id,$check{"phage"})==0){ + $match{$fragment_id}{"proof"}{"phage"}=$tab[3]; + $check{"phage"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + } } - } - if ($tab[2]==3){ - if (overlap($fragment_id,$check{"avg_g_size"})==0){ - $match{$fragment_id}{"proof"}{"avg_g_size"}=$tab[3]; - $check{"avg_g_size"}{$fragment_id}=1; - $tag=1; + if ($tab[2]==1){ + if (overlap($fragment_id,$check{"pfam"})==0){ + $match{$fragment_id}{"proof"}{"pfam"}=$tab[3]; + $check{"pfam"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + } } - } - if ($tab[2]==4){ - if (overlap($fragment_id,$check{"switch"})==0){ - $match{$fragment_id}{"proof"}{"switch"}=$tab[3]; - $check{"switch"}{$fragment_id}=1; - $tag=1; + if ($tab[2]==2){ + if (overlap($fragment_id,$check{"unch"})==0){ + $match{$fragment_id}{"proof"}{"unch"}=$tab[3]; + $check{"unch"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + } } - } - if ($tab[2]==5){ - if (overlap($fragment_id,$check{"noncaudo"})==0){ - $match{$fragment_id}{"proof"}{"noncaudo"}=$tab[3]; - $check{"noncaudo"}{$fragment_id}=1; - $tag=1; - for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} + if ($tab[2]==3){ + if (overlap($fragment_id,$check{"avg_g_size"})==0){ + $match{$fragment_id}{"proof"}{"avg_g_size"}=$tab[3]; + $check{"avg_g_size"}{$fragment_id}=1; + $tag=1; + } } - } - if ($tag==1){ - # If a match, we also take the nb of hallmark genes, and the size - if ($tab[4]>0){$match{$fragment_id}{"hallmark"}=$tab[4];} - $match{$fragment_id}{"size"}=$tab[1]; - } - } - close OUT_C; - ### Ok, we read the C output, no we try (neatly) to merge all predictions for this sequence - my $n=0; - my %merged_match; - my $th_contig_size=$th_nb_genes_covered*$total_nb_genes; - my @tab_matches=sort { $match{$b}{"size"} <=> $match{$a}{"size"} } keys %match; - if (!defined($match{$tab_matches[0]}{"size"})){} # Not even an interesting region, skip to the next sequence - else{ - my $tag_complete=0; - my $i=0; - while ($match{$tab_matches[$i]}{"size"}>$th_contig_size && $tag_complete==0){ - if ($match{$tab_matches[$i]}{"size"}>$th_contig_size && (defined($match{$tab_matches[$i]}{"proof"}{"pfam"}) || defined($match{$tab_matches[$i]}{"proof"}{"phage"}) || defined($match{$tab_matches[$i]}{"proof"}{"unch"}) || defined($match{$tab_matches[$i]}{"proof"}{"noncaudo"}))){ # SEEMS LIKE WE HAVE A COMPLETE PHAGE SEQUENCE - $tag_complete=1; - my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; - if (defined($match{$fragment_id})){ - $merged_match{$fragment_id}=$match{$fragment_id}; # If we indeed have complete metrics, we take themn + if ($tab[2]==4){ + if (overlap($fragment_id,$check{"switch"})==0){ + $match{$fragment_id}{"proof"}{"switch"}=$tab[3]; + $check{"switch"}{$fragment_id}=1; + $tag=1; } - else{ - $merged_match{$fragment_id}{"size"}=$total_nb_genes;# Otherwise we store just the size - $merged_match{$fragment_id}{"hallmark"}=$total_hallmark;# And the total number of hallmark genes on this fragment + } + if ($tab[2]==5){ + if (overlap($fragment_id,$check{"noncaudo"})==0){ + $match{$fragment_id}{"proof"}{"noncaudo"}=$tab[3]; + $check{"noncaudo"}{$fragment_id}=1; + $tag=1; + for (my $i=$start;$i<=$last;$i++){$check_gene[$i]++;} } - $merged_match{$fragment_id}{"type"}="complete_phage";# And we store the type of fragment - foreach(@tab_matches){ - my $fragment_id=$_; - if ($match{$fragment_id}{"size"}<$total_nb_genes){ - my $r=get_overlap($fragment_id,\%merged_match); - if ($r eq "no"){ # if no overlap - $merged_match{$fragment_id}=$match{$fragment_id}; # NO OVERLAP WITH THE COMPLETE - print "!!!!!!!!!!!!!!!!!!! THIS SHOULD NOT BE POSSIBLE\n"; - } - else{ - # Overlap, we propagate the proof and note it "partial" - foreach(keys %{$match{$fragment_id}{"proof"}}){ - if (defined($merged_match{$r}{"proof"}{$_})){ - if ($merged_match{$r}{"proof"}{$_}=~/:/){ + } + if ($tag==1){ + # If a match, we also take the nb of hallmark genes, and the size + if ($tab[4]>0){$match{$fragment_id}{"hallmark"}=$tab[4];} + $match{$fragment_id}{"size"}=$tab[1]; + } + } + # close OUT_C; + ### Ok, we read the C output, no we try (neatly) to merge all predictions for this sequence + my $n=0; + my %merged_match; + my $th_contig_size=$th_nb_genes_covered*$total_nb_genes; + my @tab_matches=sort { $match{$b}{"size"} <=> $match{$a}{"size"} } keys %match; + if (!defined($match{$tab_matches[0]}{"size"})){} # Not even an interesting region, skip to the next sequence + else{ + my $tag_complete=0; + my $i=0; + while ($match{$tab_matches[$i]}{"size"}>$th_contig_size && $tag_complete==0){ + if ($match{$tab_matches[$i]}{"size"}>$th_contig_size && (defined($match{$tab_matches[$i]}{"proof"}{"pfam"}) || defined($match{$tab_matches[$i]}{"proof"}{"phage"}) || defined($match{$tab_matches[$i]}{"proof"}{"unch"}) || defined($match{$tab_matches[$i]}{"proof"}{"noncaudo"}))){ # SEEMS LIKE WE HAVE A COMPLETE PHAGE SEQUENCE + $tag_complete=1; + my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + if (defined($match{$fragment_id})){ + $merged_match{$fragment_id}=$match{$fragment_id}; # If we indeed have complete metrics, we take themn + } + else{ + $merged_match{$fragment_id}{"size"}=$total_nb_genes;# Otherwise we store just the size + $merged_match{$fragment_id}{"hallmark"}=$total_hallmark;# And the total number of hallmark genes on this fragment + } + $merged_match{$fragment_id}{"type"}="complete_phage";# And we store the type of fragment + foreach(@tab_matches){ + my $fragment_id=$_; + if ($match{$fragment_id}{"size"}<$total_nb_genes){ + my $r=get_overlap($fragment_id,\%merged_match); + if ($r eq "no"){ # if no overlap + $merged_match{$fragment_id}=$match{$fragment_id}; # NO OVERLAP WITH THE COMPLETE + print "!!!!!!!!!!!!!!!!!!! THIS SHOULD NOT BE POSSIBLE\n"; + } + else{ + # Overlap, we propagate the proof and note it "partial" + foreach(keys %{$match{$fragment_id}{"proof"}}){ + if (defined($merged_match{$r}{"proof"}{$_})){ + if ($merged_match{$r}{"proof"}{$_}=~/:/){ + $fragment_id=~/.*-(gene_\d*-gene_\d*)/; + $merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + } + else{} # already a score for the entire match, no pblm + } + else { $fragment_id=~/.*-(gene_\d*-gene_\d*)/; - $merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + $merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.","; } - else{} # already a score for the entire match, no pblm - } - else { - $fragment_id=~/.*-(gene_\d*-gene_\d*)/; - $merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.","; } } } } } + $i++; } - $i++; - } - if($tag_complete==0){ # No complete phage, putatively one or several prophages - # First get all the phage region - # We look for interesting regions my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; - my $tag=-1; - my $tag_h=0; - for (my $i=0;$i<$total_nb_genes;$i++){ - if ($tag>=0 && (!defined($check_gene[$i]) || $check_gene[$i]<1)){ # end of an interesting region - my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$i-1]; + if($tag_complete==0){ # No complete phage, putatively one or several prophages + # First get all the phage region + # We look for interesting regions my $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + my $tag=-1; + my $tag_h=0; + for (my $i=0;$i<$total_nb_genes;$i++){ + if ($tag>=0 && (!defined($check_gene[$i]) || $check_gene[$i]<1)){ # end of an interesting region + my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$i-1]; + if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ # Complete phage + $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + $merged_match{$fragment_id}{"type"}="complete_phage"; + $merged_match{$fragment_id}{"size"}=$total_nb_genes; + $merged_match{$fragment_id}{"hallmark"}=$tag_h; + } + else{ # Prophage + $merged_match{$fragment_id}{"size"}=$i-$tag; + $merged_match{$fragment_id}{"type"}="prophage"; + $merged_match{$fragment_id}{"hallmark"}=$tag_h; + } + $tag=-1; + $tag_h=0; + } + elsif ($tag==-1 && $check_gene[$i]>=1){ + $tag=$i; + $tag_h=0; + } + if ($infos{$contig_c}{$tab_genes[$i]}{"category"}==0 || $infos{$contig_c}{$tab_genes[$i]}{"category"}==3){$tag_h++;} # look at putative hallmark + } + if ($tag>=0){ + my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$#tab_genes]; + print "Region is $fragment_id .."; if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ # Complete phage + print "which is a complete phage\n"; $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; $merged_match{$fragment_id}{"type"}="complete_phage"; $merged_match{$fragment_id}{"size"}=$total_nb_genes; $merged_match{$fragment_id}{"hallmark"}=$tag_h; } else{ # Prophage - $merged_match{$fragment_id}{"size"}=$i-$tag; + print "which is a prophage\n"; + $merged_match{$fragment_id}{"size"}=$total_nb_genes-$tag; $merged_match{$fragment_id}{"type"}="prophage"; $merged_match{$fragment_id}{"hallmark"}=$tag_h; } - $tag=-1; - $tag_h=0; } - elsif ($tag==-1 && $check_gene[$i]>=1){ - $tag=$i; - $tag_h=0; - } - if ($infos{$contig_c}{$tab_genes[$i]}{"category"}==0 || $infos{$contig_c}{$tab_genes[$i]}{"category"}==3){$tag_h++;} # look at putative hallmark - } - if ($tag>=0){ - my $fragment_id.=$contig_c."-".$tab_genes[$tag]."-".$tab_genes[$#tab_genes]; - print "Region is $fragment_id .."; - if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ # Complete phage - print "which is a complete phage\n"; - $fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; - $merged_match{$fragment_id}{"type"}="complete_phage"; - $merged_match{$fragment_id}{"size"}=$total_nb_genes; - $merged_match{$fragment_id}{"hallmark"}=$tag_h; - } - else{ # Prophage - print "which is a prophage\n"; - $merged_match{$fragment_id}{"size"}=$total_nb_genes-$tag; - $merged_match{$fragment_id}{"type"}="prophage"; - $merged_match{$fragment_id}{"hallmark"}=$tag_h; - } - } - # Now we merge the annotation in these regions - foreach(@tab_matches){ - my $fragment_id=$_; - # Check if overlap - my $r=get_overlap($fragment_id,\%merged_match); - if ($r eq "no"){ } # if no overlap # not in an interesting region - else{ - # Overlap, we propagate the proof and note it "partial" - foreach(keys %{$match{$fragment_id}{"proof"}}){ - if (defined($merged_match{$r}{"proof"}{$_})){ - if ($merged_match{$r}{"proof"}{$_}=~/:/){ + # Now we merge the annotation in these regions + foreach(@tab_matches){ + my $fragment_id=$_; + # Check if overlap + my $r=get_overlap($fragment_id,\%merged_match); + if ($r eq "no"){ } # if no overlap # not in an interesting region + else{ + # Overlap, we propagate the proof and note it "partial" + foreach(keys %{$match{$fragment_id}{"proof"}}){ + if (defined($merged_match{$r}{"proof"}{$_})){ + if ($merged_match{$r}{"proof"}{$_}=~/:/){ + $fragment_id=~/.*-(gene_\d*-gene_\d*)/; + $merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + } + else{} # already a score for the entire match, no pblm + } + else { $fragment_id=~/.*-(gene_\d*-gene_\d*)/; - $merged_match{$r}{"proof"}{$_}.=$1.":".$match{$fragment_id}{"proof"}{$_}.","; + $merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.","; } - else{} # already a score for the entire match, no pblm - } - else { - $fragment_id=~/.*-(gene_\d*-gene_\d*)/; - $merged_match{$r}{"proof"}{$_}=$1.":".$match{$fragment_id}{"proof"}{$_}.","; } + delete($match{$fragment_id}); } - delete($match{$fragment_id}); } - } - ## New addition that should help to get the prophage coordinates correctly ! - # And now check if one of the prophage map to the whole sequence - foreach(keys %merged_match){ - print "This is a prophage\n"; - my $fragment_id=$_; - if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ - $tag_complete=1; - my $new_fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; - print "We have a complete prophage -- we add it $new_fragment_id !\n"; - # ; - foreach(keys %{$merged_match{$fragment_id}}){ - $merged_match{$new_fragment_id}{$_}=$merged_match{$fragment_id}{$_}; + ## New addition that should help to get the prophage coordinates correctly ! + # And now check if one of the prophage map to the whole sequence + foreach(keys %merged_match){ + print "This is a prophage\n"; + my $fragment_id=$_; + if ($merged_match{$fragment_id}{"size"}>$th_contig_size){ + $tag_complete=1; + my $new_fragment_id=$contig_c."-".$tab_genes[0]."-".$tab_genes[$#tab_genes]; + print "We have a complete prophage -- we add it $new_fragment_id !\n"; + # ; + foreach(keys %{$merged_match{$fragment_id}}){ + $merged_match{$new_fragment_id}{$_}=$merged_match{$fragment_id}{$_}; + } + $merged_match{$new_fragment_id}{"type"}="complete_phage";# And we store the type of fragment } - $merged_match{$new_fragment_id}{"type"}="complete_phage";# And we store the type of fragment } - } - if ($tag_complete==1){ - # We can remove all the prophages - my @tab_temp=keys %merged_match; - foreach(@tab_temp){ - if ($merged_match{$_}{"type"} eq "complete_phage"){} - else{ - delete($merged_match{$_}); + if ($tag_complete==1){ + # We can remove all the prophages + my @tab_temp=keys %merged_match; + foreach(@tab_temp){ + if ($merged_match{$_}{"type"} eq "complete_phage"){} + else{ + delete($merged_match{$_}); + } } } + ## END OF THE NEW ADDITION } - ## END OF THE NEW ADDITION - } - open S1, '>>', $out_file; - foreach(sort { $merged_match{$b}{"size"} <=> $merged_match{$a}{"size"} } keys %merged_match){ ## IMPORTANT, HAVE TO BE SIZE ORDERED - my $fragment_id=$_; - $fragment_id=~/.*-(gene_\d+-gene_\d+)/; - my $zone=$1; - my $type_detection=$merged_match{$fragment_id}{"type"}; - print "$fragment_id\t$merged_match{$fragment_id}{size}\t$merged_match{$fragment_id}{hallmark}\t$merged_match{$fragment_id}{proof}{phage}\t$merged_match{$fragment_id}{proof}{pfam}\t$merged_match{$fragment_id}{proof}{unch}\t$merged_match{$fragment_id}{proof}{switch}\t$merged_match{$fragment_id}{proof}{avg_g_size}\n"; - my $category=3; - if ($merged_match{$fragment_id}{"hallmark"}==0){delete($merged_match{$fragment_id}{"hallmark"});} - # Determine the category. To check this, we want several good indicators - And also remove prediction based on one single indicator, unless it's a strong one (sig >2) - # New categories : - # Cat 1 - hallmark + gene phage enrichment - # Cat 2 - gene phage or hallmark without gene phage - # Cat 3 - no hallmark or gene phage, but other signal - my @tab_proof=keys %{$merged_match{$fragment_id}{"proof"}}; - if ($merged_match{$fragment_id}{"hallmark"}>0){ - if (defined($merged_match{$fragment_id}{"proof"}{"noncaudo"}) || defined($merged_match{$fragment_id}{"proof"}{"phage"})){ - if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/(gene_\d+-gene_\d+):(\d+)/){ - my $match_region=$1; - my $score=$2; - if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region - } - elsif ($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1; ## THRESHOLD TO REMOVE THE NONCAUDO ON THE SMALL SMALL CONTIGS - if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/(gene_\d+-gene_\d+):(\d+)/){ - my $match_region=$1; - my $score=$2; - if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region - } - elsif ($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1; - if ($category==3){ # no match complete, so category 2 - $category=2; - } - } - else{ - foreach(@tab_proof){ - if ($merged_match{$fragment_id}{"proof"}{$_}=~/(gene_\d+-gene_\d+):(\d+)/){ + open S1, '>>', $out_file; + foreach(sort { $merged_match{$b}{"size"} <=> $merged_match{$a}{"size"} } keys %merged_match){ ## IMPORTANT, HAVE TO BE SIZE ORDERED + my $fragment_id=$_; + $fragment_id=~/.*-(gene_\d+-gene_\d+)/; + my $zone=$1; + my $type_detection=$merged_match{$fragment_id}{"type"}; + print "$fragment_id\t$merged_match{$fragment_id}{size}\t$merged_match{$fragment_id}{hallmark}\t$merged_match{$fragment_id}{proof}{phage}\t$merged_match{$fragment_id}{proof}{pfam}\t$merged_match{$fragment_id}{proof}{unch}\t$merged_match{$fragment_id}{proof}{switch}\t$merged_match{$fragment_id}{proof}{avg_g_size}\n"; + my $category=3; + if ($merged_match{$fragment_id}{"hallmark"}==0){delete($merged_match{$fragment_id}{"hallmark"});} + # Determine the category. To check this, we want several good indicators - And also remove prediction based on one single indicator, unless it's a strong one (sig >2) + # New categories : + # Cat 1 - hallmark + gene phage enrichment + # Cat 2 - gene phage or hallmark without gene phage + # Cat 3 - no hallmark or gene phage, but other signal + my @tab_proof=keys %{$merged_match{$fragment_id}{"proof"}}; + if ($merged_match{$fragment_id}{"hallmark"}>0){ + if (defined($merged_match{$fragment_id}{"proof"}{"noncaudo"}) || defined($merged_match{$fragment_id}{"proof"}{"phage"})){ + if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/(gene_\d+-gene_\d+):(\d+)/){ my $match_region=$1; my $score=$2; - print "Hallmark but no phage or noncaudo, but other proof $_ -> $match_region / $score ($merged_match{$fragment_id}{proof}{$_})\n"; - if ($match_region eq $zone && $score>=$th_sig){$category=2;} # other metric on the whole region - elsif($score>=$th_sig_2){$category=2;} # metric partial only but strong enough so we keep it + if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region } - elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig){ - if ($_ eq "pfam" || $_ eq "unch"){ - $category=2; # if we have hallmark or gene_size + a metric pfam or unch on the whole fragment -> should be quite sure + elsif ($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1; ## THRESHOLD TO REMOVE THE NONCAUDO ON THE SMALL SMALL CONTIGS + if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/(gene_\d+-gene_\d+):(\d+)/){ + my $match_region=$1; + my $score=$2; + if ($match_region eq $zone && $score>=$th_sig){$category=1;} # Phage metric on the whole region + } + elsif ($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){$category=1;} # if we have hallmark or gene_size + a phage metric on the whole fragment -> should be quite sure $category=1; + if ($category==3){ # no match complete, so category 2 + $category=2; + } + } + else{ + foreach(@tab_proof){ + if ($merged_match{$fragment_id}{"proof"}{$_}=~/(gene_\d+-gene_\d+):(\d+)/){ + my $match_region=$1; + my $score=$2; + print "Hallmark but no phage or noncaudo, but other proof $_ -> $match_region / $score ($merged_match{$fragment_id}{proof}{$_})\n"; + if ($match_region eq $zone && $score>=$th_sig){$category=2;} # other metric on the whole region + elsif($score>=$th_sig_2){$category=2;} # metric partial only but strong enough so we keep it + } + elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig){ + if ($_ eq "pfam" || $_ eq "unch"){ + $category=2; # if we have hallmark or gene_size + a metric pfam or unch on the whole fragment -> should be quite sure + } } } } } - } - elsif (defined($merged_match{$fragment_id}{"proof"}{"phage"}) || defined($merged_match{$fragment_id}{"proof"}{"noncaudo"})){# If we have some phage signal, - if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/:(\d*)/){ - if ($1>=$th_sig){ + elsif (defined($merged_match{$fragment_id}{"proof"}{"phage"}) || defined($merged_match{$fragment_id}{"proof"}{"noncaudo"})){# If we have some phage signal, + if ($merged_match{$fragment_id}{"proof"}{"phage"}=~/:(\d*)/){ + if ($1>=$th_sig){ + $category=2; # Good, phage signal significant -> should be quite sure + } + } + elsif($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){ $category=2; # Good, phage signal significant -> should be quite sure } - } - elsif($merged_match{$fragment_id}{"proof"}{"phage"}>=$th_sig){ - $category=2; # Good, phage signal significant -> should be quite sure - } - if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/:(\d*)/){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS - if ($1>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){ + if ($merged_match{$fragment_id}{"proof"}{"noncaudo"}=~/:(\d*)/){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS + if ($1>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){ + $category=2; # Good, phage signal significant -> should be quite sure + } + } + elsif($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS $category=2; # Good, phage signal significant -> should be quite sure } - } - elsif($merged_match{$fragment_id}{"proof"}{"noncaudo"}>=$th_sig && $total_noncaudo>$th_nb_genes_noncaudo){ ## THRESHOLD TO AVOID SHORT CONTIGS BIAS - $category=2; # Good, phage signal significant -> should be quite sure - } - } - if ($category==3){ # If the category is still 3, meaning that the phage signal (if there was any) was not that strong .. - if ($#tab_proof==0){ - $category=0; # No phage signal nor hallmark gene, and only one metric, we remove } - else{ - my $tag1=0; - foreach(@tab_proof){ - if ($merged_match{$fragment_id}{"proof"}{$_}=~/:(\d*)/){ - if ($1>=$th_sig_2){ + if ($category==3){ # If the category is still 3, meaning that the phage signal (if there was any) was not that strong .. + if ($#tab_proof==0){ + $category=0; # No phage signal nor hallmark gene, and only one metric, we remove + } + else{ + my $tag1=0; + foreach(@tab_proof){ + if ($merged_match{$fragment_id}{"proof"}{$_}=~/:(\d*)/){ + if ($1>=$th_sig_2){ + $tag1=1; # Good, one signal very significant + } + } + elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig_2){ $tag1=1; # Good, one signal very significant } - } - elsif ($merged_match{$fragment_id}{"proof"}{$_}>=$th_sig_2){ - $tag1=1; # Good, one signal very significant } - } - if ($tag1==0){ # If none of the metrics is really strong ... - $category=0; # .. we remove the detection + if ($tag1==0){ # If none of the metrics is really strong ... + $category=0; # .. we remove the detection + } } } + # Columns index : 0 / 1 / 2 / 3 / 4 / 5 / 6 / 7 / 8 / 9 / 10 / 11 / 12 + # Columns : Contig / Total Nb Genes / Fragment / Size / Type detection / Category / Enrich Phage / Enrich Non Caudo / Enrich Pfam / Enrich Unch / Enrich Switch / Avg_g_size / Nb Hallmark + if ($category>0){ + print S1 "$contig_c\t$total_nb_genes\t$fragment_id\t$merged_match{$fragment_id}{size}\t$type_detection\t$category\t$merged_match{$fragment_id}{proof}{phage}\t$merged_match{$fragment_id}{proof}{noncaudo}\t$merged_match{$fragment_id}{proof}{pfam}\t$merged_match{$fragment_id}{proof}{unch}\t$merged_match{$fragment_id}{proof}{switch}\t$merged_match{$fragment_id}{proof}{avg_g_size}\t$merged_match{$fragment_id}{hallmark}\n"; + } } - # Columns index : 0 / 1 / 2 / 3 / 4 / 5 / 6 / 7 / 8 / 9 / 10 / 11 / 12 - # Columns : Contig / Total Nb Genes / Fragment / Size / Type detection / Category / Enrich Phage / Enrich Non Caudo / Enrich Pfam / Enrich Unch / Enrich Switch / Avg_g_size / Nb Hallmark - if ($category>0){ - print S1 "$contig_c\t$total_nb_genes\t$fragment_id\t$merged_match{$fragment_id}{size}\t$type_detection\t$category\t$merged_match{$fragment_id}{proof}{phage}\t$merged_match{$fragment_id}{proof}{noncaudo}\t$merged_match{$fragment_id}{proof}{pfam}\t$merged_match{$fragment_id}{proof}{unch}\t$merged_match{$fragment_id}{proof}{switch}\t$merged_match{$fragment_id}{proof}{avg_g_size}\t$merged_match{$fragment_id}{hallmark}\n"; - } + close S1; } - close S1; + # $i++; + #`rm $out_file_c $out_file_c2 $out_file_c3`; } - $i++; - `rm $out_file_c $out_file_c2 $out_file_c3`; + $pm->finish(0); # do the exit in the child process } +$pm->wait_all_children; # wait until everything in the above foreach loop is done before moving on sub factorial { # factorial $n my $n = shift; diff --git a/Scripts/Step_first_add_custom_phage_sequence.pl b/Scripts/Step_first_add_custom_phage_sequence.pl index 3a3211a..d2ceb69 100755 --- a/Scripts/Step_first_add_custom_phage_sequence.pl +++ b/Scripts/Step_first_add_custom_phage_sequence.pl @@ -7,19 +7,32 @@ use File::Spec::Functions; use File::Path 'mkpath'; use File::Which 'which'; +use Parallel::ForkManager; + # Script to generate a new db with putative new clusters -# Argument 0 : Fasta file of the new phages +# Argument 0 : fasta of custom phages +# Argument 1 : db-in directory +# Argument 2 : db-out directory +# Argument 3 : number of CPUs to use +# Argument 4 : (optional) specify "diamond" if this script should use diamond instead of blastp -if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[2]))) +if (($ARGV[0] eq "-h") || ($ARGV[0] eq "--h") || ($ARGV[0] eq "-help" )|| ($ARGV[0] eq "--help") || (!defined($ARGV[3]))) { print "# Script to generate a new db with putative new clusters # Argument 0 : fasta of custom phages # Argument 1 : db-in directory # Argument 2 : db-out directory -\n"; +# Argument 3 : number of CPUs to use +# Argument 4 : (optional) specify \"diamond\" if this script should use diamond instead of blastp\n"; die "\n"; } +my $diamond = 0; +if (defined($ARGV[4])) { + $diamond = 1; + print "Diamond was used here instead of blastp.\n"; +} +my $path_to_diamond = ""; my $path_to_makeblastdb = which("makeblastdb") or die "No makeblastdb\n"; my $path_to_blastp = which("blastp") or die "No blastp\n"; my $path_to_muscle = which("muscle") or die "No muscle\n"; @@ -30,12 +43,17 @@ my $MCX_LOAD = which("mcxload") or die "No mcxload\n"; my $MCL = which("mcl") or die "No mcl\n"; +if ($diamond == 1) { + $path_to_diamond = which('diamond') or die "Missing diamond\n"; +} my $min_seq_in_a_cluster=3; -my $n_cpus=8; + my $fasta_contigs=$ARGV[0]; my $db_in=$ARGV[1]; my $db_out=$ARGV[2]; +my $n_cpus=$ARGV[3]; +#my $n_cpus=8; my $tmp_dir=$db_out."/initial_db"; `mkdir $tmp_dir`; @@ -210,16 +228,22 @@ # - 3 - and make new clusters my $db=$tmp_dir."Custom_phages_mga_prots-to-cluster"; my $cmd_format="$path_to_makeblastdb -in $prot_file_to_cluster -out $db -dbtype prot"; +if ($diamond == 1) { + $cmd_format="$path_to_diamond makedb --in $prot_file_to_cluster --db $db"; +} print "$cmd_format\n"; my $out=`$cmd_format`; -print "Formatdb : $out\n"; +print "Making BLAST database : $out\n"; my $cmd_cat="cat $fasta_unclustered >> $prot_file_to_cluster"; print "$cmd_cat\n"; $out=`$cmd_cat`; print "Cat : $cmd_cat\n"; # - blast vs themselves and the unclustered my $out_blast=$tmp_dir."pool_unclustered-and-custom-phages-vs-custom-phages.tab"; -my $cmd_blast="$path_to_blastp -query $prot_file_to_cluster -db $db -out $out_blast -outfmt 6 -num_threads 10 -evalue 0.00001"; # On 10 cores to keep a few alive for the rest of the scripts +my $cmd_blast="$path_to_blastp -query $prot_file_to_cluster -db $db -out $out_blast -outfmt 6 -num_threads $n_cpus -evalue 0.00001"; # On 10 cores to keep a few alive for the rest of the scripts +if ($diamond == 1) { + $cmd_blast="$path_to_diamond blastp --query $prot_file_to_cluster --db $db --out $out_blast --outfmt 6 --threads $n_cpus -k 500 -b 2 --evalue 0.00001 --more-sensitive"; +} print "$cmd_blast\n"; $out=`$cmd_blast`; print "Blast : $out\n"; @@ -254,7 +278,7 @@ $out=`$cmd_mcxload`; print "Mxc Load : $out\n"; my $dump_file=$tmp_dir."new_clusters.csv"; -my $cmd_mcl="$MCL $out_mci -I 2 -use-tab $out_tab -o $dump_file"; +my $cmd_mcl="$MCL $out_mci -I 2 -te $n_cpus -use-tab $out_tab -o $dump_file"; print "$cmd_mcl\n"; $out=`$cmd_mcl`; print "Mcl : $out\n"; @@ -321,8 +345,12 @@ } close S1; print "making a blastable db from the new unclustered\n"; -$out=`$path_to_makeblastdb -in $final_pool_unclustered -out $final_blastable_unclustered -dbtype prot`; +my $unclustered_blast_db_cmd = "$path_to_makeblastdb -in $final_pool_unclustered -out $final_blastable_unclustered -dbtype prot"; # We subset the BLAST result to only unclustered proteins, and add it to the previous unclustered blast result +if ($diamond == 1) { + $unclustered_blast_db_cmd = "$path_to_diamond makedb --in $final_pool_unclustered --db $final_blastable_unclustered" +} +$out=`$unclustered_blast_db_cmd`; open(BL,"<$out_blast") || die "pblm ouverture fichier $out_blast\n"; open(S1,">$final_blast_unclustered") || die "pblm ouverture fichier $final_blast_unclustered\n"; while(){ @@ -336,8 +364,12 @@ close S1; # Generating the new database my $tag=0; + +my $pm = new Parallel::ForkManager($n_cpus); #Starts the parent process for parallelizing the next foreach loop, sets max number of parallel processes +$pm->set_waitpid_blocking_sleep(0); foreach(sort keys %clusters){ - $tag=1; + $pm->start and next; #do the fork +# $tag=1; my $ali_id=$_; my $path_to_file=$tmp_dir."clusts/".$ali_id; my $path_to_fasta=$tmp_dir."clusts/".$ali_id.".faa"; @@ -346,27 +378,30 @@ if (-e $path_to_ali){ `rm $path_to_ali $path_to_hmm`; } - my $muscle_out=$tmp_dir."log_out_muscle"; - my $muscle_err=$tmp_dir."log_err_muscle"; + my $muscle_out=$tmp_dir."log_out_muscle_$$"; + my $muscle_err=$tmp_dir."log_err_muscle_$$"; `$path_to_muscle -in $path_to_fasta -out $path_to_ali > $muscle_out 2> $muscle_err`; - my $out_stokcholm=$path_to_ali.".stockholm"; - open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n"; - print S1 "# STOCKHOLM 1.0\n"; - open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n"; - while(){ - chomp($_); - if ($_=~/^>(.*)/){ - my $id=$1; - $id=~s/\s/_/g; - print S1 "\n$id "; - - } - else{print S1 "$_";} - } - close FA; - print S1 "\n//\n"; - `$path_to_hmmbuild --amino $path_to_hmm $out_stokcholm`; +# my $out_stokcholm=$path_to_ali.".stockholm"; +# open(S1,">$out_stokcholm") || die "pblm opening $out_stokcholm\n"; +# print S1 "# STOCKHOLM 1.0\n"; +# open(FA,"<$path_to_ali") || die "pblm ouverture $path_to_ali\n"; +# while(){ +# chomp($_); +# if ($_=~/^>(.*)/){ +# my $id=$1; +# $id=~s/\s/_/g; +# print S1 "\n$id "; +# +# } +# else{print S1 "$_";} +# } +# close FA; +# print S1 "\n//\n"; + `$path_to_hmmbuild --amino --cpu 1 --informat afa $path_to_hmm $path_to_ali`; + `rm $muscle_out $muscle_err`; + $pm->finish; # do the exit in the child process } +#$pm->wait_all_children; # wait until everything in the above foreach loop is done before moving on # We pool all hmm / fasta from all PCs $out=`cat $db_phage > $db_out/Pool_clusters.hmm`; print "cat previous hmm : $out\n"; @@ -382,6 +417,7 @@ open(CA,">>$final_catalog") || die ("pblm opening file $final_catalog\n"); foreach(keys %clusters){ my $liste=join(" ",keys %{$clusters{$_}}); + $liste =~ s/\|/_/g; #Added to keep same format of clusters file when adding new ones print CA "$_|2||$liste\n"; } close CA; diff --git a/cpanfile b/cpanfile index 93a5395..88c3059 100644 --- a/cpanfile +++ b/cpanfile @@ -1,2 +1,4 @@ requires 'Bio::Perl', 1.007002; #requires 'File::Which', 1.22; +requires 'Parallel::ForkManager', 1.17; +requires 'List::MoreUtils', 0.428; diff --git a/wrapper_phage_contigs_sorter_iPlant.pl b/wrapper_phage_contigs_sorter_iPlant.pl index ab4fad6..d154dc6 100755 --- a/wrapper_phage_contigs_sorter_iPlant.pl +++ b/wrapper_phage_contigs_sorter_iPlant.pl @@ -14,9 +14,21 @@ =head1 SYNOPSIS --cp Custom phage sequence --db Either "1" (DEFAULT Refseqdb) or "2" (Viromedb) --wdir Working directory (DEFAULT cwd) - --ncpu Number of CPUs - --virome Virome decontamination mode, for datasets mostly viral, force the use of generic metrics instead of calculated from the whole dataset. Set to 1 to use virome decontamination mode (default: 0) - + --ncpu Number of CPUs (default: 4) + --virome Add this flag to enable virome decontamination mode, for datasets + mostly viral to force the use of generic metrics instead of + calculated from the whole dataset. (default: off) + --data-dir Path to "virsorter-data" directory (e.g. /path/to/virsorter-data) + --diamond Use diamond (in "--more-sensitive" mode) instead of blastp. + Diamond is much faster than blastp and may be useful for adding + many custom phages, or for processing extremely large Fasta files. + Unless you specify --diamond, VirSorter will use blastp. + --keep-db Specifying this flag keeps the new HMM and BLAST databases created + after adding custom phages. This is useful if you have custom phages + that you want to be included in several different analyses and want + to save the database and point VirSorter to it in subsequent runs. + By default, this is off, and you should only specify this flag if + you're SURE you need it. --help Show help and exit =head1 DESCRIPTION @@ -44,18 +56,23 @@ =head1 DESCRIPTION my $tag_virome = 0; my $custom_phage = ''; my $data_dir = '/data'; -my $n_cpus = 16; +my $n_cpus = 4; my $wdir = catdir(cwd(), 'virsorter-out'); +my $diamond = 0; +my $blastp = 'blastp'; +my $keepdb = 0; GetOptions( 'f|fna=s' => \$input_file, 'd|dataset:s' => \$code_dataset, 'db:i' => \$choice_database, - 'virome:i' => \$tag_virome, + 'virome' => \$tag_virome, 'wdir:s' => \$wdir, 'cp:s' => \$custom_phage, 'data-dir:s' => \$data_dir, 'ncpu:i' => \$n_cpus, + 'diamond' => \$diamond, + 'keep-db' => \$keepdb, 'h|help' => \$help, ); @@ -71,6 +88,10 @@ =head1 DESCRIPTION pod2usage('choice_database must be 1, 2, or 3'); } +if ($diamond == 1) { + $blastp = 'diamond' +} + say map { sprintf "%-15s: %s\n", @$_ } ( ['Bin', $Bin], ['Dataset', $code_dataset], @@ -79,23 +100,32 @@ =head1 DESCRIPTION ['Working dir', $wdir], ['Custom phages', $custom_phage], ['Data dir', $data_dir], + ['Num CPUs', $n_cpus], + ['blastp', $blastp], ); +if ($diamond == 1) { + say "This VirSorter run uses DIAMOND (Buchfink et al., Nature Methods 2015) instead of blastp.\n"; +} if ($tag_virome == 1) { say "WARNING: THIS WILL BE A VIROME DECONTAMINATION RUN"; } # Need 2 databases # PCs from Refseq (phages) or PCs from Refseq+Viromes -# PFAM (26.0) +# PFAM (27.0) my $path_hmmsearch = which('hmmsearch') or die "Missing hmmsearch\n"; my $path_blastp = which('blastp') or die "Missing blastp\n"; +my $path_diamond = ''; my $script_dir = catdir($Bin, 'Scripts'); my $dir_Phage_genes = catdir($data_dir,'Phage_gene_catalog'); my $readme_file = catfile($data_dir, 'VirSorter_Readme.txt'); my $ref_phage_clusters = catfile($data_dir, 'Phage_gene_catalog', 'Phage_Clusters_current.tab'); +if ($diamond == 1) { + $path_diamond = which('diamond') or die "Missing diamond\n"; +} if ($tag_virome == 1) { $readme_file = catfile($data_dir, 'VirSorter_Readme_viromes.txt'); @@ -166,6 +196,7 @@ =head1 DESCRIPTION my $cmd_step_1 = "$path_script_step_1 $code_dataset $fastadir $fna_file $nb_gene_th " . ">> $log_out 2>> $log_err"; + say "Started at ".(localtime); say "Step 0.5 : $cmd_step_1"; `echo $cmd_step_1 >> $log_out 2>> $log_err`; $out = `$cmd_step_1`; @@ -186,6 +217,7 @@ =head1 DESCRIPTION . "-o $out_hmmsearch_pfama_bis --noali $db_PFAM_a $fasta_file_prots " . ">> $log_out 2>> $log_err"; +say "Started at ".(localtime); say "Step 0.8 : $cmd_hmm_pfama"; `echo $cmd_hmm_pfama >> $log_out 2>> $log_err`; @@ -201,6 +233,7 @@ =head1 DESCRIPTION = "$path_hmmsearch --tblout $out_hmmsearch_pfamb --cpu $n_cpus " . "-o $out_hmmsearch_pfamb_bis --noali $db_PFAM_b $fasta_file_prots " . ">> $log_out 2>> $log_err"; +say "Started at ".(localtime); say "Step 0.9 : $cmd_hmm_pfamb"; `echo $cmd_hmm_pfamb >> $log_out 2>> $log_err`; @@ -237,12 +270,12 @@ =head1 DESCRIPTION my $script_detect = catfile($script_dir, "Step_3_highlight_phage_signal.pl"); my $cmd_detect - = "$script_detect $out_file_affi $out_file_phage_fragments " + = "$script_detect $out_file_affi $out_file_phage_fragments $n_cpus " . ">> $log_out 2>> $log_err"; if ($tag_virome == 1) { $cmd_detect - = "$script_detect $out_file_affi $out_file_phage_fragments " + = "$script_detect $out_file_affi $out_file_phage_fragments $n_cpus " . "$generic_ref_file >> $log_out 2>> $log_err"; } @@ -261,7 +294,7 @@ =head1 DESCRIPTION $r_n++; # New revision of the prediction my $dir_revision = catdir($wdir, 'r_' . $r_n); say "### Revision $r_n"; - + say "Started at ".(localtime); if (!-d $dir_revision) { ## mkdir for this revision mkpath($dir_revision); @@ -281,9 +314,19 @@ =head1 DESCRIPTION my $script_custom_phage = catfile( $script_dir, "Step_first_add_custom_phage_sequence.pl" ); - $out = `$script_custom_phage $custom_phage $dir_Phage_genes/ $dir_revision/db >> $log_out 2>> $log_err`; - - say "Adding custom phage to the database : $out"; + my $add_first = join(' ', + "$script_custom_phage $custom_phage $dir_Phage_genes/", + "$dir_revision/db $n_cpus >> $log_out 2>> $log_err" + ); + if ($diamond == 1) { + $add_first = join(' ', + "$script_custom_phage $custom_phage $dir_Phage_genes/", + "$dir_revision/db $n_cpus diamond >> $log_out 2>> $log_err" + ); + } + + say "Adding custom phage to the database : \n$add_first\n"; + $out = `$add_first`; } # should replace Pool_cluster / Pool_unclustered and # Pool_new_unclustered else , we just import the Refseq database @@ -299,13 +342,19 @@ =head1 DESCRIPTION my $cmd_new_clusters = join(' ', "$script_new_cluster $dir_revision $fasta_file_prots", "$previous_fasta_unclustered", - "$new_prots_to_cluster >> $log_out 2>> $log_err" - ); - + "$new_prots_to_cluster $n_cpus >> $log_out 2>> $log_err" + ); + if ($diamond == 1) { + $cmd_new_clusters = join(' ', + "$script_new_cluster $dir_revision $fasta_file_prots", + "$previous_fasta_unclustered", + "$new_prots_to_cluster $n_cpus diamond >> $log_out 2>> $log_err" + ); + } say $cmd_new_clusters; $out = `$cmd_new_clusters`; - say "Step 1.1 new clusters and new database : $out"; + say "\nStep 1.1 new clusters and new database : $out"; # Rm the list of prots to be clustered now that they should be # clustered $out = `rm $new_prots_to_cluster`; @@ -345,6 +394,7 @@ =head1 DESCRIPTION "$fasta_file_prots >> $log_out 2>> $log_err" ); + say "\nStarted at ".(localtime); say "Step 1.2 : $cmd_hmm_cluster"; `echo $cmd_hmm_cluster >> $log_out 2>> $log_err`; @@ -371,7 +421,21 @@ =head1 DESCRIPTION "-outfmt 6", "-evalue 0.001 >> $log_out 2>> $log_err" ); - + if ($diamond == 1) { + $cmd_blast_unclustered = join(' ', + $path_diamond, + "blastp --query $fasta_file_prots", + "--db $blastable_unclustered", + "--out $out_blast_new_unclustered", + "--threads $n_cpus", + "--outfmt 6", + "-b 2", #Uses at most approx. b * 6 GB of RAM. -b 2 will use at most ~12 GB of RAM. + "--more-sensitive", + "-k 500", #This is the default max sequences for blastp + "--evalue 0.001 >> $log_out 2>> $log_err" + ); + } + say "\nStarted at ".(localtime); say "\nStep 1.3 : $cmd_blast_unclustered"; `echo $cmd_blast_unclustered >> $log_out 2>> $log_err`; @@ -404,6 +468,7 @@ =head1 DESCRIPTION } ## Complete the affi + say "Started at ".(localtime); say "Step 2 : $cmd_merge"; `echo $cmd_merge >> $log_out 2>> $log_err`; $out = `$cmd_merge`; @@ -412,6 +477,7 @@ =head1 DESCRIPTION say "\t$out"; ## Complete the summary + say "Started at ".(localtime); say "Step 3 : $cmd_detect"; `echo $cmd_detect >> $log_out 2>> $log_err`; $out = `$cmd_detect`; @@ -421,6 +487,7 @@ =head1 DESCRIPTION # which of both of these categories are phage enough to be added to the # databases say "Setting up the final result file"; + say "Started at ".(localtime); say "Step 4 : $cmd_summary"; `echo $cmd_summary >> $log_out 2>> $log_err`; $out = `$cmd_summary`; @@ -434,6 +501,7 @@ =head1 DESCRIPTION my $cmd_step_5 = "$script_generate_output $code_dataset $wdir >> $log_out 2>> $log_err"; +say "\nStarted at ".(localtime); say "\nStep 5 : $cmd_step_5"; `echo $cmd_step_5 >> $log_out 2>> $log_err`; @@ -446,10 +514,14 @@ =head1 DESCRIPTION # We rm the first db to not overload user disk space my $db_revision_0 = catdir($wdir, 'r_0', 'db'); -if (-d $db_revision_0) { - $out = `rm -r $db_revision_0`; - say "rm -r $db_revision_0 : $out"; +#Comment out the next 4 lines to keep the database after processing! +if ($keepdb == 0) { + if (-d $db_revision_0) { + $out = `rm -r $db_revision_0`; + say "rm -r $db_revision_0 : $out"; + } } +#Comment out the above 4 lines to keep the database after processing! #`mv $fastadir $wdir/Fasta_files`; @@ -481,7 +553,7 @@ =head1 DESCRIPTION } safe_mv($out_file_affi, "$store_metric_files/VIRSorter_affi-contigs.tab"); -my $out_file_affi_ref = $code_dataset . "_affi-contigs.refs"; +my $out_file_affi_ref = catdir($wdir, $code_dataset . "_affi-contigs.refs"); safe_mv($out_file_affi_ref, $store_metric_files); safe_mv($out_file_phage_fragments, "$store_metric_files/VIRSorter_phage_signal.tab");