diff --git a/config/example_assignment_bwa.yaml b/config/example_assignment_bwa.yaml new file mode 100644 index 0000000..35aa441 --- /dev/null +++ b/config/example_assignment_bwa.yaml @@ -0,0 +1,27 @@ +--- +global: # generall configs effecting one or multiple parts + threads: 1 + assignments: + split_number: 1 # number of files fastq should be split for parallelization +assignments: + exampleAssignment: # name of an example assignment (can be any string) + bc_length: 15 + alignment_tool: + tool: bwa # bwa or exact + configs: + min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design file + sequence_length: # sequence length of design excluding adapters. + min: 166 + max: 175 + alignment_start: # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches + min: 1 # integer + max: 3 # integer + FW: + - resources/Assignment_BasiC/R1.fastq.gz + BC: + - resources/Assignment_BasiC/R2.fastq.gz + REV: + - resources/Assignment_BasiC/R3.fastq.gz + reference: resources/design.fa + configs: + default: {} # name of an example filtering config diff --git a/config/example_assignment_exact_lazy.yaml b/config/example_assignment_exact_lazy.yaml new file mode 100644 index 0000000..ea3b75e --- /dev/null +++ b/config/example_assignment_exact_lazy.yaml @@ -0,0 +1,24 @@ +--- +global: # generall configs effecting one or multiple parts + threads: 1 + assignments: + split_number: 1 # number of files fastq should be split for parallelization +assignments: + exampleAssignment: # name of an example assignment (can be any string) + bc_length: 15 + alignment_tool: + tool: exact # bwa or exact + configs: + sequence_length: 170 # sequence length of design excluding adapters. + alignment_start: 1 # start of the alignment in the reference + FW: + - resources/Assignment_BasiC/R1.fastq.gz + BC: + - resources/Assignment_BasiC/R2.fastq.gz + REV: + - resources/Assignment_BasiC/R3.fastq.gz + reference: resources/design.fa + configs: + lazy: # name of an example filtering config + min_support: 2 # default 3 + fraction: 0.6 # default 0.75 diff --git a/config/example_assignment_exact_linker.yaml b/config/example_assignment_exact_linker.yaml new file mode 100644 index 0000000..13c4fda --- /dev/null +++ b/config/example_assignment_exact_linker.yaml @@ -0,0 +1,22 @@ +--- +global: # generall configs effecting one or multiple parts + threads: 1 + assignments: + split_number: 1 # number of files fastq should be split for parallelization +assignments: + exampleAssignment: # name of an example assignment (can be any string) + bc_length: 20 + BC_rev_comp: true + linker: TCTAGACCGTCACTAACTAACAGTGGGTACCC + alignment_tool: + tool: exact # bwa or exact + configs: + sequence_length: 170 # sequence length of design excluding adapters. + alignment_start: 1 # start of the alignment in the reference + FW: + - resources/Assignment_BasiC/R1.fastq.gz + REV: + - resources/Assignment_BasiC/R3.fastq.gz + reference: resources/design.fa + configs: + default: {} # name of an example filtering config diff --git a/config/example_config.yaml b/config/example_config.yaml index 520a423..80e716e 100644 --- a/config/example_config.yaml +++ b/config/example_config.yaml @@ -1,18 +1,23 @@ --- -global: # generall configs effecting one or multiple parts +global: # generall configs effecting one or multiple parts threads: 1 assignments: - split_number: 1 # number of files fastq should be split for parallelization + split_number: 1 # number of files fastq should be split for parallelization assignments: - exampleAssignment: # name of an example assignment (can be any string) + exampleAssignment: # name of an example assignment (can be any string) bc_length: 15 - sequence_length: # sequence length of design excluding adapters. - min: 195 - max: 205 - alignment_start: # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches - min: 15 # integer - max: 17 # integer - min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design file + alignment_tool: + tool: exact # bwa or exact + configs: + sequence_length: 170 # sequence length of design excluding adapters. + alignment_start: 1 # start of the alignment in the reference + # min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design file + # sequence_length: # sequence length of design excluding adapters. + # min: 195 + # max: 205 + # alignment_start: # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches + # min: 15 # integer + # max: 17 # integer FW: - resources/Assignment_BasiC/R1.fastq.gz BC: @@ -21,9 +26,7 @@ assignments: - resources/Assignment_BasiC/R3.fastq.gz reference: resources/design.fa configs: - exampleAssignmentConfig: # name of an example filtering config - min_support: 3 - fraction: 0.7 + default: {} # name of an example filtering config experiments: exampleCount: bc_length: 15 @@ -38,24 +41,8 @@ experiments: fromWorkflow: type: config assignment_name: exampleAssignment - assignment_config: exampleAssignmentConfig + assignment_config: default design_file: resources/design.fa - label_file: resources/labels.tsv # optional + label_file: resources/labels.tsv # optional configs: - exampleConfig: - filter: - bc_threshold: 10 - DNA: - min_counts: 1 - RNA: - min_counts: 1 - sampling: # optional, just for benmarking - DNA: - total: 30000000 - threshold: 300 - RNA: - total: 50000000 - threshold: 300 - variants: # optional - map: resources/variant_map.tsv - min_barcodes: [5, 10] # min BC for ref and alt sequence + default: {} # name of an example filtering config diff --git a/config/sbatch.yml b/config/sbatch.yml index 7f64c75..6ae2df3 100644 --- a/config/sbatch.yml +++ b/config/sbatch.yml @@ -27,7 +27,7 @@ assignment_fastq_split: threads: 1 mem: 10G queue: medium -assignment_mapping: +assignment_mapping_bwa: time: "0-02:00" threads: 30 mem: 10G @@ -46,6 +46,11 @@ assignment_collectBCs: threads: 20 mem: 10G queue: medium +assignment_mapping_exact: + time: "0-01:00" + threads: 1 + mem: 10G + queue: debug assignment_statistic_totalCounts: time: "0-01:00" threads: 1 @@ -81,12 +86,16 @@ counts_umi_raw_counts: mem: 6G queue: "medium" counts_noUMI_create_BAM: - time: "4-00:00" + time: 4-00:00 mem: 12G - queue: "medium" + queue: medium +counts_filter_counts: + time: 0-02:00 + queue: medium counts_final_counts_samplerer: mem: 20G queue: medium + ######################### ### (ASSIGNED) COUNTS ### ######################### @@ -133,6 +142,9 @@ statistic_counts_barcode_base_composition: time: "1-00:00" queue: "medium" mem: 20G +statistic_counts_table: + time: 0-02:00 + queue: medium ############################# ### Statistic/correlation ### diff --git a/docs/assignment.rst b/docs/assignment.rst index 1b5e977..5884a8f 100644 --- a/docs/assignment.rst +++ b/docs/assignment.rst @@ -32,6 +32,24 @@ Example file: >CRS4 TTAGACCGCCCTTTACCCCGAGAAAACTCAGCTACACACTC +Config File +----------- +Multiple mapping strategies are implemented to find the corresponding CRS sequence for each read. The mapping strategy can be chosen in the config file (bwa mem or exact matches). The config file also defines the filtering of the mapping results. The config file is a yaml file. + +Example of an assignment file using bwa and the standard filtering: + +.. literalinclude:: ../configs/example_assignment_bwa.yml + :language: yaml + +Example of an assignment file using exact matches and the with and non-default filtering of barcodes: + +.. literalinclude:: ../configs/example_assignment_exact_lazy.yml + :language: yaml + +Example of an assignment file using exact matches and read 1 with BC, linker and oligo (no seperate BC index read): + +.. literalinclude:: ../configs/example_assignment_exact_linker.yml + :language: yaml snakemake ============================ @@ -67,7 +85,7 @@ all assignment_attach_idx Extract the index sequence and add it to the header. assignment_bwa_ref - Create mapping reference for BWA from design file. + Create mapping reference for BWA from design file (code:`bwa` mapping approach). assignment_collect Collect mapped reads into one BAM. assignment_collectBCs @@ -82,12 +100,16 @@ assignmemt_hybridFWRead_get_reads_by_cutadapt Get the barcode and read from the FW read using cutadapt (when no index BC read is present). Uses the paired end mode of cutadapt to write the FW and BC read. assignment_merge Merge the FW,REV and BC fastq files into one. Extract the index sequence from the middle and end of an Illumina run. Separates reads for Paired End runs. Merge/Adapter trim reads stored in BAM. -assignment_mapping - Map the reads to the reference. +assignment_mapping_bwa + Map the reads to the reference (code:`bwa` mapping approach). assignment_idx_bam - Index the BAM file + Index the BAM file (code:`bwa` mapping approach). assignment_flagstat - Run samtools flagstat. Results are in :code:`results/assignment//statistic/assignment/bam_stats.txt` + Run samtools flagstat. Results are in :code:`results/assignment//statistic/assignment/bam_stats.txt` (code:`bwa` mapping approach). +assignment_mapping_exact_reference + Create reference to map the exact design (code:`exact` mapping approach). +rule assignment_mapping_exact + Map the reads to the reference and sort using exact match (code:`exact` mapping approach). assignment_getBCs Get the barcodes (not filtered). Results are in :code:`results/assignment//barcodes_incl_other.sorted.tsv.gz` assignment_statistic_totalCounts diff --git a/docs/assignment_example1.rst b/docs/assignment_example1.rst index e404fa9..134aee5 100644 --- a/docs/assignment_example1.rst +++ b/docs/assignment_example1.rst @@ -108,28 +108,32 @@ First we do a try run using snakemake :code:`-n` option. The MPRAsnakeflow comma cd assoc_basic conda activate mprasnakeflow - snakemake -c 1 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile config.yml -n + snakemake -c 1 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile /home/user/MPRAsnakeflow/resources/assoc_basic/config.yml -n -q You should see a list of rules that will be executed. This is the summary: .. code-block:: text - Job stats: - job count min threads max threads - ----------------------------------- ------- ------------- ------------- - all 1 1 1 - assignment_bwa_ref 1 1 1 - assignment_fastq_split 3 1 1 - assignment_filter 2 1 1 - assignment_flagstat 1 1 1 - assignment_getBCs 1 1 1 - assignment_idx_bam 1 1 1 - assignment_mapping 1 1 1 - assignment_merge 30 10 10 - assignment_statistic_assignedCounts 2 1 1 - assignment_statistic_assignment 2 1 1 - assignment_statistic_totalCounts 1 1 1 - total 46 1 1 + Building DAG of jobs... + Job stats: + job count + ----------------------------------- ------- + all 1 + assignment_attach_idx 60 + assignment_bwa_ref 1 + assignment_collect 1 + assignment_collectBCs 1 + assignment_fastq_split 3 + assignment_filter 1 + assignment_flagstat 1 + assignment_getBCs 30 + assignment_idx_bam 1 + assignment_mapping_bwa 30 + assignment_merge 30 + assignment_statistic_assignedCounts 1 + assignment_statistic_assignment 1 + assignment_statistic_totalCounts 1 + total 163 When dry-drun does not give any errors we will run the workflow. We use a machine with 30 threads/cores to run the workflow. Therefore :code:`split_number` is set to 30 to parallize the workflow. Also we are using 10 threads for mapping (bwa mem). But snakemake takes care that no more than 30 threads are used. @@ -141,20 +145,26 @@ When dry-drun does not give any errors we will run the workflow. We use a machin .. note:: Please modify your code when running in a cluster environment. We have an example SLURM config file here :code:`config/sbatch.yml`. -If everything works fine the 12 rules showed above will run: +If everything works fine the 15 rules showed above will run: all The overall all rule. Here is defined what final output files are expected. +assignment_attach_idx + Extract the index sequence and add it to the header. assignment_bwa_ref Create mapping reference for BWA from design file. assignment_fastq_split Split the fastq files into n files for parallelisation. N is given by split_read in the configuration file. assignment_merge Merge the FW,REV and BC fastq files into one. Extract the index sequence from the middle and end of an Illumina run. Separates reads for Paired End runs. Merge/Adapter trim reads stored in BAM. -assignment_mapping +assignment_mapping_bwa Map the reads to the reference. assignment_idx_bam Index the BAM file +assignment_collect + Collect mapped reads into one BAM. +assignment_collectBCs + Get the barcodes. assignment_flagstat Run samtools flagstat. Results are in :code:`results/assignment/assocBasic/statistic/assignment/bam_stats.txt` assignment_getBCs @@ -162,48 +172,45 @@ assignment_getBCs assignment_statistic_totalCounts Statistic of the total (unfiltered counts). Results are in :code:`results/assignment/assocBasic/statistic/total_counts.tsv.gz` assignment_filter - Filter the barcodes file based on the config given in the config-file. Results for this run are here :code:`results/assignment/assocBasic/assignment_barcodes.exampleConfigTrueMatches.sorted.tsv.gz` (exampleConfigTrueMatches) and here :code:`results/assignment/assocBasic/assignment_barcodes.exampleConfig.sorted.tsv.gz` (exampleConfig) + Filter the barcodes file based on the config given in the config-file. Results for this run are here :code:`results/assignment/assocBasic/assignment_barcodes.default.sorted.tsv.gz` (default config). assignment_statistic_assignedCounts - Statistic of filtered the assigned counts. Result is here :code:`results/assignment/assocBasic/statistic/assigned_counts.exampleConfigTrueMatches.tsv.gz` (exampleConfigTrueMatches) or :code:`results/assignment/assocBasic/statistic/assigned_counts.exampleConfig.tsv.gz` (exampleConfig) + Statistic of filtered the assigned counts. Result is here :code:`results/assignment/assocBasic/statistic/assigned_counts.default.tsv.gz` (default) assignment_statistic_assignment - Statistic of the filtered assignment. Result is here :code:`results/assignment/assocBasic/statistic/assignment.exampleConfigTrueMatches.tsv.gz` and a plot here :code:`results/assignment/assocBasic/statistic/assignment.exampleConfigTrueMatches.png`. (also files are available for the config :code:`exampleConfig`). + Statistic of the filtered assignment. Result is here :code:`results/assignment/assocBasic/statistic/assignment.default.tsv.gz` and a plot here :code:`results/assignment/assocBasic/statistic/assignment.default.png`. Results ----------------- -All needed output files will be in the :code:`results/assignment/assocBasic` folder. The final assignment is in :code:`results/assignment/assocBasic/assignment_barcodes.exampleConfigTrueMatches.sorted.tsv.gz` or :code:`results/assignment/assocBasic/assignment_barcodes.exampleConfig.sorted.tsv.gz` depeding on the filtering in the config file. +All needed output files will be in the :code:`results/assignment/assocBasic` folder. The final assignment is in :code:`results/assignment/assocBasic/assignment_barcodes.default.sorted.tsv.gz`. -.. note:: Please note that for the experiment/count workflow you have to remove ambigous BCs. Therefore the file :code:`results/assignment/assocBasic/assignment_barcodes.exampleConfigTrueMatches.sorted.tsv.gz` is the correct wone +.. note:: Please note that for the experiment/count workflow you have to remove ambigous BCs. It is possible to retain ambigous BCs in the final file by configuring in the config file. But the default option will remove them from the final file. Total file tree of the results folder: .. code-block:: text - results - ├── assignment - │   └── assocBasic - │   ├── aligned_merged_reads.bam - │   ├── aligned_merged_reads.bam.bai - │   ├── assignment_barcodes.exampleConfig.sorted.tsv.gz - │   ├── assignment_barcodes.exampleConfigTrueMatches.sorted.tsv.gz - │   ├── barcodes_incl_other.sorted.tsv.gz - │   ├── reference - │   │   ├── reference.fa - │   │   ├── reference.fa.amb - │   │   ├── reference.fa.ann - │   │   ├── reference.fa.bwt - │   │   ├── reference.fa.dict - │   │   ├── reference.fa.fai - │   │   ├── reference.fa.pac - │   │   └── reference.fa.sa - │   └── statistic - │   ├── assigned_counts.exampleConfigTrueMatches.tsv.gz - │   ├── assigned_counts.exampleConfig.tsv.gz - │   ├── assignment - │   │   └── bam_stats.txt - │   ├── assignment.exampleConfig.png - │   ├── assignment.exampleConfigTrueMatches.png - │   ├── assignment.exampleConfigTrueMatches.tsv.gz - │   ├── assignment.exampleConfig.tsv.gz - │   └── total_counts.tsv.gz + results/ + ├── assignment + │   └── assocBasic + │   ├── aligned_merged_reads.bam + │   ├── aligned_merged_reads.bam.bai + │   ├── assignment_barcodes.default.sorted.tsv.gz + │   ├── barcodes_incl_other.sorted.tsv.gz + │   ├── reference + │   │   ├── reference.fa + │   │   ├── reference.fa.amb + │   │   ├── reference.fa.ann + │   │   ├── reference.fa.bwt + │   │   ├── reference.fa.dict + │   │   ├── reference.fa.fai + │   │   ├── reference.fa.pac + │   │   └── reference.fa.sa + │   └── statistic + │   ├── assigned_counts.default.tsv.gz + │   ├── assignment + │   │   └── bam_stats.txt + │   ├── assignment.default.png + │   ├── assignment.default.tsv.gz + │   └── total_counts.tsv.gz + └── logs diff --git a/docs/cluster.rst b/docs/cluster.rst index 1cf0694..57b9aaf 100644 --- a/docs/cluster.rst +++ b/docs/cluster.rst @@ -10,7 +10,7 @@ The configuration (SLURM) of resources for rules can be found in ``config/sbatch .. code-block:: bash - snakemake --use-conda --configfile conf/config.yaml --cluster "sbatch --nodes=1 --ntasks={cluster.threads} --mem={cluster.mem} -t {cluster.time} -p {cluster.queue} -o {cluster.output}" --jobs 100 --cluster-config config/sbatch.yaml + snakemake --use-conda --configfile config/config.yaml --cluster "sbatch --nodes=1 --ntasks={cluster.threads} --mem={cluster.mem} -t {cluster.time} -p {cluster.queue} -o {cluster.output}" --jobs 100 --cluster-config config/sbatch.yaml Please note that the log folder of the cluster environment has to be generated first, e.g: diff --git a/docs/combined_example1.rst b/docs/combined_example1.rst index a0636ba..2ad19c8 100644 --- a/docs/combined_example1.rst +++ b/docs/combined_example1.rst @@ -118,7 +118,7 @@ First we do a try run using snakemake :code:`-n` option. The MPRAsnakeflow comma cd combined_basic conda activate mprasnakeflow - snakemake -c 1 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile /home/user/MPRAsnakeflow/resources/combined_basic/config.yml -n + snakemake -c 1 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile /home/user/MPRAsnakeflow/resources/combined_basic/config.yml -n -q You should see a list of rules that will be executed. This is the summary: @@ -203,7 +203,7 @@ Total file tree of the results folder: │   └── assocBasic │   ├── aligned_merged_reads.bam │   ├── aligned_merged_reads.bam.bai - │   ├── assignment_barcodes.exampleConfig.sorted.tsv.gz + │   ├── assignment_barcodes.default.sorted.tsv.gz │   ├── barcodes_incl_other.sorted.tsv.gz │   ├── reference │   │   ├── reference.fa @@ -215,31 +215,31 @@ Total file tree of the results folder: │   │   ├── reference.fa.pac │   │   └── reference.fa.sa │   └── statistic - │   ├── assigned_counts.exampleConfig.tsv.gz + │   ├── assigned_counts.default.tsv.gz │   ├── assignment │   │   └── bam_stats.txt - │   ├── assignment.exampleConfig.png - │   ├── assignment.exampleConfig.tsv.gz + │   ├── assignment.default.png + │   ├── assignment.default.tsv.gz │   └── total_counts.tsv.gz └── experiments └── countBasic ├── assigned_counts │   └── fromWorkflow - │   ├── exampleConfig + │   ├── default │   │   ├── HEPG2_1_merged_assigned_counts.tsv.gz │   │   ├── HEPG2_2_merged_assigned_counts.tsv.gz │   │   ├── HEPG2_3_merged_assigned_counts.tsv.gz │   │   ├── HEPG2_allreps_merged.tsv.gz │   │   └── HEPG2_allreps_minThreshold_merged.tsv.gz - │   ├── HEPG2_1_DNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_1.merged.config.exampleConfig.tsv.gz - │   ├── HEPG2_1_RNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_2_DNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_2.merged.config.exampleConfig.tsv.gz - │   ├── HEPG2_2_RNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_3_DNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_3.merged.config.exampleConfig.tsv.gz - │   └── HEPG2_3_RNA_final_counts.config.exampleConfig.tsv.gz + │   ├── HEPG2_1_DNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_1.merged.config.default.tsv.gz + │   ├── HEPG2_1_RNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_2_DNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_2.merged.config.default.tsv.gz + │   ├── HEPG2_2_RNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_3_DNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_3.merged.config.default.tsv.gz + │   └── HEPG2_3_RNA_final_counts.config.default.tsv.gz ├── assignment │   └── fromWorkflow.tsv.gz ├── counts @@ -247,7 +247,7 @@ Total file tree of the results folder: │   ├── HEPG2_1_DNA_filtered_counts.tsv.gz │   ├── HEPG2_1_DNA_final_counts.tsv.gz │   ├── HEPG2_1_DNA_raw_counts.tsv.gz - │   ├── HEPG2_1.merged.config.exampleConfig.tsv.gz + │   ├── HEPG2_1.merged.config.default.tsv.gz │   ├── HEPG2_1_RNA.bam │   ├── HEPG2_1_RNA_filtered_counts.tsv.gz │   ├── HEPG2_1_RNA_final_counts.tsv.gz @@ -256,7 +256,7 @@ Total file tree of the results folder: │   ├── HEPG2_2_DNA_filtered_counts.tsv.gz │   ├── HEPG2_2_DNA_final_counts.tsv.gz │   ├── HEPG2_2_DNA_raw_counts.tsv.gz - │   ├── HEPG2_2.merged.config.exampleConfig.tsv.gz + │   ├── HEPG2_2.merged.config.default.tsv.gz │   ├── HEPG2_2_RNA.bam │   ├── HEPG2_2_RNA_filtered_counts.tsv.gz │   ├── HEPG2_2_RNA_final_counts.tsv.gz @@ -265,7 +265,7 @@ Total file tree of the results folder: │   ├── HEPG2_3_DNA_filtered_counts.tsv.gz │   ├── HEPG2_3_DNA_final_counts.tsv.gz │   ├── HEPG2_3_DNA_raw_counts.tsv.gz - │   ├── HEPG2_3.merged.config.exampleConfig.tsv.gz + │   ├── HEPG2_3.merged.config.default.tsv.gz │   ├── HEPG2_3_RNA.bam │   ├── HEPG2_3_RNA_filtered_counts.tsv.gz │   ├── HEPG2_3_RNA_final_counts.tsv.gz @@ -273,7 +273,7 @@ Total file tree of the results folder: └── statistic ├── assigned_counts │   └── fromWorkflow - │   ├── exampleConfig + │   ├── default │   │   ├── combined │   │   │   └── HEPG2_merged_assigned_counts.statistic.tsv.gz │   │   ├── HEPG2_1_merged_assigned_counts.statistic.tsv.gz @@ -293,30 +293,30 @@ Total file tree of the results folder: │   │   ├── HEPG2_Ratio_pairwise.png │   │   ├── HEPG2_RNA_pairwise_minThreshold.png │   │   └── HEPG2_RNA_pairwise.png - │   ├── HEPG2_1_DNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_1_RNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_2_DNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_2_RNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_3_DNA_exampleConfig.statistic.tsv.gz - │   └── HEPG2_3_RNA_exampleConfig.statistic.tsv.gz + │   ├── HEPG2_1_DNA_default.statistic.tsv.gz + │   ├── HEPG2_1_RNA_default.statistic.tsv.gz + │   ├── HEPG2_2_DNA_default.statistic.tsv.gz + │   ├── HEPG2_2_RNA_default.statistic.tsv.gz + │   ├── HEPG2_3_DNA_default.statistic.tsv.gz + │   └── HEPG2_3_RNA_default.statistic.tsv.gz ├── barcode │   ├── assigned_counts │   │   └── fromWorkflow - │   │   ├── HEPG2_exampleConfig_barcode_correlation.tsv - │   │   ├── HEPG2_exampleConfig_barcode_DNA_pairwise.png - │   │   ├── HEPG2_exampleConfig_barcode_Ratio_pairwise.png - │   │   ├── HEPG2_exampleConfig_barcode_RNA_pairwise.png - │   │   ├── HEPG2_exampleConfig_DNA_perBarcode.png - │   │   └── HEPG2_exampleConfig_RNA_perBarcode.png + │   │   ├── HEPG2_default_barcode_correlation.tsv + │   │   ├── HEPG2_default_barcode_DNA_pairwise.png + │   │   ├── HEPG2_default_barcode_Ratio_pairwise.png + │   │   ├── HEPG2_default_barcode_RNA_pairwise.png + │   │   ├── HEPG2_default_DNA_perBarcode.png + │   │   └── HEPG2_default_RNA_perBarcode.png │   └── counts - │   ├── HEPG2_exampleConfig_barcode_correlation.tsv - │   ├── HEPG2_exampleConfig_barcode_DNA_pairwise.png - │   ├── HEPG2_exampleConfig_barcode_Ratio_pairwise.png - │   ├── HEPG2_exampleConfig_barcode_RNA_pairwise.png - │   ├── HEPG2_exampleConfig_DNA_perBarcode.png - │   └── HEPG2_exampleConfig_RNA_perBarcode.png - ├── bc_overlap.assigned_counts.exampleConfig.fromWorkflow.tsv - ├── bc_overlap.counts.exampleConfig.tsv + │   ├── HEPG2_default_barcode_correlation.tsv + │   ├── HEPG2_default_barcode_DNA_pairwise.png + │   ├── HEPG2_default_barcode_Ratio_pairwise.png + │   ├── HEPG2_default_barcode_RNA_pairwise.png + │   ├── HEPG2_default_DNA_perBarcode.png + │   └── HEPG2_default_RNA_perBarcode.png + ├── bc_overlap.assigned_counts.default.fromWorkflow.tsv + ├── bc_overlap.counts.default.tsv ├── counts │   ├── BCNucleotideComposition.HEPG2_1_DNA.tsv.gz │   ├── BCNucleotideComposition.HEPG2_1_RNA.tsv.gz @@ -332,8 +332,8 @@ Total file tree of the results folder: ├── counts.freqUMIs.HEPG2_3_DNA.txt ├── counts.freqUMIs.HEPG2_3_RNA.txt ├── counts.raw.tsv - ├── statistic_assigned_bc_correlation_merged_fromWorkflow_exampleConfig.tsv - ├── statistic_assigned_counts_merged_fromWorkflow_exampleConfig.tsv - ├── statistic_assigned_counts_single_fromWorkflow_exampleConfig.tsv - ├── statistic_bc_correlation_merged_exampleConfig.tsv - └── statistic_oligo_correlation_merged_fromWorkflow_exampleConfig.tsv + ├── statistic_assigned_bc_correlation_merged_fromWorkflow_default.tsv + ├── statistic_assigned_counts_merged_fromWorkflow_default.tsv + ├── statistic_assigned_counts_single_fromWorkflow_default.tsv + ├── statistic_bc_correlation_merged_default.tsv + └── statistic_oligo_correlation_merged_fromWorkflow_default.tsv diff --git a/docs/config.rst b/docs/config.rst index 9d6d820..c447595 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -4,14 +4,14 @@ Config File ===================== -The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend to use one config file per MPRA experiment or MPRA project. But in theory many different experiments can be configured in only one file. It is divided into :code:`global` (generell settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). This is a full example file with all possible configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`. +The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend to use one config file per MPRA experiment or MPRA project. But in theory many different experiments can be configured in only one file. It is divided into :code:`global` (generell settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`. .. literalinclude:: ../config/example_config.yaml :language: yaml :linenos: -Note that the config file is conrolled by jscon schema. This means that the config file is validated against the schema. If the config file is not valid, the program will exit with an error message. The schema is located in :download:`workflow/schemas/config.schema.yaml <../workflow/schemas/config.schema.yaml>`. +Note that the config file is conrolled by json schema. This means that the config file is validated against the schema. If the config file is not valid, the program will exit with an error message. The schema is located in :download:`workflow/schemas/config.schema.yaml <../workflow/schemas/config.schema.yaml>`. ---------------- General settings @@ -43,16 +43,29 @@ The assignment workflow is configured in the :code:`assignments` section. The fo :start-after: start_assignments :end-before: start_experiments -Each asignment you want to process you have to giv him a name like :code:`example_assignment`. The name is used to name the output files. +Each assignment you want to process you have to giv him a name like :code:`example_assignment`. The name is used to name the output files. + +:alignment_tool: + Alignment tool configuration that is used to map the reads to the oligos. + + :tool: + Alignment tool that is used. Currently :code:`bwa` and :code:`exact` is supported. + :configs: + Configurations of the alignment tool selected. + + :sequence_length (bwa): + Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the reference file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths. + :alignment_start (bwa): + Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. We also recommend to vary this value a bit because the start might not be exact after the adapter. E.g. by +-1. + :min_mapping_quality (bwa): + (Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 0. Otherwise the default value of 1 is recommended. + :sequence_length (exact): + Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the reference file. Only one length design is supported. + :alignment_start (exact): + Defines the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. -:sequence_length: - Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the reference file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths. -:alignment_start: - Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. We also recommend to vary this value a bit because the start might not be exact after the adapter. E.g. by +-1. -:min_mapping_quality: - (Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 0. Otherwise the default value of 1 is recommended. :bc_length: - Length of the barcode. Must match with the length of :code:`R2`. + Length of the barcode. Must match with the length of :code:`BC`. :BC_rev_comp: (Optional) If set to :code:`true` the barcode of is reverse complemented. Default is :code:`false`. :linker_length: @@ -60,11 +73,11 @@ Each asignment you want to process you have to giv him a name like :code:`exampl :linker: (Optional) Length of the linker. Only needed if you don't have a barcode read and the barcode is in the FW read with the structure: BC+Linker+Insert. Uses cutadapt to trim the linker to get the barcode as well as the starting of the insert. :FW: - List of forward read files in gzipped fastq format. The full or relative path to the files should be used. Same order in R1, R2, and R3 is important. + List of forward read files in gzipped fastq format. The full or relative path to the files should be used. Same order in FW, BC, and REV is important. :REV: - list of reverse read files in gzipped fastq format. The full or relative path to the files should be used. Same order in R1, R2, and R3 is important. + list of reverse read files in gzipped fastq format. The full or relative path to the files should be used. Same order in FW, BC, and REV is important. :BC: - List of index read files in gzipped fastq format. The full or relative path to the files should be used. Same order in R1, R2, and R3 is important. + List of index read files in gzipped fastq format. The full or relative path to the files should be used. Same order in FW, BC, and REV is important. :NGmerge: (Optional) Options for NGmerge. NGmerge is used merge FW and REV reads. The following options are possible (we recommend to use the default values): diff --git a/docs/count_example1.rst b/docs/count_example1.rst index 3085e36..00c2dbe 100644 --- a/docs/count_example1.rst +++ b/docs/count_example1.rst @@ -152,51 +152,56 @@ First we do a try run using snakemake :code:`-n` option. The MPRAsnakeflow comma cd count_basic conda activate mprasnakeflow - snakemake -c 1 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile config.yml -n + snakemake -c 1 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile /home/user/MPRAsnakeflow/resources/count_basic/config.yml -n -q You should see a list of rules that will be executed. This is the summary: .. code-block:: text + Building DAG of jobs... Job stats: - job count min threads max threads - ------------------------------------------------------------ ------- ------------- ------------- - all 1 1 1 - assigned_counts_assignBarcodes 6 1 1 - assigned_counts_dna_rna_merge 3 1 1 - assigned_counts_filterAssignment 1 1 1 - assigned_counts_make_master_tables 1 1 1 - counts_create_BAM_umi 6 1 1 - counts_dna_rna_merge_counts 6 1 1 - counts_filter_counts 6 1 1 - counts_final_counts_umi 6 1 1 - counts_raw_counts_umi 6 1 1 - statistic_assigned_counts_combine_BC_assignment_stats 1 1 1 - statistic_assigned_counts_combine_BC_assignment_stats_helper 1 1 1 - statistic_assigned_counts_combine_stats_dna_rna_merge 1 1 1 - statistic_assigned_counts_combine_stats_dna_rna_merge_all 1 1 1 - statistic_bc_overlap_combine_assigned_counts 1 1 1 - statistic_bc_overlap_combine_counts 1 1 1 - statistic_bc_overlap_run 4 1 1 - statistic_correlation_bc_counts 2 1 1 - statistic_correlation_calculate 1 1 1 - statistic_correlation_combine_bc_assigned 1 1 1 - statistic_correlation_combine_bc_raw 1 1 1 - statistic_correlation_combine_oligo 1 1 1 - statistic_counts_BC_in_RNA_DNA 6 1 1 - statistic_counts_BC_in_RNA_DNA_merge 2 1 1 - statistic_counts_barcode_base_composition 6 1 1 - statistic_counts_final 2 1 1 - statistic_counts_frequent_umis 6 1 1 - statistic_counts_stats_merge 2 1 1 - statistic_counts_table 12 1 1 - total 94 1 1 + job count + ------------------------------------------------------------ ------- + all 1 + assigned_counts_assignBarcodes 6 + assigned_counts_combine_replicates 2 + assigned_counts_combine_replicates_barcode_output 1 + assigned_counts_dna_rna_merge 3 + assigned_counts_filterAssignment 1 + assigned_counts_make_master_tables 1 + counts_dna_rna_merge_counts 6 + counts_filter_counts 6 + counts_final_counts 6 + counts_umi_create_BAM 6 + counts_umi_raw_counts 6 + statistic_assigned_counts_combine_BC_assignment_stats 1 + statistic_assigned_counts_combine_BC_assignment_stats_helper 1 + statistic_assigned_counts_combine_stats_dna_rna_merge 1 + statistic_assigned_counts_combine_stats_dna_rna_merge_all 1 + statistic_bc_overlap_combine_assigned_counts 1 + statistic_bc_overlap_combine_counts 1 + statistic_bc_overlap_run 4 + statistic_correlation_bc_counts 2 + statistic_correlation_bc_counts_hist 2 + statistic_correlation_calculate 1 + statistic_correlation_combine_bc_assigned 1 + statistic_correlation_combine_bc_raw 1 + statistic_correlation_combine_oligo 1 + statistic_correlation_hist_box_plots 1 + statistic_counts_BC_in_RNA_DNA 6 + statistic_counts_BC_in_RNA_DNA_merge 2 + statistic_counts_barcode_base_composition 6 + statistic_counts_final 2 + statistic_counts_frequent_umis 6 + statistic_counts_stats_merge 2 + statistic_counts_table 12 + total 100 When dry-drun does not give any errors we will run the workflow. We use a machine with 30 threads/cores to run the workflow. The MPRAsnakeflow command is: .. code-block:: bash - snakemake -c 30 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile config.yml + snakemake -c 30 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile /home/user/MPRAsnakeflow/resources/count_basic/config.yml .. note:: Please modify your code when running in a cluster environment. We have an example SLURM config file here :code:`config/sbatch.yml`. @@ -218,12 +223,16 @@ counts_dna_rna_merge_counts Second with zeros, so a BC can be defined only in the DNA or RNA (when :code:`min_counts` is zero for DNA or RNA) assigned_counts_filterAssignment Use only unique assignments. +assigned_counts_combine_replicates + TODO +assigned_counts_combine_replicates_barcode_output + TODO assigned_counts_assignBarcodes Assign RNA and DNA barcodes seperately to make the statistic for assigned. assigned_counts_dna_rna_merge - Assign merged RNA/DNA barcodes. Filter BC depending on the min_counts option. Output for each replicate is here: :code:`results/experiments/exampleCount/assigned_counts/fromFile/exampleConfig/HepG2_<1,2,3>_merged_assigned_counts.tsv.gz`. + Assign merged RNA/DNA barcodes. Filter BC depending on the min_counts option. Output for each replicate is here: :code:`results/experiments/exampleCount/assigned_counts/fromFile/default/HepG2_<1,2,3>_merged_assigned_counts.tsv.gz`. assigned_counts_make_master_tables - Final master table with all replicates combined. Output is here: :code:`results/experiments/exampleCount/assigned_counts/fromFile/exampleConfig/HepG2_allreps_merged.tsv.gz` and using the :code:`bc-threshold` here :code:`results/experiments/exampleCount/assigned_counts/fromFile/exampleConfig/HepG2_allreps_minThreshold_merged.tsv.gz`. + Final master table with all replicates combined. Output is here: :code:`results/experiments/exampleCount/assigned_counts/fromFile/default/HepG2_allreps_merged.tsv.gz` and using the :code:`bc-threshold` here :code:`results/experiments/exampleCount/assigned_counts/fromFile/default/HepG2_allreps_minThreshold_merged.tsv.gz`. statistic_assigned_counts_combine_BC_assignment_stats TODO statistic_assigned_counts_combine_BC_assignment_stats_helper @@ -290,21 +299,21 @@ Total file tree of the results folder: └── countBasic ├── assigned_counts │   └── fromWorkflow - │   ├── exampleConfig + │   ├── default │   │   ├── HEPG2_1_merged_assigned_counts.tsv.gz │   │   ├── HEPG2_2_merged_assigned_counts.tsv.gz │   │   ├── HEPG2_3_merged_assigned_counts.tsv.gz │   │   ├── HEPG2_allreps_merged.tsv.gz │   │   └── HEPG2_allreps_minThreshold_merged.tsv.gz - │   ├── HEPG2_1_DNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_1.merged.config.exampleConfig.tsv.gz - │   ├── HEPG2_1_RNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_2_DNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_2.merged.config.exampleConfig.tsv.gz - │   ├── HEPG2_2_RNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_3_DNA_final_counts.config.exampleConfig.tsv.gz - │   ├── HEPG2_3.merged.config.exampleConfig.tsv.gz - │   └── HEPG2_3_RNA_final_counts.config.exampleConfig.tsv.gz + │   ├── HEPG2_1_DNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_1.merged.config.default.tsv.gz + │   ├── HEPG2_1_RNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_2_DNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_2.merged.config.default.tsv.gz + │   ├── HEPG2_2_RNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_3_DNA_final_counts.config.default.tsv.gz + │   ├── HEPG2_3.merged.config.default.tsv.gz + │   └── HEPG2_3_RNA_final_counts.config.default.tsv.gz ├── assignment │   └── fromWorkflow.tsv.gz ├── counts @@ -312,7 +321,7 @@ Total file tree of the results folder: │   ├── HEPG2_1_DNA_filtered_counts.tsv.gz │   ├── HEPG2_1_DNA_final_counts.tsv.gz │   ├── HEPG2_1_DNA_raw_counts.tsv.gz - │   ├── HEPG2_1.merged.config.exampleConfig.tsv.gz + │   ├── HEPG2_1.merged.config.default.tsv.gz │   ├── HEPG2_1_RNA.bam │   ├── HEPG2_1_RNA_filtered_counts.tsv.gz │   ├── HEPG2_1_RNA_final_counts.tsv.gz @@ -321,7 +330,7 @@ Total file tree of the results folder: │   ├── HEPG2_2_DNA_filtered_counts.tsv.gz │   ├── HEPG2_2_DNA_final_counts.tsv.gz │   ├── HEPG2_2_DNA_raw_counts.tsv.gz - │   ├── HEPG2_2.merged.config.exampleConfig.tsv.gz + │   ├── HEPG2_2.merged.config.default.tsv.gz │   ├── HEPG2_2_RNA.bam │   ├── HEPG2_2_RNA_filtered_counts.tsv.gz │   ├── HEPG2_2_RNA_final_counts.tsv.gz @@ -330,7 +339,7 @@ Total file tree of the results folder: │   ├── HEPG2_3_DNA_filtered_counts.tsv.gz │   ├── HEPG2_3_DNA_final_counts.tsv.gz │   ├── HEPG2_3_DNA_raw_counts.tsv.gz - │   ├── HEPG2_3.merged.config.exampleConfig.tsv.gz + │   ├── HEPG2_3.merged.config.default.tsv.gz │   ├── HEPG2_3_RNA.bam │   ├── HEPG2_3_RNA_filtered_counts.tsv.gz │   ├── HEPG2_3_RNA_final_counts.tsv.gz @@ -338,7 +347,7 @@ Total file tree of the results folder: └── statistic ├── assigned_counts │   └── fromWorkflow - │   ├── exampleConfig + │   ├── default │   │   ├── combined │   │   │   └── HEPG2_merged_assigned_counts.statistic.tsv.gz │   │   ├── HEPG2_1_merged_assigned_counts.statistic.tsv.gz @@ -358,30 +367,30 @@ Total file tree of the results folder: │   │   ├── HEPG2_Ratio_pairwise.png │   │   ├── HEPG2_RNA_pairwise_minThreshold.png │   │   └── HEPG2_RNA_pairwise.png - │   ├── HEPG2_1_DNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_1_RNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_2_DNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_2_RNA_exampleConfig.statistic.tsv.gz - │   ├── HEPG2_3_DNA_exampleConfig.statistic.tsv.gz - │   └── HEPG2_3_RNA_exampleConfig.statistic.tsv.gz + │   ├── HEPG2_1_DNA_default.statistic.tsv.gz + │   ├── HEPG2_1_RNA_default.statistic.tsv.gz + │   ├── HEPG2_2_DNA_default.statistic.tsv.gz + │   ├── HEPG2_2_RNA_default.statistic.tsv.gz + │   ├── HEPG2_3_DNA_default.statistic.tsv.gz + │   └── HEPG2_3_RNA_default.statistic.tsv.gz ├── barcode │   ├── assigned_counts │   │   └── fromWorkflow - │   │   ├── HEPG2_exampleConfig_barcode_correlation.tsv - │   │   ├── HEPG2_exampleConfig_barcode_DNA_pairwise.png - │   │   ├── HEPG2_exampleConfig_barcode_Ratio_pairwise.png - │   │   ├── HEPG2_exampleConfig_barcode_RNA_pairwise.png - │   │   ├── HEPG2_exampleConfig_DNA_perBarcode.png - │   │   └── HEPG2_exampleConfig_RNA_perBarcode.png + │   │   ├── HEPG2_default_barcode_correlation.tsv + │   │   ├── HEPG2_default_barcode_DNA_pairwise.png + │   │   ├── HEPG2_default_barcode_Ratio_pairwise.png + │   │   ├── HEPG2_default_barcode_RNA_pairwise.png + │   │   ├── HEPG2_default_DNA_perBarcode.png + │   │   └── HEPG2_default_RNA_perBarcode.png │   └── counts - │   ├── HEPG2_exampleConfig_barcode_correlation.tsv - │   ├── HEPG2_exampleConfig_barcode_DNA_pairwise.png - │   ├── HEPG2_exampleConfig_barcode_Ratio_pairwise.png - │   ├── HEPG2_exampleConfig_barcode_RNA_pairwise.png - │   ├── HEPG2_exampleConfig_DNA_perBarcode.png - │   └── HEPG2_exampleConfig_RNA_perBarcode.png - ├── bc_overlap.assigned_counts.exampleConfig.fromWorkflow.tsv - ├── bc_overlap.counts.exampleConfig.tsv + │   ├── HEPG2_default_barcode_correlation.tsv + │   ├── HEPG2_default_barcode_DNA_pairwise.png + │   ├── HEPG2_default_barcode_Ratio_pairwise.png + │   ├── HEPG2_default_barcode_RNA_pairwise.png + │   ├── HEPG2_default_DNA_perBarcode.png + │   └── HEPG2_default_RNA_perBarcode.png + ├── bc_overlap.assigned_counts.default.fromWorkflow.tsv + ├── bc_overlap.counts.default.tsv ├── counts │   ├── BCNucleotideComposition.HEPG2_1_DNA.tsv.gz │   ├── BCNucleotideComposition.HEPG2_1_RNA.tsv.gz @@ -397,8 +406,8 @@ Total file tree of the results folder: ├── counts.freqUMIs.HEPG2_3_DNA.txt ├── counts.freqUMIs.HEPG2_3_RNA.txt ├── counts.raw.tsv - ├── statistic_assigned_bc_correlation_merged_fromWorkflow_exampleConfig.tsv - ├── statistic_assigned_counts_merged_fromWorkflow_exampleConfig.tsv - ├── statistic_assigned_counts_single_fromWorkflow_exampleConfig.tsv - ├── statistic_bc_correlation_merged_exampleConfig.tsv - └── statistic_oligo_correlation_merged_fromWorkflow_exampleConfig.tsv \ No newline at end of file + ├── statistic_assigned_bc_correlation_merged_fromWorkflow_default.tsv + ├── statistic_assigned_counts_merged_fromWorkflow_default.tsv + ├── statistic_assigned_counts_single_fromWorkflow_default.tsv + ├── statistic_bc_correlation_merged_default.tsv + └── statistic_oligo_correlation_merged_fromWorkflow_default.tsv diff --git a/docs/faq.rst b/docs/faq.rst index 3d26298..8729461 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -20,7 +20,7 @@ MPRAsnakeflow is not able to create a Conda environment Can I use STARR-seq with MPRAsnakeflow? - No, not yet! + No! Not yet ;-) The pipeline is giving an error **"BUG: Out of jobs ready to be started, but not all files built yet."** and won't run. How can I fix this? diff --git a/resources/assoc_basic/config.yml b/resources/assoc_basic/config.yml index e44db0a..2e9036f 100644 --- a/resources/assoc_basic/config.yml +++ b/resources/assoc_basic/config.yml @@ -6,12 +6,15 @@ global: assignments: assocBasic: bc_length: 15 - sequence_length: - min: 166 - max: 175 - alignment_start: - min: 1 - max: 3 + alignment_tool: + tool: bwa + configs: + sequence_length: + min: 166 + max: 175 + alignment_start: + min: 1 + max: 3 FW: - data/SRR10800986_1.fastq.gz BC: @@ -20,13 +23,4 @@ assignments: - data/SRR10800986_3.fastq.gz reference: data/design.fa configs: - exampleConfig: - min_support: 3 - fraction: 0.7 - unknown_other: true - ambiguous: true - exampleConfigTrueMatches: - min_support: 3 - fraction: 0.7 - unknown_other: false - ambiguous: false + default: {} diff --git a/resources/combined_basic/config.yml b/resources/combined_basic/config.yml index d4faf56..777834c 100644 --- a/resources/combined_basic/config.yml +++ b/resources/combined_basic/config.yml @@ -6,12 +6,15 @@ global: assignments: assocBasic: bc_length: 15 - sequence_length: - min: 166 - max: 175 - alignment_start: - min: 1 - max: 3 + alignment_tool: + tool: bwa + configs: + sequence_length: + min: 166 + max: 175 + alignment_start: + min: 1 + max: 3 FW: - data/SRR10800986_1.fastq.gz BC: @@ -20,11 +23,7 @@ assignments: - data/SRR10800986_3.fastq.gz reference: data/design.fa configs: - exampleConfig: - min_support: 3 - fraction: 0.7 - unknown_other: false - ambiguous: false + default: {} experiments: countBasic: bc_length: 15 @@ -36,13 +35,7 @@ experiments: fromWorkflow: type: config assignment_name: assocBasic - assignment_config: exampleConfig + assignment_config: configs design_file: data/design.fa configs: - exampleConfig: - filter: - bc_threshold: 10 - DNA: - min_counts: 1 - RNA: - min_counts: 1 + default: {} diff --git a/resources/count_basic/config.yml b/resources/count_basic/config.yml index 869370f..b369314 100644 --- a/resources/count_basic/config.yml +++ b/resources/count_basic/config.yml @@ -12,10 +12,4 @@ experiments: assignment_file: data/SRR10800986_barcodes_to_coords.tsv.gz design_file: data/design.fa configs: - exampleConfig: - filter: - bc_threshold: 10 - DNA: - min_counts: 1 - RNA: - min_counts: 1 + default: {} diff --git a/workflow/Snakefile b/workflow/Snakefile index 82803a0..e34d89a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -53,10 +53,11 @@ rule all: ] ), getAssignment_helper( - [ - "results/assignment/{assignment}/statistic/total_counts.tsv.gz", - "results/assignment/{assignment}/statistic/assignment/bam_stats.txt", - ] + ["results/assignment/{assignment}/statistic/total_counts.tsv.gz"] + ), + getAssignment_helper( + ["results/assignment/{assignment}/statistic/assignment/bam_stats.txt"], + match_method="bwa", ), ## experiments # statistic BC nucleotide composition @@ -155,10 +156,11 @@ rule all_assignments: ] ), getAssignment_helper( - [ - "results/assignment/{assignment}/statistic/total_counts.tsv.gz", - "results/assignment/{assignment}/statistic/assignment/bam_stats.txt", - ] + ["results/assignment/{assignment}/statistic/total_counts.tsv.gz"] + ), + getAssignment_helper( + ["results/assignment/{assignment}/statistic/assignment/bam_stats.txt"], + match_method="bwa", ), diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk index e536435..6dc4f17 100644 --- a/workflow/rules/assignment.smk +++ b/workflow/rules/assignment.smk @@ -115,160 +115,8 @@ rule assignment_merge: """ -rule assignment_bwa_ref: - """ - Create mapping reference for BWA from design file. - """ - input: - lambda wc: config["assignments"][wc.assignment]["reference"], - output: - ref="results/assignment/{assignment}/reference/reference.fa", - bwa=expand( - "results/assignment/{{assignment}}/reference/reference.fa.{ext}", - ext=["fai"] + assignment_bwa_dicts, - ), - d="results/assignment/{assignment}/reference/reference.fa.dict", - conda: - "../envs/bwa_samtools_picard_htslib.yaml" - log: - temp("results/logs/assignment/bwa_ref.{assignment}.log"), - shell: - """ - cat {input} | awk '{{gsub(/[\\]\\[]/,"_")}}$0' > {output.ref}; - bwa index -a bwtsw {output.ref} &> {log}; - samtools faidx {output.ref} &>> {log}; - picard CreateSequenceDictionary REFERENCE={output.ref} OUTPUT={output.d} &>> {log} - """ - - -rule assignment_mapping: - """ - Map the reads to the reference and sort. - """ - input: - reads="results/assignment/{assignment}/fastq/merge_split{split}.join.fastq.gz", - reference="results/assignment/{assignment}/reference/reference.fa", - bwa_index=expand( - "results/assignment/{{assignment}}/reference/reference.fa.{ext}", - ext=["fai", "dict"] + assignment_bwa_dicts, - ), - output: - bam=temp("results/assignment/{assignment}/bam/merge_split{split}.mapped.bam"), - conda: - "../envs/bwa_samtools_picard_htslib.yaml" - threads: config["global"]["threads"] - log: - temp("results/logs/assignment/mapping.{assignment}.{split}.log"), - shell: - """ - bwa mem -t {threads} -L 80 -M -C {input.reference} <( - gzip -dc {input.reads} - ) | samtools sort -l 0 -@ {threads} > {output} 2> {log} - """ - - -rule assignment_getBCs: - """ - Get the barcodes. - """ - input: - "results/assignment/{assignment}/bam/merge_split{split}.mapped.bam", - output: - temp("results/assignment/{assignment}/BCs/barcodes_incl_other.{split}.tsv"), - conda: - "../envs/bwa_samtools_picard_htslib.yaml" - params: - alignment_start_min=lambda wc: config["assignments"][wc.assignment][ - "alignment_start" - ]["min"], - alignment_start_max=lambda wc: config["assignments"][wc.assignment][ - "alignment_start" - ]["max"], - sequence_length_min=lambda wc: config["assignments"][wc.assignment][ - "sequence_length" - ]["min"], - sequence_length_max=lambda wc: config["assignments"][wc.assignment][ - "sequence_length" - ]["max"], - mapping_quality_min=lambda wc: config["assignments"][wc.assignment][ - "min_mapping_quality" - ], - log: - temp("results/logs/assignment/getBCs.{assignment}.{split}.log"), - shell: - """ - samtools view -F 1792 {input} | \ - awk -v "OFS=\\t" '{{ - split($(NF),a,":"); - split(a[3],a,","); - if (a[1] !~ /N/) {{ - if (($5 >= {params.mapping_quality_min}) && ($4 >= {params.alignment_start_min}) && ($4 <= {params.alignment_start_max}) && (length($10) >= {params.sequence_length_min}) && (length($10) <= {params.sequence_length_max})) {{ - print a[1],$3,$4";"$6";"$12";"$13";"$5 - }} else {{ - print a[1],"other","NA" - }} - }} - }}' | sort -k1,1 -k2,2 -k3,3 > {output} 2> {log} - """ - - -rule assignment_collect: - """ - Collect mapped reads. - """ - input: - bams=expand( - "results/assignment/{{assignment}}/bam/merge_split{split}.mapped.bam", - split=range(0, getSplitNumber()), - ), - output: - "results/assignment/{assignment}/aligned_merged_reads.bam", - conda: - "../envs/bwa_samtools_picard_htslib.yaml" - threads: config["global"]["threads"] - log: - temp("results/logs/assignment/collect.{assignment}.log"), - shell: - """ - samtools merge -@ {threads} {output} {input.bams} 2> {log} - """ - - -rule assignment_idx_bam: - """ - Index the BAM file - """ - input: - "results/assignment/{assignment}/aligned_merged_reads.bam", - output: - "results/assignment/{assignment}/aligned_merged_reads.bam.bai", - conda: - "../envs/bwa_samtools_picard_htslib.yaml" - log: - "results/logs/assignment/{assignment}/assignment_idx_bam.log", - shell: - """ - samtools index {input} 2> {log} - """ - - -rule assignment_flagstat: - """ - Run samtools flagstat - """ - input: - bam="results/assignment/{assignment}/aligned_merged_reads.bam", - idx="results/assignment/{assignment}/aligned_merged_reads.bam.bai", - output: - "results/assignment/{assignment}/statistic/assignment/bam_stats.txt", - conda: - "../envs/bwa_samtools_picard_htslib.yaml" - log: - temp("results/logs/assignment/flagstat.{assignment}.log"), - shell: - """ - samtools flagstat {input.bam} > {output} 2> {log} - """ +include: "assignment/mapping_exact.smk" +include: "assignment/mapping_bwa.smk" rule assignment_collectBCs: @@ -276,7 +124,12 @@ rule assignment_collectBCs: Get the barcodes. """ input: - expand( + lambda wc: expand( + "results/assignment/{{assignment}}/BCs/barcodes_exact.{split}.tsv", + split=range(0, getSplitNumber()), + ) + if config["assignments"][wc.assignment]["alignment_tool"]["tool"] == "exact" + else expand( "results/assignment/{{assignment}}/BCs/barcodes_incl_other.{split}.tsv", split=range(0, getSplitNumber()), ), @@ -291,7 +144,9 @@ rule assignment_collectBCs: temp("results/logs/assignment/collectBCs.{assignment}.log"), shell: """ - sort --batch-size={params.batch_size} --parallel={threads} -k1,1 -k2,2 -k3,3 -m {input} | gzip -c > {output} 2> {log} + export LC_ALL=C # speed up sort + sort -S 7G --batch-size={params.batch_size} --parallel={threads} -k1,1 -k2,2 -k3,3 -m {input} | \ + gzip -c > {output} 2> {log} """ diff --git a/workflow/rules/assignment/mapping_bwa.smk b/workflow/rules/assignment/mapping_bwa.smk new file mode 100644 index 0000000..da99eca --- /dev/null +++ b/workflow/rules/assignment/mapping_bwa.smk @@ -0,0 +1,155 @@ +rule assignment_bwa_ref: + """ + Create mapping reference for BWA from design file. + """ + conda: + "../../envs/bwa_samtools_picard_htslib.yaml" + input: + lambda wc: config["assignments"][wc.assignment]["reference"], + output: + ref="results/assignment/{assignment}/reference/reference.fa", + bwa=expand( + "results/assignment/{{assignment}}/reference/reference.fa.{ext}", + ext=["fai"] + assignment_bwa_dicts, + ), + d="results/assignment/{assignment}/reference/reference.fa.dict", + log: + temp("results/logs/assignment/bwa_ref.{assignment}.log"), + shell: + """ + cat {input} | awk '{{gsub(/[\\]\\[]/,"_")}}$0' > {output.ref}; + bwa index -a bwtsw {output.ref} &> {log}; + samtools faidx {output.ref} &>> {log}; + picard CreateSequenceDictionary REFERENCE={output.ref} OUTPUT={output.d} &>> {log} + """ + + +rule assignment_mapping_bwa: + """ + Map the reads to the reference and sort unsing bwa mem + """ + conda: + "../../envs/bwa_samtools_picard_htslib.yaml" + input: + reads="results/assignment/{assignment}/fastq/merge_split{split}.join.fastq.gz", + reference="results/assignment/{assignment}/reference/reference.fa", + bwa_index=expand( + "results/assignment/{{assignment}}/reference/reference.fa.{ext}", + ext=["fai", "dict"] + assignment_bwa_dicts, + ), + output: + bam=temp("results/assignment/{assignment}/bam/merge_split{split}.mapped.bam"), + threads: config["global"]["threads"] + log: + temp("results/logs/assignment/mapping.{assignment}.{split}.log"), + shell: + """ + bwa mem -t {threads} -L 80 -M -C {input.reference} <( + gzip -dc {input.reads} + ) | samtools sort -l 0 -@ {threads} > {output} 2> {log} + """ + + +rule assignment_getBCs: + """ + Get the barcodes. + """ + conda: + "../../envs/bwa_samtools_picard_htslib.yaml" + input: + "results/assignment/{assignment}/bam/merge_split{split}.mapped.bam", + output: + temp("results/assignment/{assignment}/BCs/barcodes_incl_other.{split}.tsv"), + params: + alignment_start_min=lambda wc: config["assignments"][wc.assignment][ + "alignment_tool" + ]["configs"]["alignment_start"]["min"], + alignment_start_max=lambda wc: config["assignments"][wc.assignment][ + "alignment_tool" + ]["configs"]["alignment_start"]["max"], + sequence_length_min=lambda wc: config["assignments"][wc.assignment][ + "alignment_tool" + ]["configs"]["sequence_length"]["min"], + sequence_length_max=lambda wc: config["assignments"][wc.assignment][ + "alignment_tool" + ]["configs"]["sequence_length"]["max"], + mapping_quality_min=lambda wc: config["assignments"][wc.assignment][ + "alignment_tool" + ]["configs"]["min_mapping_quality"], + log: + temp("results/logs/assignment/getBCs.{assignment}.{split}.log"), + shell: + """ + export LC_ALL=C # speed up sorting + samtools view -F 1792 {input} | \ + awk -v "OFS=\\t" '{{ + split($(NF),a,":"); + split(a[3],a,","); + if (a[1] !~ /N/) {{ + if (($5 >= {params.mapping_quality_min}) && ($4 >= {params.alignment_start_min}) && ($4 <= {params.alignment_start_max}) && (length($10) >= {params.sequence_length_min}) && (length($10) <= {params.sequence_length_max})) {{ + print a[1],$3,$4";"$6";"$12";"$13";"$5 + }} else {{ + print a[1],"other","NA" + }} + }} + }}' | sort -k1,1 -k2,2 -k3,3 -S 7G > {output} 2> {log} + """ + + +rule assignment_collect: + """ + Collect mapped reads. + """ + conda: + "../../envs/bwa_samtools_picard_htslib.yaml" + input: + bams=expand( + "results/assignment/{{assignment}}/bam/merge_split{split}.mapped.bam", + split=range(0, getSplitNumber()), + ), + output: + "results/assignment/{assignment}/aligned_merged_reads.bam", + threads: config["global"]["threads"] + log: + temp("results/logs/assignment/collect.{assignment}.log"), + shell: + """ + samtools merge -@ {threads} {output} {input.bams} 2> {log} + """ + + +rule assignment_idx_bam: + """ + Index the BAM file + """ + conda: + "../../envs/bwa_samtools_picard_htslib.yaml" + input: + "results/assignment/{assignment}/aligned_merged_reads.bam", + output: + "results/assignment/{assignment}/aligned_merged_reads.bam.bai", + log: + "results/logs/assignment/{assignment}/assignment_idx_bam.log", + shell: + """ + samtools index {input} 2> {log} + """ + + +rule assignment_flagstat: + """ + Run samtools flagstat + """ + conda: + "../../envs/bwa_samtools_picard_htslib.yaml" + input: + bam="results/assignment/{assignment}/aligned_merged_reads.bam", + idx="results/assignment/{assignment}/aligned_merged_reads.bam.bai", + output: + "results/assignment/{assignment}/statistic/assignment/bam_stats.txt", + log: + temp("results/logs/assignment/flagstat.{assignment}.log"), + shell: + """ + samtools flagstat {input.bam} > {output} 2> {log} + """ diff --git a/workflow/rules/assignment/mapping_exact.smk b/workflow/rules/assignment/mapping_exact.smk new file mode 100644 index 0000000..24f1be5 --- /dev/null +++ b/workflow/rules/assignment/mapping_exact.smk @@ -0,0 +1,59 @@ +rule assignment_mapping_exact_reference: + """ + Create reference to map the exact design + """ + conda: + "../../envs/default.yaml" + input: + lambda wc: config["assignments"][wc.assignment]["reference"], + output: + "results/assignment/{assignment}/reference/reference_exact.fa", + log: + temp("results/logs/assignment/mapping_exact_reference.{assignment}.log"), + shell: + """ + paste <( + cat {input} | awk '{{if ($1 ~ /^>/) {{ gsub(/[\\]\\[]/,"_"); print substr($1,2)}}}}'; + cat {input} | awk '{{if ($1 ~ /^>/) {{ gsub(/[\\]\\[]/,"_"); print substr($1,2)}}}}'; + ) <( + cat {input} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}'; + cat {input} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}' | tr ACGTacgt TGCAtgca | rev; + ) > {output} + """ + + +rule assignment_mapping_exact: + """ + Map the reads to the reference and sort using exact match. + """ + conda: + "../../envs/default.yaml" + input: + reads="results/assignment/{assignment}/fastq/merge_split{split}.join.fastq.gz", + reference="results/assignment/{assignment}/reference/reference_exact.fa", + output: + temp("results/assignment/{assignment}/BCs/barcodes_exact.{split}.tsv"), + log: + temp("results/logs/assignment/mapping_exact.{assignment}.{split}.log"), + params: + alignment_start=lambda wc: config["assignments"][wc.assignment][ + "alignment_tool" + ]["configs"]["alignment_start"], + sequence_length=lambda wc: config["assignments"][wc.assignment][ + "alignment_tool" + ]["configs"]["sequence_length"], + shell: + """ + # Look up exact matches in design file + export LC_ALL=C # speed up sort + + awk -v "OFS=\\t" 'NR==FNR {{a[$2] = $1; next}} {{if ($3 in a) print $2,a[$3],"{params.sequence_length}M"; else print $2,"other","NA"}}' \ + <( + cat {input.reference} | awk -v "OFS=\\t" '{{print $1,substr($2, {params.alignment_start},{params.sequence_length})}}' + ) \ + <( + zcat {input.reads} | awk 'NR%4==2 || NR%4==1' | paste - - + ) | \ + sed 's/,YI:Z[^\\t]*//g' | sed 's/XI:Z://g' | \ + sort -k1,1 -k2,2 -k3,3 -S 7G > {output} 2> {log} + """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d0ae5e6..5e4dbbf 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -73,9 +73,19 @@ class MissingVariantInConfigException(Exception): ##### get helpers for different things (like Conditions etc) ##### -def getAssignments(): +def getAssignments(match_method=None): if "assignments" in config: - return list(config["assignments"].keys()) + if match_method: + output = [] + for assignment in config["assignments"]: + if ( + config["assignments"][assignment]["alignment_tool"]["tool"] + == match_method + ): + output.append(assignment) + return output + else: + return list(config["assignments"].keys()) else: return [] @@ -287,7 +297,7 @@ def getOutputProjectConditionConfigType_helper(files): return output -def getOutputProjectConditionType_helper(files): +def getOutputProjectConditionType_helper(file): """ Inserts {project}, {condition} and {type} from config into given file. """ @@ -419,10 +429,10 @@ def getOutputVariants_helper(files, betweenReplicates=False): return output -def getAssignment_helper(files): +def getAssignment_helper(files, match_method=None): output = [] for file in files: - output += expand(file, assignment=getAssignments()) + output += expand(file, assignment=getAssignments(match_method)) return output diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 24d5af2..d53beb3 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -1,6 +1,8 @@ --- $schema: "https://json-schema.org/draft/2020-12/schema" +$id: /schemas/config.schema.yaml + description: snakemake configuration file type: object @@ -35,19 +37,82 @@ properties: description: name of the assignment ^([^_\.]+)$: type: object - patternProperties: - ^((sequence_length)|(alignment_start))$: + properties: + alignment_tool: type: object properties: - min: - type: integer - max: - type: integer - additionalProperties: false + tool: + type: string + enum: + - exact + - bwa + default: bwa + allOf: + - if: + properties: + tool: + const: bwa + then: + properties: + configs: + type: object + properties: + min_mapping_quality: + type: integer + minimum: 0 + default: 1 + sequence_length: + type: object + properties: + min: + type: integer + max: + type: integer + additionalProperties: false + required: + - min + - max + alignment_start: + type: object + properties: + min: + type: integer + max: + type: integer + additionalProperties: false + required: + - min + - max + additionalProperties: false + required: + - sequence_length + - alignment_start + - min_mapping_quality + required: + - configs + - if: + properties: + tool: + const: exact + then: + properties: + configs: + type: object + properties: + sequence_length: + type: integer + minimum: 1 + alignment_start: + type: integer + minimum: 1 + additionalProperties: false + required: + - sequence_length + - alignment_start + required: + - configs required: - - min - - max - properties: + - tool bc_length: type: integer BC_rev_comp: @@ -76,10 +141,6 @@ properties: type: string minItems: 1 uniqueItems: true - min_mapping_quality: - type: integer - default: 1 - minimum: 0 NGmerge: type: object properties: @@ -114,7 +175,7 @@ properties: type: number exclusiveMinimum: 0.5 maximum: 1 - default: 0.7 + default: 0.75 unknown_other: type: boolean default: false @@ -129,20 +190,18 @@ properties: minProperties: 1 oneOf: - required: - - linker_length + - linker_length - required: - - linker + - linker - required: - - BC + - BC required: - FW - REV - bc_length - reference - configs - - alignment_start - - sequence_length - - min_mapping_quality + - alignment_tool - NGmerge additionalProperties: false additionalProperties: false