From 28045aea23d6fa03f3883b3dc44b3cbc3e8f6205 Mon Sep 17 00:00:00 2001 From: Max Schubach Date: Thu, 2 Mar 2023 15:30:53 +0100 Subject: [PATCH] feat: configurable min mapping quality --- config/example_config.yaml | 1 + config/sbatch.yml | 3 ++- docs/config.rst | 2 ++ workflow/rules/assignment.smk | 17 ++++++++++++----- workflow/schemas/config.schema.yaml | 7 ++++++- 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/config/example_config.yaml b/config/example_config.yaml index 9804460..4efdffa 100644 --- a/config/example_config.yaml +++ b/config/example_config.yaml @@ -12,6 +12,7 @@ assignments: alignment_start: # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches min: 15 # integer max: 17 # integer + min_mapping_quality: 1 # integer >=0 Please use 0 when you have oligos that differ by 1 base in your reference/design file FW: - resources/Assignment_BasiC/R1.fastq.gz BC: diff --git a/config/sbatch.yml b/config/sbatch.yml index f529524..52e4790 100644 --- a/config/sbatch.yml +++ b/config/sbatch.yml @@ -43,7 +43,8 @@ assignment_getBCs: queue: medium assignment_collectBCs: time: "2-04:00" - threads: 1 + threads: 20 + mem: 10G queue: medium assignment_statistic_totalCounts: time: "0-01:00" diff --git a/docs/config.rst b/docs/config.rst index 0b1aaaa..ae90bb1 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -49,6 +49,8 @@ Each asignment you want to process you have to giv him a name like :code:`exampl Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the reference file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths. :alignment_start: Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. We also recommend to vary this value a bit because the start might not be exact after the adapter. E.g. by +-1. +:min_mapping_quality: + (Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 0. Otherwise the default value of 1 is recommended. :bc_length: Length of the barcode. Must match with the length of :code:`R2`. :BC_rev_comp: diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk index 863f988..cc16d69 100644 --- a/workflow/rules/assignment.smk +++ b/workflow/rules/assignment.smk @@ -178,6 +178,7 @@ rule assignment_merge: -2 {input.REV} \ -m {params.min_overlap} -p {params.frac_mismatches_allowed} \ -d \ + -e {params.min_dovetailed_overlap} \ -z \ -o {output.join} \ -i -f {output.un} \ @@ -247,7 +248,7 @@ rule assignment_getBCs: input: "results/assignment/{assignment}/bam/merge_split{split}.mapped.bam", output: - temp("results/assignment/{assignment}/BCs/barcodes_incl_other.{split}.tsv.gz"), + temp("results/assignment/{assignment}/BCs/barcodes_incl_other.{split}.tsv"), conda: "../envs/bwa_samtools_picard_htslib.yaml" params: @@ -263,6 +264,9 @@ rule assignment_getBCs: sequence_length_max=lambda wc: config["assignments"][wc.assignment][ "sequence_length" ]["max"], + mapping_quality_min=lambda wc: config["assignments"][wc.assignment][ + "min_mapping_quality" + ], log: temp("results/logs/assignment/getBCs.{assignment}.{split}.log"), shell: @@ -272,13 +276,13 @@ rule assignment_getBCs: split($(NF),a,":"); split(a[3],a,","); if (a[1] !~ /N/) {{ - if (($5 > 0) && ($4 >= {params.alignment_start_min}) && ($4 <= {params.alignment_start_max}) && (length($10) >= {params.sequence_length_min}) && (length($10) <= {params.sequence_length_max})) {{ + if (($5 >= {params.mapping_quality_min}) && ($4 >= {params.alignment_start_min}) && ($4 <= {params.alignment_start_max}) && (length($10) >= {params.sequence_length_min}) && (length($10) <= {params.sequence_length_max})) {{ print a[1],$3,$4";"$6";"$12";"$13";"$5 }} else {{ print a[1],"other","NA" }} }} - }}' | gzip -c > {output} 2> {log} + }}' | sort -k1,1 -k2,2 -k3,3 > {output} 2> {log} """ @@ -347,18 +351,21 @@ rule assignment_collectBCs: """ input: expand( - "results/assignment/{{assignment}}/BCs/barcodes_incl_other.{split}.tsv.gz", + "results/assignment/{{assignment}}/BCs/barcodes_incl_other.{split}.tsv", split=range(0, getSplitNumber()), ), output: "results/assignment/{assignment}/barcodes_incl_other.sorted.tsv.gz", + params: + batch_size=getSplitNumber(), + threads: 20 conda: "../envs/default.yaml" log: temp("results/logs/assignment/collectBCs.{assignment}.log"), shell: """ - zcat {input} | sort -k1,1 -k2,2 -k3,3 | gzip -c > {output} 2> {log} + sort --batch-size={params.batch_size} --parallel={threads} -k1,1 -k2,2 -k3,3 -m {input} | gzip -c > {output} 2> {log} """ diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index cf4581b..24d5af2 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -76,6 +76,10 @@ properties: type: string minItems: 1 uniqueItems: true + min_mapping_quality: + type: integer + default: 1 + minimum: 0 NGmerge: type: object properties: @@ -87,7 +91,7 @@ properties: default: 0.1 min_dovetailed_overlap: type: integer - default: 20 + default: 50 required: - min_overlap - frac_mismatches_allowed @@ -138,6 +142,7 @@ properties: - configs - alignment_start - sequence_length + - min_mapping_quality - NGmerge additionalProperties: false additionalProperties: false