feat: faster design check

kircherlab · Jul 30, 2024 · 315b402 · 315b402
1 parent 823050a
commit 315b402
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 21 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,3 +1,4 @@
 {
-    "editor.wordWrap": "on"
-}
+    "editor.wordWrap": "on",
+    "python.analysis.typeCheckingMode": "basic"
+}
diff --git a/docs/config.rst b/docs/config.rst
@@ -56,7 +56,7 @@ Each assignment you want to process you have to giv him a name like :code:`examp
         :alignment_start (bwa):
             Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. We also recommend to vary this value a bit because the start might not be exact after the adapter. E.g. by +-1.
         :min_mapping_quality (bwa):
-            (Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 1. For regions only with larger edit distances 30 or 40 might be a good choice. Default is 1. 
+            (Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 1. For regions only with larger edit distances 30 or 40 might be a good choice. Default :code:`1`. 
         :sequence_length (exact):
             Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the design file. Only one length design is supported.
         :alignment_start (exact):
@@ -80,14 +80,16 @@ Each assignment you want to process you have to giv him a name like :code:`examp
     (Optional) Options for NGmerge. NGmerge is used merge FW and REV reads. The following options are possible (we recommend to use the default values):
 
     :min_overlap:
-        (Optional) Minimum overlap of the reads. Default is set to 20.
+        (Optional) Minimum overlap of the reads. Default :code:`20`.
     :frac_mismatches_allowed:
-        (Optional) Fraction of mismatches allowed in the overlap. Default is set to 0.1.
+        (Optional) Fraction of mismatches allowed in the overlap. Default :code:`0.1`.
     :min_dovetailed_overlap:
-        (Optional) Minimum dovetailed overlap. Default is set to 10.
+        (Optional) Minimum dovetailed overlap. Default :code:`10`.
 
 :design_file:
     Design file (full or relative path) in fasta format. The design file should contain the oligos in fasta format. The header should contain the oligo name and should be unique. The sequence should be the sequence of the oligo and must also be unique. When having multiple oligo names with the same sequence please merge them into one fasta entry. The oligo name later used to link barcode to oligo. The sequence is used to map the reads to the oligos. Adapters can be in the seuqence and therefore :code:`alignment_start` has to be adjusted.
+:fast_design_check:
+    (Optional) Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. Cannot find substrings as part of any sequence. Set to false for more correct, but slower, search. Default :code:`true`.
 :configs:
     After mapping the reads to the design file and extracting the barcodes per oligo the configuration (using different names) can be used to generate multiple filtering and configuration settings of the final maq oligo to barcode. Each configuration is a dictionary with the following keys:
 

diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk
@@ -46,14 +46,19 @@ rule assignment_check_design:
                 "sequence_length"
             ]["min"]
         ),
+        fast_check=lambda wc: (
+            "--fast-dict"
+            if config["assignments"][wc.assignment]["fast_design_check"]
+            else "--slow-string-search"
+        ),
     log:
         log=temp("results/logs/assignment/check_design.{assignment}.log"),
         err="results/assignment/{assignment}/design_check.err",
     shell:
         """
         trap "cat {log.err}" ERR
         cp {input.design} {output.ref}
-        python {input.script} --input {output.ref} --start {params.start} --length {params.length} > {log.log} 2> {log.err};
+        python {input.script} --input {output.ref} --start {params.start} --length {params.length} {fast_check} > {log.log} 2> {log.err};
         """
 
 

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -157,6 +157,9 @@ properties:
             additionalProperties: false
           design_file:
             type: string
+          fast_design_check:
+            type: boolean
+            default: true
           configs:
             type: object
             patternProperties:

diff --git a/workflow/scripts/assignment/check_design_file.py b/workflow/scripts/assignment/check_design_file.py
@@ -12,7 +12,6 @@
 import numpy as np
 
 
-
 # options
 @click.command()
 @click.option(
@@ -36,7 +35,13 @@
     type=int,
     help="length of the sequence to lok at,",
 )
-def cli(input_file, start, length):
+@click.option(
+    "--fast-dict/--slow-string-search",
+    "fast_search",
+    default=True,
+    help="Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. But in theory a substring can only be present and for more correct, but slower, search use the --slow-string-search.",
+)
+def cli(input_file, start, length, fast_search):
 
     seq_dict = dict()
 
@@ -55,17 +60,27 @@ def cli(input_file, start, length):
 
     # check for illegal characters
     click.echo("Searching for illegal characters in header...")
-    illegal_characters =  np.array([True if "[" in i or "]" in i else False for i in set(ids)], dtype=bool).sum()
+    illegal_characters = np.array(
+        [True if "[" in i or "]" in i else False for i in set(ids)], dtype=bool
+    ).sum()
     if illegal_characters > 0:
-        click.echo(f"{illegal_characters} headers contain illegal characters ('[',']').", err=True)
+        click.echo(
+            f"{illegal_characters} headers contain illegal characters ('[',']').",
+            err=True,
+        )
         exit(1)
 
     # build seq dict
     click.echo("Building sequence dictionary...")
     for i in range(len(fa)):
-        names = seq_dict.get(fa[i].seq, set())
+        if fast_search:
+            seq = fa[i][start - 1 : start + length - 1].seq
+        else:
+            seq = fa[i].seq
+
+        names = seq_dict.get(seq, set())
         names.add(fa[i].name)
-        seq_dict[fa[i].seq] = names
+        seq_dict[seq] = names
 
     # search for collisions
     click.echo("Searching for collisions...")
@@ -76,11 +91,15 @@ def cli(input_file, start, length):
 
         forward_collition = set()
         antisense_collition = set()
-        for seq, names in seq_dict.items():
-            if sub_seq_forward in seq:
-                forward_collition.update(names)
-            if sub_seq_antisense in seq:
-                antisense_collition.update(names)
+        if fast_search:
+            forward_collition.update(seq_dict.get(sub_seq_forward, set()))
+            antisense_collition.update(seq_dict.get(sub_seq_antisense, set()))
+        else:
+            for seq, names in seq_dict.items():
+                if sub_seq_forward in seq:
+                    forward_collition.update(names)
+                if sub_seq_antisense in seq:
+                    antisense_collition.update(names)
 
         if len(forward_collition) > 1:
             forward_collitions.append(forward_collition)
@@ -92,14 +111,27 @@ def cli(input_file, start, length):
     antisense_collition = [list(i) for i in set(tuple(i) for i in antisense_collition)]
 
     if (len(forward_collitions) > 0) or (len(antisense_collitions) > 0):
-        click.echo("Collisions found:", err=True)
+        click.echo(
+            f"Found {len(forward_collitions)} forward and {len(antisense_collitions)} antisense collisions",
+            err=True,
+        )
+        click.echo(
+            "-----------------FORWARD COLLISIONS-----------------",
+            err=True,
+        )
         for i in range(len(forward_collitions)):
             click.echo(
-                f"Forward collision {i+1} for sequences:\t{"\t".join(forward_collitions[i])}", err=True
+                "\t".join(forward_collitions[i]),
+                err=True,
             )
+
+        click.echo(
+            "-----------------ANTISENSE COLLISIONS-----------------",
+            err=True,
+        )
         for i in range(len(antisense_collitions)):
             click.echo(
-                f"Antisense collision {i+1} for sequences:\t{"\t".join(antisense_collitions[i])}",
+                "\t".join(antisense_collitions[i]),
                 err=True,
             )
         exit(1)