AlexsLemonade · jaclyn-taroni · Dec 17, 2019 · Dec 11, 2019 · Dec 11, 2019 · Dec 11, 2019
diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile
@@ -2,15 +2,16 @@
 # Updated Dec 5, 2019
 
 ## Define the ending file(s) that we want
-ALL_FREEC= expand("../../scratch/interim/{sample}.freec.dup.bed", sample=config["samples"])
-ALL_CNVKIT= expand("../../scratch/interim/{sample}.cnvkit.dup.bed", sample=config["samples"])
-ALL_MANTA= expand("../../scratch/interim/{sample}.manta.dup.bed", sample=config["samples"])
+OUTPUT= expand("../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed",
+               sample=config["samples"], 
+               caller=["freec", "cnvkit", "manta"],
+               dupdel=["dup", "del"])
 
 
 ## Define the first rule of the Snakefile. This rule determines what the final file is and which steps to be taken.
 rule all:
     input:
-        ALL_FREEC + ALL_CNVKIT + ALL_MANTA
+        OUTPUT
 
 
 #########################      FILTER STEP FOR THE 3 CALL METHODS ####################################
@@ -33,12 +34,14 @@ rule freec_filter:
         ## The first awk looks at column 6 to filter out for loss/gain. Then it prints out 6 of the 7 columns above
         ## The pipe into the second awk filters the CNV length, freec pval, and add in the CNV type
         ## The last pipe is to sort first digit of chromosome number numerically
-        """awk '$6~/loss/ {{print $2,$3,$4,($4-$3 + 1),$5,$9}}' {input.events} """
+        """awk '$6~/loss/ {{print "chr"$2,$3,$4,($4-$3 + 1),$5,$9}}' {input.events} """
         """ | awk '{{if ($4 > {params.SIZE_CUTOFF} && $6 < {params.FREEC_PVAL}){{print $0,"DEL"}}}}' """
-        """ | sort -k1,1 -k2,2n > {output.freec_del} && """
-        """awk '$6~/gain/ {{print $2,$3,$4,($4-$3 + 1),$5,$9}}' {input.events} """
+        """ | sort -k1,1 -k2,2n """
+        """ | tr [:blank:] '\t' > {output.freec_del} && """
+        """awk '$6~/gain/ {{print "chr"$2,$3,$4,($4-$3 + 1),$5,$9}}' {input.events} """
         """ | awk '{{if ($4 > {params.SIZE_CUTOFF} && $6 < {params.FREEC_PVAL}){{print $0,"DUP"}}}}' """
-        """ | sort -k1,1 -k2,2n > {output.freec_dup}"""
+        """ | sort -k1,1 -k2,2n """
+        """ | tr [:blank:] '\t' > {output.freec_dup}"""
 
 rule cnvkit_filter:
     input:
@@ -58,10 +61,12 @@ rule cnvkit_filter:
         ## The last pipe is to sort first digit of chromosome number numerically
         """awk '$7<2 {{print $2,$3,$4,($4-$3 + 1),$7,"NA"}}' {input.events} """
         """ | awk '{{if ($4 > {params.SIZE_CUTOFF}){{print $0,"DEL"}}}}' """
-        """ | sort -k1,1 -k2,2n > {output.cnvkit_del} && """
+        """ | sort -k1,1 -k2,2n """
+        """ | tr [:blank:] '\t' > {output.cnvkit_del} && """
         """awk '$7>2 {{print $2,$3,$4,($4-$3 + 1),$7,"NA"}}' {input.events} """
         """ | awk '{{if ($4 > {params.SIZE_CUTOFF}){{print $0,"DUP"}}}}' """
-        """ | sort -k1,1 -k2,2n > {output.cnvkit_dup}"""
+        """ | sort -k1,1 -k2,2n """
+        """ | tr [:blank:] '\t' > {output.cnvkit_dup}"""
 
 rule manta_filter:
     input:
@@ -79,7 +84,28 @@ rule manta_filter:
         ## The first awk looks at column 6 to filter out for loss/gain (DEL/DUP). Then it prints out 6 of the 7 columns above. Put NA for both p-value and copy number since MANTA results don't have these values.
         ## the first awk also filters out for CNV length
         ## The last pipe is to sort first digit of chromosome number numerically
-        """awk '$6~/DEL/ {{if ($5 > {params.SIZE_CUTOFF}) {{print$2,$3,$4,$5,"NA","NA",$6}}}}' {input} """
-        """ | sort -k1,1 -k2,2n > {output.manta_del} && """
-        """awk '$6~/DUP/ {{if ($5 > {params.SIZE_CUTOFF}) {{print$2,$3,$4,$5,"NA","NA",$6}}}}' {input} """
-        """ | sort -k1,1 -k2,2n > {output.manta_dup}"""
+        """awk '$6~/DEL/ {{if ($5 > {params.SIZE_CUTOFF}) {{print "chr"$2,$3,$4,$5,"NA","NA",$6}}}}' {input} """
+        """ | sort -k1,1 -k2,2n """
+        """ | tr [:blank:] '\t' > {output.manta_del} && """
+        """awk '$6~/DUP/ {{if ($5 > {params.SIZE_CUTOFF}) {{print "chr"$2,$3,$4,$5,"NA","NA",$6}}}}' {input} """
+        """ | sort -k1,1 -k2,2n """
+        """ | tr [:blank:] '\t' > {output.manta_dup}"""
+
+
+rule filter_telomeres:
+    input:
+        ## The helper script and the bad-segment BED list are both looked up in the directory given by config["scripts"]; the BED file to filter is selected by the sample/caller/dupdel wildcards
+        script=os.path.join(config["scripts"], "get_rid_bad_segments.py"),
+        bad_list=os.path.join(config["scripts"], "bad_chromosomal_seg_updated_merged.bed"),
+        bedfile="../../scratch/interim/{sample}.{caller}.{dupdel}.bed"
+    output:
+        ## The output has the same path as the input BED file, with ".filtered" inserted before the ".bed" extension
+        filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed"
+    wildcard_constraints:
+        caller = "cnvkit|freec|manta",
+        dupdel = "del|dup"
+    threads: 1
+    shell:
+        ## Run the script with python3, passing the bad-segment list as --reference and the CNV BED file as --file; the script's stdout is redirected into the filtered BED file
+        "python3 {input.script} --reference {input.bad_list} --file {input.bedfile} > {output.filtered_bed}"
+