Snakefile-ncov

"""
Snakefile specific for running assembly for the hCoV-19 sequences of
the Seattle Flu Study.

To run:
    $ snakemake -k --snakefile Snakefile-ncov --configfile <config_file.json>
"""
import json


# Include the main assembly pipeline
include: "Snakefile-base-flu"

# input function for the rule aggregate
def aggregate_input(wildcards):
    """
    Returns input for rule aggregate based on output from checkpoint align_rate.
    Set minimum align rate in config under "min_align_rate".
    """
    with open(checkpoints.mapped_reads.get(sample=wildcards.sample, reference=wildcards.reference).output[0]) as f:
        summary = json.load(f)
        all_segments_aligned = summary["all_segments_aligned"]
        min_reads = summary["minimum_reads_required"]
        mapped = summary["mapped_reads"]

        if not all_segments_aligned or mapped <= min_reads:
            return rules.not_mapped.output.not_mapped
        else:
            return rules.fasta_headers.output.masked_consensus

rule all:
    input:
        # pre_fastqc = expand("summary/pre_trim_fastqc/{fname}_fastqc.html",
        #        fname=glob.glob(config['fastq_directory']),
        post_fastqc = expand("summary/post_trim_fastqc/{sample}.trimmed_{tr}_fastqc.{ext}",
               sample=all_ids,
               tr=["1P", "1U", "2P", "2U"],
               ext=["zip", "html"]),
        bamstats = expand("summary/bamstats/{reference}/{sample}.coverage_stats.txt", filtered_product,
               sample=all_ids,
               reference=all_references),
        aggregate = expand("summary/aggregate/{reference}/{sample}.log", filtered_product,
                sample=all_ids,
                reference=all_references),


rule fasta_headers:
    input:
        masked_consensus = rules.vcf_to_consensus.output
    output:
        masked_consensus = "consensus_genomes/{reference}/{sample}.masked_consensus.fasta"
    shell:
        """
        cat {input.masked_consensus} | \
            perl -pi -e 's/(?<=>)[^>|]*(?<=|)/{wildcards.sample}/g' > \
            {output.masked_consensus}.temp
        awk '{{split(substr($0,2),a,"|"); \
            if(a[2]) print ">"a[1]"|"a[1]"-"a[2]"-"a[3]"|"a[2]"|"a[3]; \
            else print; }}' \
            {output.masked_consensus}.temp > {output.masked_consensus}
        rm {output.masked_consensus}.temp
        """


rule aggregate:
    input:
        aggregate_input = aggregate_input
    output:
        aggregate_summary = "summary/aggregate/{reference}/{sample}.log"
    run:
        shell("echo 'Final output: {input.aggregate_input}' > {output}")