Skip to content

Commit

Permalink
feat: make filtering consistent
Browse files Browse the repository at this point in the history
  • Loading branch information
visze committed Apr 5, 2022
1 parent 84dd132 commit 5f7a4c5
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 26 deletions.
4 changes: 2 additions & 2 deletions workflow/rules/assigned_counts.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ rule dna_rna_merge:
params:
minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][
wc.config
]["minRNACounts"],
]["filter"]["RNA"]["minCounts"],
minDNACounts=lambda wc: config["experiments"][wc.project]["configs"][
wc.config
]["minDNACounts"],
]["filter"]["DNA"]["minCounts"],
log:
"logs/experiments/{project}/assigned_counts/{assignment}/{config}/dna_rna_merge.{condition}_{replicate}.log",
shell:
Expand Down
32 changes: 18 additions & 14 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -388,8 +388,12 @@ def useSampling(project, conf, dna_or_rna):

def withoutZeros(project, conf):
return (
config["experiments"][project]["configs"][conf]["minDNACounts"] > 0
and config["experiments"][project]["configs"][conf]["minRNACounts"] > 0
config["experiments"][project]["configs"][conf]["filter"]["DNA"]["minCounts"]
> 0
and config["experiments"][project]["configs"][conf]["filter"]["RNA"][
"minCounts"
]
> 0
)


Expand Down Expand Up @@ -449,6 +453,16 @@ def aggregate_demultiplex_input(project):
return output


def counts_getFilterConfig(project, conf, dna_or_rna, command):
value = config["experiments"][project]["configs"][conf]["filter"][dna_or_rna][
command
]
if isinstance(value, int):
return "--%s %d" % (command, value)
else:
return "--%s %f" % (command, value)


def counts_getSamplingConfig(project, conf, dna_or_rna, command):
if useSampling(project, conf, dna_or_rna):
if dna_or_rna in config["experiments"][project]["configs"][conf]["sampling"]:
Expand All @@ -462,19 +476,9 @@ def counts_getSamplingConfig(project, conf, dna_or_rna, command):
dna_or_rna
][command]
if isinstance(value, int):
return "--%s %d" % (
command,
config["experiments"][project]["configs"][conf]["sampling"][
dna_or_rna
][command],
)
return "--%s %d" % (command, value)
else:
return "--%s %f" % (
command,
config["experiments"][project]["configs"][conf]["sampling"][
dna_or_rna
][command],
)
return "--%s %f" % (command, value)

return ""

Expand Down
23 changes: 16 additions & 7 deletions workflow/rules/counts.smk
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ checkpoint create_demultiplexed_BAM_umi:
"""



rule aggregate_demultiplex:
input:
lambda wc: aggregate_demultiplex_input(wc.project),
Expand Down Expand Up @@ -236,6 +235,9 @@ rule final_counts_umi_samplerer:
wc.project, wc.config, wc.type, "total"
),
seed=lambda wc: counts_getSamplingConfig(wc.project, wc.config, wc.type, "seed"),
filtermincounts=counts_getFilterConfig(
wc.project, wc.config, wc.type, "minCounts"
),
log:
"logs/experiments/{project}/counts/final_counts_umi_samplerer.{condition}_{replicate}_{type}_{config}.log",
shell:
Expand All @@ -245,7 +247,8 @@ rule final_counts_umi_samplerer:
{params.downsampling} \
{params.samplingtotal} \
{params.seed} \
--output {output} > {log}
{params.filtermincounts} \
--output {output} &> {log}
"""


Expand All @@ -264,6 +267,12 @@ rule dna_rna_merge_counts:
"results/experiments/{project}/{raw_or_assigned}/{condition}_{replicate}.merged.config.{config}.tsv.gz",
params:
zero=lambda wc: "false" if withoutZeros(wc.project, wc.config) else "true",
minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][
wc.config
]["filter"]["RNA"]["minCounts"],
minDNACounts=lambda wc: config["experiments"][wc.project]["configs"][
wc.config
]["filter"]["DNA"]["minCounts"],
log:
"logs/experiments/{project}/{raw_or_assigned}/dna_rna_merge_counts.{condition}_{replicate}_{config}.log",
shell:
Expand All @@ -274,13 +283,13 @@ rule dna_rna_merge_counts:
echo "Using no zeros";
join -1 1 -2 1 -t"$(echo -e '\\t')" \
<( zcat {input.dna} | sort ) \
<( zcat {input.rna} | sort) | \
gzip -c > {output};
<( zcat {input.rna} | sort);
else
echo "Allowing zeros!"
join -e 0 -a1 -a2 -t"$(echo -e '\\t')" -o 0 1.2 2.2 \
<( zcat {input.dna} | sort ) \
<( zcat {input.rna} | sort) | \
gzip -c > {output}
fi > {log}
<( zcat {input.rna} | sort);
fi | \
awk -v 'OFS=\\t' '{{if ($2 >= {params.minDNACounts} && $3 >= {params.minRNACounts}) {{print $0}}}}' | \
gzip -c > {output};
"""
13 changes: 10 additions & 3 deletions workflow/scripts/count/samplerer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,17 @@
('seed'),
required=False, type = int,
help= 'Use seed for random number generator')
@click.option('--minCounts',
('min_counts_val'),
required=False, type = int,
help= 'Remove all BCs with lower values.')
@click.option('--output',
'output_file',
required=True,
type=click.Path(writable=True),
help='Output file.')

def cli(input_file, prop_val, total_val, threshold_val, seed, output_file):
def cli(input_file, prop_val, total_val, threshold_val, seed, min_counts_val, output_file):
# set seed if defined
if seed:
random.seed(seed)
Expand All @@ -62,9 +66,12 @@ def cli(input_file, prop_val, total_val, threshold_val, seed, output_file):

click.echo("Adjusting barcodes with given proportion %f" % pp)
df_.iloc[:,1] = df_.iloc[:,1].astype(int).apply(lambda x: int(math.floor(x*pp) + (0.0 if random.random() > (x*pp-math.floor(x*pp)) else 1.0)))
if threshold_val != None:
if threshold_val:
click.echo("Adjusting barcodes with counts > threshold...")
df_.iloc[:,1] = df_.iloc[:,1].astype(int).apply(lambda x: threshold_val if x > threshold_val else x)
df_.iloc[:,1] = df_.iloc[:,1].astype(int).apply(lambda x: threshold_val if x > threshold_val else x)

if min_counts_val:
df_ = df_[df_.iloc[:,1] >= min_counts_val]

click.echo("Writing count file...")
df_.to_csv(output_file, sep="\t", index=False, header=None, compression='gzip')
Expand Down

0 comments on commit 5f7a4c5

Please sign in to comment.