Skip to content

Commit

Permalink
add code for BC output
Browse files Browse the repository at this point in the history
  • Loading branch information
pi-zz-a committed Nov 7, 2023
1 parent b68dfc6 commit 2e534d1
Show file tree
Hide file tree
Showing 10 changed files with 187 additions and 3 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ results
.ipynb_checkpoints
*.pyc
docs/_build/
config/sbatch_omics_cluster.yaml

1 change: 1 addition & 0 deletions config/example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ experiments:
label_file: resources/labels.tsv # optional
configs:
exampleConfig:
BC_output: true
filter:
bc_threshold: 10
DNA:
Expand Down
5 changes: 5 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#### imports ####
#################
import pandas as pd
import pdb

##################
#### SETTINGS ####
Expand Down Expand Up @@ -217,6 +218,10 @@ rule all_experiments:
getOutputProjectConditionAssignmentConfig_helper(
"results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_minThreshold_merged.combined.tsv.gz"
),
# assignment individual barcode counts
getOutputProjectConditionAssignmentConfigBCoutput_helper(
"results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz"
),
# assignment statistic
getOutputProjectConditionAssignmentConfig_helper(
"results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_all_barcodesPerInsert_box.png"
Expand Down
3 changes: 2 additions & 1 deletion workflow/envs/python27.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ channels:
- conda-forge
dependencies:
- htslib
- pysam>0.16.0
#- pysam>0.16.0
- pysam
- python=2.7
- samtools
1 change: 1 addition & 0 deletions workflow/envs/python3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ dependencies:
- matplotlib
- numpy
- pandas
- polars
- python
41 changes: 41 additions & 0 deletions workflow/rules/assigned_counts.smk
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ rule assigned_counts_dna_rna_merge:
script=getScript("count/merge_label.py"),
output:
counts="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.tsv.gz",
bc_counts="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_barcode_assigned_counts.tsv.gz",
stats="results/experiments/{project}/statistic/assigned_counts/{assignment}/{config}/{condition}_{replicate}_merged_assigned_counts.statistic.tsv.gz",
params:
minRNACounts=lambda wc: config["experiments"][wc.project]["configs"][
Expand All @@ -104,6 +105,7 @@ rule assigned_counts_dna_rna_merge:
minDNACounts=lambda wc: config["experiments"][wc.project]["configs"][
wc.config
]["filter"]["DNA"]["min_counts"],
BC_output=lambda wc: config["experiments"][wc.project]["configs"][wc.config]["BC_output"],
log:
temp(
"results/logs/assigned_counts/{assignment}/dna_rna_merge.{project}.{condition}.{replicate}.{config}.log"
Expand All @@ -114,6 +116,8 @@ rule assigned_counts_dna_rna_merge:
--minRNACounts {params.minRNACounts} --minDNACounts {params.minDNACounts} \
--assignment {input.association} \
--output {output.counts} \
--toggleBC {params.BC_output} \
--bcOutput {output.bc_counts} \
--statistic {output.stats} &> {log}
"""

Expand Down Expand Up @@ -168,6 +172,43 @@ rule assigned_counts_make_master_tables:
--statistic {output.statistic} &> {log}
"""

rule combine_replicates_barcode_output:
conda:
"../envs/python3.yaml"
input:
bc_counts=lambda wc: expand("results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_barcode_assigned_counts.tsv.gz",
replicate=getReplicatesOfCondition(wc.project, wc.condition),
project=wc.project,
condition=wc.condition,
assignment=wc.assignment,
config=wc.config),
script=getScript("count/merge_replicates_barcode_counts.py"),
output:
bc_merged="results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_allreps_merged_barcode_assigned_counts.tsv.gz",
params:
thresh=lambda wc: config["experiments"][wc.project]["configs"][wc.config]["filter"] \
["bc_threshold"],
replicates=lambda wc: ",".join(
getReplicatesOfCondition(wc.project, wc.condition)
),
bc_counts=lambda wc: ",".join(expand("results/experiments/{project}/assigned_counts/{assignment}/{config}/{condition}_{replicate}_barcode_assigned_counts.tsv.gz",
replicate=getReplicatesOfCondition(wc.project, wc.condition),
project=wc.project,
condition=wc.condition,
assignment=wc.assignment,
config=wc.config,)),
log:
temp(
"results/logs/assigned_counts/combine_replicates.{project}.{condition}.{config}.{assignment}.barcodes.log"
),
shell:
"""
python {input.script} --counts {params.bc_counts} \
--threshold {params.thresh} \
--replicates {params.replicates} \
--output {output.bc_merged} &> {log}
"""


rule assigned_counts_combine_replicates:
"""
Expand Down
31 changes: 30 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#### Global functions ####
################################
from snakemake.workflow import srcdir
import pdb

SCRIPTS_DIR = srcdir("../scripts")

Expand Down Expand Up @@ -115,6 +116,10 @@ def getConditions(project):
return list(exp.Condition.unique())


def getBCoutput(project, conf):
return config["experiments"][project]["configs"][conf]["BC_output"]


def getProjectAssignments(project):
if (
"assignments" in config["experiments"][project]
Expand Down Expand Up @@ -145,7 +150,7 @@ def getVariantsBCThreshold(project):
def getFW(project, condition, replicate, rnaDna_type):
exp = getExperiments(project)
exp = exp[exp.Condition == condition]
exp = exp[exp.Replicate.astype(str) == replicate]
exp = exp[exp.Replicate.astype(str) == replicate]
return [
"%s/%s" % (config["experiments"][project]["data_folder"], f)
for f in exp["%s_BC_F" % rnaDna_type].iloc[0].split(";")
Expand Down Expand Up @@ -341,6 +346,30 @@ def getOutputProjectConditionAssignmentConfig_helper(file):
return output


def getOutputProjectConditionAssignmentConfigBCoutput_helper(file):
"""
Inserts {project}, {condition}, {assignment}, {config} and {BC output} (from configs of project) from config into given file.
"""
output = []
projects = getProjects()
for project in projects:
try:
conditions = getConditions(project)
for condition in conditions:
for c in getConfigs(project):
if getBCoutput(project, c):
output += expand(
file,
project=project,
condition=condition,
assignment=getProjectAssignments(project),
config=c
)
except MissingAssignmentInConfigException:
continue
return output


def getOutputProjectAssignmentConfig_helper(file, betweenReplicates=False):
"""
Inserts {project}, {assignment} and {config} (from configs of project) from config into given file.
Expand Down
3 changes: 3 additions & 0 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,9 @@ properties:
^([^_\.]+)$:
type: object
properties:
BC_output:
type: boolean
default: false
filter:
type: object
properties:
Expand Down
19 changes: 18 additions & 1 deletion workflow/scripts/count/merge_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,25 @@
required=True,
type=click.Path(writable=True),
help='Statistic output file.')
def cli(counts_file, assignment_file, minRNACounts, minDNACounts, output_file, statistic_file):
@click.option('--toggleBC',
'bc_output',
default=False,
show_default=True,
type=click.Path(writable=True),
help='Output individual barcode counts')
@click.option('--bcOutput',
'bc_output_file',
default="",
type=click.Path(writable=True),
help='Output file for individual barcode counts')
def cli(counts_file, assignment_file, minRNACounts, minDNACounts, output_file, statistic_file, bc_output, bc_output_file):
# pseudocount = 1 if minRNACounts == 0 or minDNACounts == 0 else 0
pseudocountDNA = 1 if minDNACounts == 0 else 0
pseudocountRNA = 1 if minRNACounts == 0 else 0

if bc_output and not bc_output_file:
raise click.BadArgumentUsage("Barcode output file has to be provided if toggleBC is on.")

# statistic
statistic = pd.DataFrame(data={'oligos design': [0], 'barcodes design': [0],
'oligos dna/rna': [0], 'barcodes dna/rna': [0],
Expand Down Expand Up @@ -84,6 +98,9 @@ def cli(counts_file, assignment_file, minRNACounts, minDNACounts, output_file, s
statistic['matched barcodes'] = statistic['barcodes dna/rna'] - statistic['unknown barcodes dna/rna']
statistic['% matched barcodes'] = (statistic['matched barcodes']/statistic['barcodes dna/rna'])*100.0

if bc_output:
counts.to_csv(bc_output_file, index=False, sep='\t', compression='gzip')

# remove Barcorde. Not needed anymore
counts.drop(['Barcode'], axis=1, inplace=True)

Expand Down
84 changes: 84 additions & 0 deletions workflow/scripts/count/merge_replicates_barcode_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import polars as pl
import os
import click
import gzip

@click.command()
@click.option('--counts',
'counts_files',
required=True,
type=str,
default="/data/humangen_kircherlab/pia/mprasnakeflow_test/count_basic/results/experiments/exampleCount/assigned_counts/fromFile/exampleConfig/HepG2_1_barcode_assigned_counts.tsv.gz,/data/humangen_kircherlab/pia/mprasnakeflow_test/count_basic/results/experiments/exampleCount/assigned_counts/fromFile/exampleConfig/HepG2_2_barcode_assigned_counts.tsv.gz,/data/humangen_kircherlab/pia/mprasnakeflow_test/count_basic/results/experiments/exampleCount/assigned_counts/fromFile/exampleConfig/HepG2_3_barcode_assigned_counts.tsv.gz",
help='Assigned barcode count files, one for each replicate separated with a comma.')
@click.option('--threshold',
'bc_thresh',
required=False,
default=10,
type=int,
help="Number of required barcodes (default 10)")
@click.option('--replicates',
'replicates',
default="1,2,3",
type=str,
help="List of replicate names, separated with a comma.")
@click.option('--output',
'output_file',
required=True,
default="/data/humangen_kircherlab/pia/mprasnakeflow_test/count_basic/test_output.tsv.gz",
type=click.Path(writable=True),
help='Output file.')

def cli(counts_files, bc_thresh, replicates, output_file):
"""
Merge the associated barcode count files of all replicates.
"""

filenames = counts_files.split(",")
reps = replicates.split(",")

# ensure there are as many replicates as there are files
if len(reps) != len(filenames):
raise(click.BadParameter('Number of replicates ({}) doesn\'t equal the number of files ({}).'
.format(len(reps), len(filenames))))

# check if every file exists
for file in filenames:
if not os.path.exists(file):
raise(click.BadParameter('{}: file not found'.format(file)))

all_reps = []
for file in filenames:
curr_rep = -1
# find the replicate name of the current file
for rep in reps:
if rep in os.path.basename(file).split("_")[1]:
curr_rep=rep
break
if curr_rep == -1:
raise(click.BadParameter('{}: incorrect file'.format(file)))
all_reps.append(pl.read_csv(file, separator="\t").with_columns(pl.lit(curr_rep).alias("replicate")))

df = pl.concat(all_reps)
df = df.filter(pl.col("name") != "no_BC")
# only keep oligo's with a number of barcodes of at least the given threshold
df_filtered = df.filter(pl.col("Barcode").count().over(["name", "replicate"]) >= bc_thresh)

# pivot table to make a dna and rna count column for every replicate
df_filtered = df_filtered.pivot(values=["dna_count","rna_count"],
index=["Barcode", "name"],
columns="replicate")
df_filtered = df_filtered.sort("name")

# order columns to have dna then rna count of each replicate
col_order = sum([["dna_count_replicate_" + rep, "rna_count_replicate_" + rep] for rep in reps], [])
df_filtered = df_filtered.select(["Barcode", "name"] + col_order)

with gzip.open(output_file, 'wb') as f:
df_filtered.write_csv(f, separator="\t")



if __name__ == '__main__':
cli()


0 comments on commit 2e534d1

Please sign in to comment.