Skip to content

Commit

Permalink
feat: master variant table
Browse files Browse the repository at this point in the history
  • Loading branch information
visze committed Apr 8, 2022
1 parent 94cbef4 commit 6bda47c
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 3 deletions.
17 changes: 16 additions & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ report: "report/workflow.rst"
wildcard_constraints:
project="[^/]+",
assignment="[^/]+",
condition="[^/]+",
condition="[^/_]+",
replicate="[^/_]+",


##### localrules #####
Expand Down Expand Up @@ -109,6 +110,10 @@ rule all:
"results/experiments/{project}/stats/variants/{assignment}/{config}/correlation_variantTable.tsv",
True,
),
getOutputVariants_helper(
"results/experiments/{project}/variants/{assignment}/{config}/{condition}_variantTable.tsv.gz",
False,
),


rule all_assignments:
Expand Down Expand Up @@ -231,6 +236,16 @@ rule all_stats_BCNucleotideComposition:
"results/experiments/{project}/stats/counts/BCNucleotideComposition/{condition}_{replicate}_{type}.tsv.gz"
),

rule all_variants:
input:
getOutputVariants_helper(
"results/experiments/{project}/stats/variants/{assignment}/{config}/correlation_variantTable.tsv",
True,
),
getOutputVariants_helper(
"results/experiments/{project}/variants/{assignment}/{config}/{condition}_variantTable.tsv.gz",
False,
),

###################
## SUB-WORKFLOWS ##
Expand Down
1 change: 0 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,6 @@ def getOutputVariants_helper(file, betweenReplicates=False):
)
return output


def getAssignment_helper(file):
return expand(
file,
Expand Down
98 changes: 98 additions & 0 deletions workflow/scripts/variants/generateMasterVariantTable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Author: Max Schubach 2022

import pandas as pd
import numpy as np
import click


# options
@click.command()
@click.option('--input',
'input_files',
multiple=True,
required=True,
type=click.Path(exists=True, readable=True),
help='Variant file for each replicate.')
@click.option('--minRNACounts',
'minRNACounts',
default=1,
show_default=True,
type=int,
help='Minimum number of RNA counts required per barcode. If 0 pesudocounts are used.')
@click.option('--minDNACounts',
'minDNACounts',
default=0,
show_default=True,
type=int,
help='Minimum number of DNA counts required per barcode. If 0 pesudocounts are used.')
@click.option('--output',
'output_file',
required=True,
type=click.Path(writable=True),
help='Output file with combined variant expression (of all replicates).')
def cli(input_files, minRNACounts, minDNACounts, output_file):

pseudocountDNA = 1 if minDNACounts == 0 else 0
pseudocountRNA = 1 if minRNACounts == 0 else 0

df = pd.DataFrame()
# variant files
click.echo("Read variant files per replicate...")
for input_file in input_files:
df = pd.concat([df, pd.read_csv(input_file, header=0, sep="\t")], axis=0, sort=False)

click.echo("Create new expression values...")
df = df.groupby(['ID', 'REF', 'ALT']).agg(
dna_counts_REF=('dna_counts_REF', sum),
rna_counts_REF=('rna_counts_REF', sum), n_obs_bc_REF=('n_obs_bc_REF', sum),
dna_counts_ALT=('dna_counts_ALT', sum), rna_counts_ALT=('rna_counts_ALT', sum),
n_obs_bc_ALT=('n_obs_bc_ALT', sum)
).reset_index()

scaling = 10**6



df_total = df[['dna_counts_REF', 'rna_counts_REF', 'n_obs_bc_REF',
'dna_counts_ALT', 'rna_counts_ALT', 'n_obs_bc_ALT']].sum()

total_bc = df_total['n_obs_bc_REF'] + df_total['n_obs_bc_ALT']
total_dna = df_total['dna_counts_REF'] + df_total['dna_counts_ALT'] + total_bc * pseudocountDNA
total_rna = df_total['rna_counts_REF'] + df_total['rna_counts_ALT'] + total_bc * pseudocountRNA

scaling = 10**6

def normalize(df, total, count_type, ref_alt, pseudocount, scaling):
return(((df["%s_counts_%s" % (count_type, ref_alt)] + pseudocount * df['n_obs_bc_%s' % ref_alt])/df['n_obs_bc_%s' % ref_alt])/total*scaling)

for ref_alt in ["REF", "ALT"]:
for count_type in ["dna", "rna"]:
total = total_rna if count_type == 'rna' else total_dna
pseudocount = pseudocountRNA if count_type == 'rna' else pseudocountDNA

df['%s_normalized_%s' % (count_type, ref_alt)] = normalize(
df, total, count_type, ref_alt, pseudocount, scaling)

df['ratio_%s' % ref_alt] = df['rna_normalized_%s' % ref_alt] / df['dna_normalized_%s' % ref_alt]
df['log2_%s' % ref_alt] = np.log2(df['ratio_%s' % ref_alt])

df["log2_expression"] = np.log2(df['ratio_ALT']/df['ratio_REF'])

# fill NA and set correct output types
df.fillna(0, inplace=True)
df = df.astype(dtype={'dna_counts_REF': 'int64', 'rna_counts_REF': 'int64', 'n_obs_bc_REF': 'int64',
'dna_counts_ALT': 'int64', 'rna_counts_ALT': 'int64', 'n_obs_bc_ALT': 'int64'}, copy=False)

df = df.reindex(columns=["ID", "REF", "ALT", "dna_counts_REF", "rna_counts_REF",
"dna_normalized_REF", "rna_normalized_REF", "ratio_REF", "log2_REF",
"n_obs_bc_REF", "dna_counts_ALT", "rna_counts_ALT",
"dna_normalized_ALT", "rna_normalized_ALT", "ratio_ALT",
"log2_ALT", "n_obs_bc_ALT", "log2_expression"])

# write output
click.echo("Write files...")
df.to_csv(output_file, index=False, sep='\t', header=True, compression='gzip')


if __name__ == '__main__':
cli()
2 changes: 1 addition & 1 deletion workflow/scripts/variants/generateVariantTable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Author: Max Schubach 20202
# Author: Max Schubach 2022

import pandas as pd
import numpy as np
Expand Down

0 comments on commit 6bda47c

Please sign in to comment.