Skip to content

Commit

Permalink
condition scans based on explicit Mb region rather than a single marker
Browse files Browse the repository at this point in the history
  • Loading branch information
tomsasani committed Jan 19, 2024
1 parent 345d862 commit d376670
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 48 deletions.
84 changes: 41 additions & 43 deletions ihd/run_ihd_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,13 @@
compute_spectra,
perform_permutation_test,
perform_ihd_scan,
compute_genotype_similarity,
compute_manual_chisquare,
compute_manual_cosine_distance,
calculate_confint,
get_covariate_matrix,
calculate_covariate_by_marker,
)
from schema import IHDResultSchema, MutationSchema
import numba
import allel
from scipy.spatial.distance import squareform


def filter_mutation_data(
Expand Down Expand Up @@ -105,26 +101,28 @@ def main(args):
replace_dict = config_dict["genotypes"]
geno_asint = geno.replace(replace_dict).replace({1: np.nan})

# if specified, remove all markers with r2 > X with selected markers
if args.adj_marker is not None:
# measure LD between all pairs of markers
cols = [c for c in geno_asint.columns if c != "marker"]
gn = geno_asint[cols].values
gn[np.isnan(gn)] = -1
r = allel.rogers_huff_r(gn)
ld = squareform(r**2)
np.fill_diagonal(ld, 1.0)
marker_idxs = geno_asint[geno_asint["marker"] == args.adj_marker].index.values
_, ld_markers = np.where(ld[marker_idxs] >= 0.1)
geno_asint = geno_asint.iloc[geno_asint.index.difference(ld_markers)]
if args.adj_region != "None":
chrom = args.adj_region.split(":")[0]
start, end = list(map(float, args.adj_region.split(":")[1].split("-")))
# find markers within this region
markers_to_filter = markers[
(markers["chromosome"] == chrom)
& (markers["Mb"] >= start)
& (markers["Mb"] <= end)
]["marker"].unique()
marker_idxs = geno_asint[
geno_asint["marker"].isin(markers_to_filter)
].index.values
geno_asint = geno_asint.iloc[geno_asint.index.difference(marker_idxs)]

# convert genotype values to a matrix
geno_asint_filtered_matrix = geno_asint[samples].values
# get an array of marker names at the filtered genotyped loci
markers_filtered = geno_asint["marker"].values

# compute similarity between allele frequencies at each marker
genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix)
# genotype_similarity = compute_genotype_similarity(geno_asint_filtered_matrix)
genotype_similarity = np.ones(geno_asint_filtered_matrix.shape[0])
distance_method = compute_manual_cosine_distance
if args.distance_method == "chisquare":
distance_method = compute_manual_chisquare
Expand All @@ -149,7 +147,7 @@ def main(args):
genotype_similarity,
covariate_ratios,
distance_method=distance_method,
adjust_statistics=True,
adjust_statistics=False,
)

res_df = pd.DataFrame(
Expand All @@ -173,7 +171,7 @@ def main(args):
distance_method=distance_method,
n_permutations=args.permutations,
progress=args.progress,
adjust_statistics=True,
adjust_statistics=False,
)

# compute the Nth percentile of the maximum distance
Expand All @@ -186,31 +184,31 @@ def main(args):
# the peak observed distance
geno_asint_filtered_merged = geno_asint.merge(markers, on="marker")

combined_conf_int_df = []

conf_int_chroms = ["4", "6"]
for chrom, chrom_df in geno_asint_filtered_merged.groupby("chromosome"):
if chrom not in conf_int_chroms:
continue

chrom_genotype_matrix = chrom_df[samples].values
# compute confidence intervals on the chromosome
conf_int_lo, conf_int_hi, peak_markers = calculate_confint(
spectra,
chrom_genotype_matrix,
covariate_matrix,
distance_method=distance_method,
adjust_statistics=True,
conf_int=90.,
n_permutations=1_000,
)
# combined_conf_int_df = []

# conf_int_chroms = ["4", "6"]
# for chrom, chrom_df in geno_asint_filtered_merged.groupby("chromosome"):
# if chrom not in conf_int_chroms:
# continue

# chrom_genotype_matrix = chrom_df[samples].values
# # compute confidence intervals on the chromosome
# conf_int_lo, conf_int_hi, peak_markers = calculate_confint(
# spectra,
# chrom_genotype_matrix,
# covariate_matrix,
# distance_method=distance_method,
# adjust_statistics=True,
# conf_int=90.,
# n_permutations=1_000,
# )

conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])]
combined_conf_int_df.append(conf_int_df[["chromosome", "marker", "Mb"]])
# conf_int_df = chrom_df.iloc[np.array([conf_int_lo, conf_int_hi])]
# combined_conf_int_df.append(conf_int_df[["chromosome", "marker", "Mb"]])

combined_conf_int_df = pd.concat(combined_conf_int_df)
# combined_conf_int_df = pd.concat(combined_conf_int_df)

combined_conf_int_df.to_csv(f"{args.out}.ci.csv", index=False)
# combined_conf_int_df.to_csv(f"{args.out}.ci.csv", index=False)

res_df.to_csv(args.out, index=False)

Expand Down Expand Up @@ -273,10 +271,10 @@ def main(args):
help="""If specified, use this column to perform a stratified permutation test by only permuting BXDs within groups defined by the column to account for population structure.""",
)
p.add_argument(
"-adj_marker",
"-adj_region",
default=None,
type=str,
help="""Comma-separated list of markers that we should adjust for when performing scan.""",
help="""If specified, a chromosomal region (chr:start-end) that we should adjust for in our AMSD scans. Start and end coordinates should be specified in Mb. Default is None""",
)
args = p.parse_args()

Expand Down
2 changes: 1 addition & 1 deletion ihd/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def compute_haplotype_distance(
@numba.njit
def shuffle_spectra(
spectra: np.ndarray,
groups: np.ndarray = None,
groups: np.ndarray,
) -> np.ndarray:
"""Randomly shuffle the rows of a 2D numpy array of
mutation spectrum data of size (N, M), where N is the number
Expand Down
9 changes: 5 additions & 4 deletions scripts/rules/run_ihd.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
condition2markers = {"N": None, "D": "rs27509845", "B": "rs27509845"}
# region containing significant markers in unconditioned scan
condition2region = {"N": None, "D": "4:103.634906-125.068158", "B": "4:103.634906-125.068158"}


rule run_ihd:
Expand All @@ -8,18 +9,18 @@ rule run_ihd:
config = "data/json/{cross}.json",
py_script = "ihd/run_ihd_scan.py"
output: "csv/{cross}.k{k}.genome.condition_on_{condition}.results.csv"
params: adj_marker = lambda wc: condition2markers[wc.condition]
params: adj_region = lambda wc: condition2region[wc.condition]
shell:
"""
python {input.py_script} --mutations {input.singletons} \
--config {input.config} \
--out {output} \
-k {wildcards.k} \
-distance_method cosine \
-permutations 10000 \
-permutations 1000 \
-stratify_column true_epoch \
-adj_region {params.adj_region} \
-threads 4 \
-adj_marker {params.adj_marker} \
-progress
"""

Expand Down

0 comments on commit d376670

Please sign in to comment.