Skip to content

Commit

Permalink
fix(distance_features): correct mean distance equation and correct rows with negative values (#889)
Browse files Browse the repository at this point in the history

* fix(distance_features): hack to set to null negative values

* fix(distance_features): correct mean distance equation
  • Loading branch information
ireneisdoomed authored Oct 31, 2024
1 parent 7bb74a5 commit fa38ca6
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 16 deletions.
37 changes: 22 additions & 15 deletions src/gentropy/dataset/l2g_features/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,22 @@ def common_distance_feature_logic(
distances_dataset = variant_index.get_distance_to_gene(distance_type=distance_type)
if "Mean" in feature_name:
# Weighting by the SNP contribution is only applied when we are averaging all distances
distance_score_expr = (
f.lit(genomic_window) - f.col(distance_type) + f.lit(1)
) * f.col("posteriorProbability")
agg_expr = f.mean(f.col("distance_score"))
df = study_loci_to_annotate.df.withColumn(
"variantInLocus", f.explode_outer("locus")
).select(
"studyLocusId",
f.col("variantInLocus.variantId").alias("variantId"),
f.col("variantInLocus.posteriorProbability").alias("posteriorProbability"),
)
distance_score_expr = (
f.lit(genomic_window) - f.col(distance_type) + f.lit(1)
) * f.col("posteriorProbability")
agg_expr = f.sum(f.col("distance_score"))
elif "Sentinel" in feature_name:
df = study_loci_to_annotate.df.select("studyLocusId", "variantId")
# For minimum distances we calculate the unweighted distance between the sentinel (lead) and the gene.
distance_score_expr = f.lit(genomic_window) - f.col(distance_type) + f.lit(1)
agg_expr = f.first(f.col("distance_score"))
df = study_loci_to_annotate.df.select("studyLocusId", "variantId")
return (
df.join(
distances_dataset.withColumnRenamed("targetId", "geneId"),
Expand All @@ -66,10 +66,15 @@ def common_distance_feature_logic(
)
.withColumn(
"distance_score",
f.log10(distance_score_expr) / f.log10(f.lit(genomic_window + 1)),
distance_score_expr,
)
.groupBy("studyLocusId", "geneId")
.agg(agg_expr.alias(feature_name))
.agg(agg_expr.alias("distance_score_agg"))
.withColumn(
feature_name,
f.log10(f.col("distance_score_agg")) / f.log10(f.lit(genomic_window + 1)),
)
.drop("distance_score_agg")
)


Expand Down Expand Up @@ -120,7 +125,6 @@ def common_neighbourhood_distance_feature_logic(
class DistanceTssMeanFeature(L2GFeature):
"""Average distance of all tagging variants to gene TSS."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceTssMean"

Expand All @@ -147,6 +151,11 @@ def compute(
feature_name=cls.feature_name,
distance_type=distance_type,
**feature_dependency,
).withColumn(
cls.feature_name,
f.when(f.col(cls.feature_name) < 0, f.lit(0.0)).otherwise(
f.col(cls.feature_name)
),
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
Expand All @@ -159,7 +168,6 @@ def compute(
class DistanceTssMeanNeighbourhoodFeature(L2GFeature):
"""Minimum mean distance to TSS for all genes in the vicinity of a studyLocus."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceTssMeanNeighbourhood"

Expand Down Expand Up @@ -198,7 +206,6 @@ def compute(
class DistanceSentinelTssFeature(L2GFeature):
"""Distance of the sentinel variant to gene TSS. This is not weighted by the causal probability."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceSentinelTss"

Expand Down Expand Up @@ -237,7 +244,6 @@ def compute(
class DistanceSentinelTssNeighbourhoodFeature(L2GFeature):
"""Distance between the sentinel variant and a gene TSS as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceSentinelTssNeighbourhood"

Expand Down Expand Up @@ -276,7 +282,6 @@ def compute(
class DistanceFootprintMeanFeature(L2GFeature):
"""Average distance of all tagging variants to the footprint of a gene."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceFootprintMean"

Expand All @@ -303,6 +308,11 @@ def compute(
feature_name=cls.feature_name,
distance_type=distance_type,
**feature_dependency,
).withColumn(
cls.feature_name,
f.when(f.col(cls.feature_name) < 0, f.lit(0.0)).otherwise(
f.col(cls.feature_name)
),
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
Expand All @@ -315,7 +325,6 @@ def compute(
class DistanceFootprintMeanNeighbourhoodFeature(L2GFeature):
"""Minimum mean distance to footprint for all genes in the vicinity of a studyLocus."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceFootprintMeanNeighbourhood"

Expand Down Expand Up @@ -354,7 +363,6 @@ def compute(
class DistanceSentinelFootprintFeature(L2GFeature):
"""Distance between the sentinel variant and the footprint of a gene."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceSentinelFootprint"

Expand Down Expand Up @@ -393,7 +401,6 @@ def compute(
class DistanceSentinelFootprintNeighbourhoodFeature(L2GFeature):
"""Distance between the sentinel variant and a gene footprint as a relation of the distnace with all the genes in the vicinity of a studyLocus. This is not weighted by the causal probability."""

fill_na_value = 500_000
feature_dependency_type = VariantIndex
feature_name = "distanceSentinelFootprintNeighbourhood"

Expand Down
2 changes: 1 addition & 1 deletion tests/gentropy/dataset/test_l2g_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ class TestCommonDistanceFeatureLogic:
(
"distanceTssMean",
[
{"studyLocusId": "1", "geneId": "gene1", "distanceTssMean": 0.08},
{"studyLocusId": "1", "geneId": "gene1", "distanceTssMean": 0.52},
{"studyLocusId": "1", "geneId": "gene2", "distanceTssMean": 0.63},
],
),
Expand Down

0 comments on commit fa38ca6

Please sign in to comment.