Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional_samples_to_drop option to run_pca_with_relateds #489

Merged
merged 5 commits into from
Oct 17, 2022
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 24 additions & 17 deletions gnomad/sample_qc/ancestry.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,52 +259,59 @@ def assign_population_pcs(
def run_pca_with_relateds(
qc_mt: hl.MatrixTable,
related_samples_to_drop: Optional[hl.Table],
additional_samples_to_drop: Optional[hl.Table],
n_pcs: int = 10,
klaricch marked this conversation as resolved.
Show resolved Hide resolved
autosomes_only: bool = True,
) -> Tuple[List[float], hl.Table, hl.Table]:
"""
Run PCA excluding the given related samples, and project those samples in the PC space to return scores for all samples.
Run PCA excluding the given related or additional samples, and project those samples in the PC space to return scores for all samples.

The `related_samples_to_drop` Table has to be keyed by the sample ID and all samples present in this
table will be excluded from the PCA.
The `related_samples_to_drop` and `additional_samples_to_drop` Tables have to be keyed by the sample ID and all samples present in these
tables will be excluded from the PCA.

The loadings Table returned also contains a `pca_af` annotation which is the allele frequency
used for PCA. This is useful to project other samples in the PC space.

:param qc_mt: Input QC MT
:param related_samples_to_drop: Optional table of related samples to drop
:param related_samples_to_drop: Optional table of related samples to drop when generating the PCs, these samples will be projected in the PC space
:param additional_samples_to_drop: Optional table of additional samples to drop when generating the PCs, these samples will be projected in the PC space
:param n_pcs: Number of PCs to compute
:param autosomes_only: Whether to run the analysis on autosomes only
:return: eigenvalues, scores and loadings
"""
unrelated_mt = qc_mt.persist()

klaricch marked this conversation as resolved.
Show resolved Hide resolved
if autosomes_only:
unrelated_mt = filter_to_autosomes(unrelated_mt)
qc_mt = filter_to_autosomes(qc_mt)

# 'pca_mt' is the MatrixTable to use for generating the PCs
# If samples to drop are provided in related_samples_to_drop or additional_samples_to_drop, 'project_pca_mt' will also be generated and will contain the samples to project in the PC space
klaricch marked this conversation as resolved.
Show resolved Hide resolved
pca_mt = qc_mt

if related_samples_to_drop:
unrelated_mt = qc_mt.filter_cols(
hl.is_missing(related_samples_to_drop[qc_mt.col_key])
pca_mt = pca_mt.filter_cols(
hl.is_missing(related_samples_to_drop[pca_mt.col_key])
)
if additional_samples_to_drop:
pca_mt = pca_mt.filter_cols(
hl.is_missing(additional_samples_to_drop[pca_mt.col_key])
)

pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
unrelated_mt.GT, k=n_pcs, compute_loadings=True
pca_mt.GT, k=n_pcs, compute_loadings=True
)
pca_af_ht = unrelated_mt.annotate_rows(
pca_af=hl.agg.mean(unrelated_mt.GT.n_alt_alleles()) / 2
pca_af_ht = pca_mt.annotate_rows(
pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2
).rows()
pca_loadings = pca_loadings.annotate(
pca_af=pca_af_ht[pca_loadings.key].pca_af
) # TODO: Evaluate if needed to write results at this point if relateds or not

if not related_samples_to_drop:
if not related_samples_to_drop and not additional_samples_to_drop:
return pca_evals, pca_scores, pca_loadings
else:
pca_loadings = pca_loadings.persist()
pca_scores = pca_scores.persist()
related_mt = qc_mt.filter_cols(
hl.is_defined(related_samples_to_drop[qc_mt.col_key])
)
related_scores = pc_project(related_mt, pca_loadings)
pca_scores = pca_scores.union(related_scores)
project_pca_mt = qc_mt.filter_cols(hl.is_missing(pca_mt.cols()[qc_mt.col_key]))
projected_scores = pc_project(project_pca_mt, pca_loadings)
pca_scores = pca_scores.union(projected_scores)
return pca_evals, pca_scores, pca_loadings