From 3f382fccf29bc99a3080f57b9c22ecc0b8857e81 Mon Sep 17 00:00:00 2001 From: klaricch Date: Mon, 15 Aug 2022 15:45:13 -0400 Subject: [PATCH 1/6] add repartition option before ld prune --- gnomad/sample_qc/pipeline.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 2e5335c1e..914d12f21 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -122,6 +122,7 @@ def get_qc_mt( filter_exome_low_coverage_regions: bool = False, high_conf_regions: Optional[List[str]] = None, checkpoint_path: Optional[str] = None, + partitions: Optional[int] = None, ) -> hl.MatrixTable: """ Create a QC-ready MT. @@ -149,6 +150,7 @@ def get_qc_mt( :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD are kept) :param high_conf_regions: If given, the data will be filtered to only include variants in those regions :param checkpoint_path: If given, the QC MT will be checkpointed to the specified path before running LD pruning. If not specified, persist will be used instead. + :param partitions: If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. 'checkpoint_path' must also be specified as the MT will first be written to the checkpoint_path before being reread with with new number of partitions. :return: Filtered MT """ logger.info("Creating QC MatrixTable") @@ -157,6 +159,9 @@ def get_qc_mt( "The LD-prune step of this function requires non-preemptible workers only!" ) + if partitions and not checkpoint_path: + raise ValueError("checkpoint_path must be supplied if repartitioning!") + qc_mt = filter_low_conf_regions( mt, filter_lcr=filter_lcr, @@ -182,8 +187,13 @@ def get_qc_mt( if ld_r2 is not None: if checkpoint_path: - logger.info("Checkpointing the MT and LD pruning") - qc_mt = qc_mt.checkpoint(checkpoint_path, overwrite=True) + if partitions: + logger.info("Repartitioning MT and LD pruning") + qc_mt.write(checkpoint_path, overwrite=True) + qc_mt = hl.read_matrix_table(checkpoint_path, _n_partitions=partitions) + else: + logger.info("Checkpointing the MT and LD pruning") + qc_mt = qc_mt.checkpoint(checkpoint_path, overwrite=True) else: logger.info("Persisting the MT and LD pruning") qc_mt = qc_mt.persist() From e193f104f28f841fbf15bf2c09e86ca56170ee24 Mon Sep 17 00:00:00 2001 From: klaricch Date: Tue, 16 Aug 2022 12:26:07 -0400 Subject: [PATCH 2/6] Update gnomad/sample_qc/pipeline.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/sample_qc/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 914d12f21..36d611c4a 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -188,7 +188,7 @@ def get_qc_mt( if ld_r2 is not None: if checkpoint_path: if partitions: - logger.info("Repartitioning MT and LD pruning") + logger.info("Checkpointing and repartitioning the MT and LD pruning") qc_mt.write(checkpoint_path, overwrite=True) qc_mt = hl.read_matrix_table(checkpoint_path, _n_partitions=partitions) else: From 8a7ac953d08e5206a1b9e3a7f2b4f5484169982e Mon Sep 17 00:00:00 2001 From: klaricch Date: Tue, 16 Aug 2022 12:26:12 -0400 Subject: [PATCH 3/6] Update gnomad/sample_qc/pipeline.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/sample_qc/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 36d611c4a..430037813 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -150,7 +150,7 @@ def get_qc_mt( :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD are kept) :param high_conf_regions: If given, the data will be filtered to only include variants in those regions :param checkpoint_path: If given, the QC MT will be checkpointed to the specified path before running LD pruning. If not specified, persist will be used instead. - :param partitions: If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. 'checkpoint_path' must also be specified as the MT will first be written to the checkpoint_path before being reread with with new number of partitions. + :param partitions: If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. `checkpoint_path` must also be specified as the MT will first be written to the `checkpoint_path` before being reread with the new number of partitions. :return: Filtered MT """ logger.info("Creating QC MatrixTable") From 25be3c6758aff99b0fbc797daf6f3eefc29f1e75 Mon Sep 17 00:00:00 2001 From: klaricch Date: Tue, 16 Aug 2022 12:30:37 -0400 Subject: [PATCH 4/6] partitions to n_partitions --- gnomad/sample_qc/pipeline.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 430037813..c62e77e65 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -122,7 +122,7 @@ def get_qc_mt( filter_exome_low_coverage_regions: bool = False, high_conf_regions: Optional[List[str]] = None, checkpoint_path: Optional[str] = None, - partitions: Optional[int] = None, + n_partitions: Optional[int] = None, ) -> hl.MatrixTable: """ Create a QC-ready MT. @@ -136,22 +136,22 @@ def get_qc_mt( In addition, the MT will be LD-pruned if `ld_r2` is set. - :param mt: Input MT + :param mt: Input MT. :param adj_only: If set, only ADJ genotypes are kept. This filter is applied before the call rate and AF calculation. :param min_af: Minimum allele frequency to keep. Not applied if set to ``None``. :param min_callrate: Minimum call rate to keep. Not applied if set to ``None``. :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``. :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``. - :param apply_hard_filters: Whether to apply standard GAKT default site hard filters: QD >= 2, FS <= 60 and MQ >= 30 - :param ld_r2: Minimum r2 to keep when LD-pruning (set to `None` for no LD pruning) - :param filter_lcr: Filter LCR regions - :param filter_decoy: Filter decoy regions - :param filter_segdup: Filter segmental duplication regions - :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD are kept) - :param high_conf_regions: If given, the data will be filtered to only include variants in those regions + :param apply_hard_filters: Whether to apply standard GAKT default site hard filters: QD >= 2, FS <= 60 and MQ >= 30. + :param ld_r2: Minimum r2 to keep when LD-pruning (set to `None` for no LD pruning). + :param filter_lcr: Filter LCR regions. + :param filter_decoy: Filter decoy regions. + :param filter_segdup: Filter segmental duplication regions. + :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD are kept). + :param high_conf_regions: If given, the data will be filtered to only include variants in those regions. :param checkpoint_path: If given, the QC MT will be checkpointed to the specified path before running LD pruning. If not specified, persist will be used instead. - :param partitions: If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. `checkpoint_path` must also be specified as the MT will first be written to the `checkpoint_path` before being reread with the new number of partitions. - :return: Filtered MT + :param n_partitions: If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. `checkpoint_path` must also be specified as the MT will first be written to the `checkpoint_path` before being reread with the new number of partitions. + :return: Filtered MT. """ logger.info("Creating QC MatrixTable") if ld_r2 is not None: @@ -159,7 +159,7 @@ def get_qc_mt( "The LD-prune step of this function requires non-preemptible workers only!" ) - if partitions and not checkpoint_path: + if n_partitions and not checkpoint_path: raise ValueError("checkpoint_path must be supplied if repartitioning!") qc_mt = filter_low_conf_regions( @@ -187,10 +187,10 @@ def get_qc_mt( if ld_r2 is not None: if checkpoint_path: - if partitions: + if n_partitions: logger.info("Checkpointing and repartitioning the MT and LD pruning") qc_mt.write(checkpoint_path, overwrite=True) - qc_mt = hl.read_matrix_table(checkpoint_path, _n_partitions=partitions) + qc_mt = hl.read_matrix_table(checkpoint_path, _n_partitions=n_partitions) else: logger.info("Checkpointing the MT and LD pruning") qc_mt = qc_mt.checkpoint(checkpoint_path, overwrite=True) From 665a49634a6b654798740f04a5276c7d8717666e Mon Sep 17 00:00:00 2001 From: klaricch Date: Tue, 16 Aug 2022 13:21:37 -0400 Subject: [PATCH 5/6] run black --- gnomad/sample_qc/pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index c62e77e65..42bb2c94c 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -190,7 +190,9 @@ def get_qc_mt( if n_partitions: logger.info("Checkpointing and repartitioning the MT and LD pruning") qc_mt.write(checkpoint_path, overwrite=True) - qc_mt = hl.read_matrix_table(checkpoint_path, _n_partitions=n_partitions) + qc_mt = hl.read_matrix_table( + checkpoint_path, _n_partitions=n_partitions + ) else: logger.info("Checkpointing the MT and LD pruning") qc_mt = qc_mt.checkpoint(checkpoint_path, overwrite=True) From ddbab1ac4c886fa5dea89f003ed8e6478b740c71 Mon Sep 17 00:00:00 2001 From: klaricch Date: Tue, 16 Aug 2022 13:26:26 -0400 Subject: [PATCH 6/6] small edit --- gnomad/sample_qc/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index d96512134..2904724b1 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -139,7 +139,7 @@ def get_qc_mt( In addition, the MT will be LD-pruned if `ld_r2` is set. - :param mt: Input MT + :param mt: Input MT. :param bi_allelic_only: Whether to only keep bi-allelic sites or include multi-allelic sites too. :param snv_only: Whether to only keep SNVs or include other variant types. :param adj_only: If set, only ADJ genotypes are kept. This filter is applied before the call rate and AF calculation.