Skip to content

Commit

Permalink
Merge pull request #4 from thehyve/susie
Browse files Browse the repository at this point in the history
Update manifest script to filter susie prefixes
(required by thehyve/otg-data-loading#201)
  • Loading branch information
ricardo-lourenco authored Mar 6, 2024
2 parents 6936bed + e203083 commit 6e01ad9
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
25 changes: 22 additions & 3 deletions 2_generate_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
import pandas as pd
import yaml
import numpy as np
import argparse


def main():
args = parse_args()
# Load config
with open('/coloc/configs/config.yaml') as config_input:
config = yaml.load(config_input, Loader=yaml.FullLoader)
Expand All @@ -34,6 +36,8 @@ def main():
if config['custom_studies']:
custom_studies = pd.read_parquet(config['custom_studies'], columns=['study_id']).study_id.unique()

# SuSiE prefixes regex pattern -> (x|y|z|...)
susie_pattern = '(' + '|'.join(args.susie_prefixes.split(";")) + ')'

# Out path patterns
data_out = '/output'
Expand Down Expand Up @@ -89,9 +93,9 @@ def construct_left_right_hive_partition_dirs(rec):
out_record['{}_sumstats'.format(side)] = public_sumstats.format(type=study_type, study_id=study_id)


# If FinnGen, then don't specify LD, as we won't do conditioning
# If SuSiE, then don't specify LD, as we won't do conditioning
ld_path = ukb_ld_path.format(chrom=in_record['{}_lead_chrom'.format(side)])
if re.match('FINNGEN', study_id):
if re.match(rf'{susie_pattern}', study_id):
ld_path = None
out_record['{}_ld'.format(side)] = ld_path

Expand Down Expand Up @@ -131,7 +135,22 @@ def construct_left_right_hive_partition_dirs(rec):

return 0

def parse_args(argv=None):
    ''' Parse the command line arguments of the manifest script.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads the arguments from sys.argv[1:], so the
            existing no-argument call keeps its behaviour.

    Returns:
        argparse.Namespace with one attribute, `susie_prefixes`: the raw
        semicolon-separated string of SuSiE prefixes (e.g. "x;y;z"). The
        string is returned as given; it is not split or validated here.

    Raises:
        SystemExit: raised by argparse when `--susie_prefixes` is missing or
            an unknown argument is supplied.

    NOTE(review): the flag may be given without a value, which yields ''.
    The caller appears to build a regex alternation from this string; an
    empty value would give an empty group that matches every study_id.
    Confirm that this is the intended meaning of "no prefixes".
    '''
    parser = argparse.ArgumentParser()

    # The flag itself is mandatory, but nargs='?' combined with const=''
    # allows it to appear with no value, in which case '' is stored.
    parser.add_argument(
        '--susie_prefixes',
        help=('List of semicolon-separated SuSiE prefixes (i.e. "x;y;z")'),
        metavar='<str>',
        type=str,
        const='',
        nargs='?',
        required=True,
    )

    # argv=None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)

if __name__ == '__main__':

main()
5 changes: 3 additions & 2 deletions run_coloc_pipeline_opt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
set -euo pipefail

NCORES=$1
# export PYSPARK_SUBMIT_ARGS=$2
SUSIE=$2
# export PYSPARK_SUBMIT_ARGS=$3
#export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell"

echo "Running on $NCORES cores"
Expand Down Expand Up @@ -33,7 +34,7 @@ time /bin/bash 1_find_overlaps.sh # 10 min last run
# output:
# - /configs/manifest_unfiltered.json.gz
echo [$(date +"%Y-%m-%d %H:%M:%S")] Running script: 2_generate_manifest.py
time python 2_generate_manifest.py # ~24 min last run
time python 2_generate_manifest.py --susie_prefixes $SUSIE # ~24 min last run

#cp /configs/manifest_unfiltered.json.gz /configs/manifest_unfiltered.all.json.gz

Expand Down

0 comments on commit 6e01ad9

Please sign in to comment.