Add script for downloading local reference files and images #230

Merged
merged 21 commits on Nov 1, 2022
293 changes: 293 additions & 0 deletions get_refs.py
@@ -0,0 +1,293 @@
#!/usr/bin/env python3

# Download reference files for the scpca-nf nextflow workflow to enable running
# the workflow without internet access by compute nodes. Optionally pulls
# container images for singularity or docker.
#
# Example usage:
# python3 get_refs.py --singularity
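# Other example invocations (the release tag shown is hypothetical; the flags
# correspond to the argparse options defined below):
# python3 get_refs.py --revision v0.4.0 --star_index
# python3 get_refs.py --docker --overwrite_refs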


import argparse
import os
import re
import shutil
import subprocess
import sys
import urllib.request

from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("--refdir", type=str,
default="scpca-references",
help = "destination directory for downloaded reference files")
parser.add_argument("--paramfile", type=str,
default="localref_params.yaml",
help = "path to nextflow param file to write (default: `localref_params.yaml`)")
parser.add_argument("--overwrite_refs",
action = "store_true",
help = "replace previously downloaded files")
parser.add_argument("--revision", type=str,
Member: I might suggest this be named --version, --workflow_version, --release, etc. or similar. Up to you.

Member Author: I am using revision because that is the term used by nextflow. I don't love it, but I wanted to be consistent with the terminology used there.

Member: Agreed for consistency! But it's a disappointing fact to learn as I start my nextflow learning adventure.

default="main",
Member: Wondering if there is a reason to set this default to development instead, since that matches the current functionality. But I am disagreeing with myself even as I write this, because that will get messy once this is eventually merged into main; until then, one can just provide the revision via the command line.

Member Author: Yes... in theory it should work with main once merged, so this should only be a problem for us and the initial testers, who will get a bit more hand-holding.

metavar = "vX.X.X",
help = "tag for a specific workflow version (defaults to latest revision)")
parser.add_argument("--star_index",
action = "store_true",
help = "get STAR index (required for genetic demultiplexing)")
parser.add_argument("--cellranger_index",
action = "store_true",
help = "get Cell Ranger index (required for spatial data)")
parser.add_argument("--docker",
action = "store_true",
help = "pull and cache images for docker")
parser.add_argument("--singularity",
action = "store_true",
help = "pull and cache images for singularity")
parser.add_argument("--singularity_cache", type=str,
metavar = "CACHE_DIR",
help = "cache directory for singularity")
args = parser.parse_args()

# scpca-nf resource urls
reffile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/reference_paths.config"
containerfile_url = f"https://raw.githubusercontent.com/AlexsLemonade/scpca-nf/{args.revision}/config/containers.config"

# download reference file
print("Getting list of required reference files")
try:
    ref_file = urllib.request.urlopen(reffile_url)
except urllib.error.URLError as e:
    print(e.reason)
    print(f"The file download failed for {reffile_url}; please check the URL for errors")
    print(f"Is `{args.revision}` a valid release tag?")
    exit(1)

# parse reference file
# gets all of the `param` variables that are set in `reference_paths.config`
# and stores them in a dict
refs = {}
ref_re = re.compile(r'(?P<id>.+?)\s*=\s*([\'"])(?P<loc>.+)\2')
for line in ref_file:
    match = ref_re.search(line.decode())
    if match:
        refs[match.group('id')] = match.group('loc')
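# As a sketch, a config line like the following (value hypothetical) would be parsed
# into refs["ref_rootdir"] = "s3://example-bucket/references":
#   ref_rootdir = 's3://example-bucket/references'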

# regular expressions for parameter expansion
root_re = re.compile(r'\$\{?(params.)?ref_rootdir\}?$')
refdir_re = re.compile(r'\$\{?(params.)?ref_dir\}?$')
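# e.g., root_re should match strings like "$ref_rootdir", "${ref_rootdir}",
# "$params.ref_rootdir", or "${params.ref_rootdir}" when used with .match()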

# get assembly and root location
assembly = refs.get("assembly", "NA")
# split out protocol from the root URI
root_parts = refs.get("ref_rootdir").split('://', maxsplit = 1)
if root_parts[0] == 's3':
Member: Can you add a comment here that you are grabbing the bucket name? I was a bit confused about what elements 0 and 1 were.

Member Author: Oh, I realized this isn't quite correct (or as flexible as it needs to be). I will add comments as I update.

    # if S3, convert bucket path to https:// url
    bucket_path = root_parts[1].split("/", maxsplit = 1)
    url_root = f"https://{bucket_path[0]}.s3.amazonaws.com"
    if len(bucket_path) > 1:
        url_root += f"/{bucket_path[1]}"
elif root_parts[0] in ['http', 'https', 'ftp']:
    # otherwise, just get the location
    url_root = refs.get("ref_rootdir")
else:
    print("`ref_rootdir` is not a supported remote location.")
    exit(1)
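# Worked example of the URI handling above, using a hypothetical root URI:
#   "s3://example-bucket/references".split("://", maxsplit = 1)
#       -> ["s3", "example-bucket/references"]   (protocol, remainder)
#   "example-bucket/references".split("/", maxsplit = 1)
#       -> ["example-bucket", "references"]      (bucket name, key prefix)
#   url_root -> "https://example-bucket.s3.amazonaws.com/references"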



# set the base directory (usually corresponding to a genome version)
genome_dir = Path(refs.get("ref_dir"))
# remove the first element if it is a variable
if root_re.match(genome_dir.parts[0]):
    genome_dir = genome_dir.relative_to(genome_dir.parts[0])

# single-file references
# the keys here are the param variables we will be downloading
ref_keys = [
"ref_fasta",
"ref_fasta_index",
"ref_gtf",
"mito_file",
"t2g_3col_path",
"t2g_bulk_path"
]
ref_paths = [Path(refs.get(k)) for k in ref_keys]
# replace initial part of path if it is `$params.ref_dir` or similar
ref_paths = [genome_dir / p.relative_to(p.parts[0])
             if refdir_re.match(p.parts[0]) else p
             for p in ref_paths]
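# For example, a config value like "${params.ref_dir}/fasta/genome.fa" (path hypothetical)
# would be rewritten to <genome_dir>/fasta/genome.fa by the substitution above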

# salmon index files within index dir (must be downloaded individually through http)
salmon_index_files = [
"complete_ref_lens.bin",
"ctable.bin",
"ctg_offsets.bin",
"duplicate_clusters.tsv",
"info.json",
"mphf.bin",
"pos.bin",
"pre_indexing.log",
"rank.bin",
"ref_indexing.log",
"refAccumLengths.bin",
"reflengths.bin",
"refseq.bin",
"seq.bin",
"versionInfo.json"
]
# param variables that are salmon index directories
salmon_keys = [
"splici_index",
"bulk_index"
]
for k in salmon_keys:
    sa_dir = Path(refs.get(k))
    if refdir_re.match(sa_dir.parts[0]):
        sa_dir = genome_dir / sa_dir.relative_to(sa_dir.parts[0])
    ref_paths += [sa_dir / f for f in salmon_index_files]

# star index files within index dir (must be downloaded individually through http)
star_index_files = [
"chrLength.txt",
"chrName.txt",
"chrNameLength.txt",
"chrStart.txt",
"exonGeTrInfo.tab",
"exonInfo.tab",
"geneInfo.tab",
"Genome",
"genomeParameters.txt",
"Log.out",
"SA",
"SAindex",
"sjdbInfo.txt",
"sjdbList.fromGTF.out.tab",
"sjdbList.out.tab",
"transcriptInfo.tab"
]
star_dir = Path(refs.get("star_index"))
if refdir_re.match(star_dir.parts[0]):
    star_dir = genome_dir / star_dir.relative_to(star_dir.parts[0])
if args.star_index:
    ref_paths += [star_dir / f for f in star_index_files]

# Cell Ranger index files within index dir (must be downloaded individually through http)
cr_index_files = [
"reference.json",
"fasta/genome.fa",
"fasta/genome.fa.fai",
"genes/genes.gtf.gz",
"star/chrLength.txt",
"star/chrName.txt",
"star/chrNameLength.txt",
"star/chrStart.txt",
"star/exonGeTrInfo.tab",
"star/exonInfo.tab",
"star/geneInfo.tab",
"star/Genome",
"star/genomeParameters.txt",
"star/SA",
"star/SAindex",
"star/sjdbInfo.txt",
"star/sjdbList.fromGTF.out.tab",
"star/sjdbList.out.tab",
"star/transcriptInfo.tab"
]
cr_dir = Path(refs.get("cellranger_index"))
if refdir_re.match(cr_dir.parts[0]):
    cr_dir = genome_dir / cr_dir.relative_to(cr_dir.parts[0])
if args.cellranger_index:
    ref_paths += [cr_dir / f for f in cr_index_files]

# barcode files on S3 within the barcode_dir (must be downloaded individually through http)
barcode_files = [
"3M-february-2018.txt",
"737K-august-2016.txt",
"cellranger_mit_license.txt",
"visium-v1.txt",
"visium-v2.txt"
]
barcode_dir = Path(refs.get("barcode_dir"))
if root_re.match(barcode_dir.parts[0]):
    barcode_dir = barcode_dir.relative_to(barcode_dir.parts[0])
ref_paths += [barcode_dir / f for f in barcode_files]

## download all the files and put them in the correct locations ##
print("Downloading reference files... (This might take a while)")
for path in ref_paths:
    outfile = args.refdir / path
    if outfile.exists() and not args.overwrite_refs:
        continue
    print(f"Getting {path}")
    # make parents
    outfile.parent.mkdir(exist_ok=True, parents = True)
    # download and write
    file_url = f"{url_root}/{path}"
    try:
        urllib.request.urlretrieve(file_url, outfile)
    except urllib.error.URLError as e:
        print(e.reason)
        print(f"The file download failed for {file_url}; please check the URL for errors",
              file = sys.stderr)
        exit(1)
print("Done with reference file downloads\n"
      f"Reference files can be found at '{args.refdir}'\n")

# write param file if requested
if args.paramfile:
    pfile = Path(args.paramfile)
    # check if paramfile exists & move old if needed
    if pfile.exists():
        print(f"A file already exists at `{pfile}`; renaming previous file to `{pfile.name}.bak`")
        shutil.move(pfile, str(pfile) + ".bak")
    # create parameter dictionary
    nf_params = {
        'assembly': assembly,
        'ref_rootdir': os.path.abspath(args.refdir)
    }
    with open(pfile, 'w') as f:
        f.write("# local nextflow reference file parameters, generated by `get_refs.py`\n\n")
        for key, value in nf_params.items():
            f.write(f"{key}: {value}\n")
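# The resulting param file would then contain something like (values hypothetical):
#   assembly: Homo_sapiens.GRCh38.104
#   ref_rootdir: /absolute/path/to/scpca-references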

## Get docker containers from workflow
if args.singularity or args.docker:
    print("Getting list of required containers")
    containers = {}
    try:
        container_file = urllib.request.urlopen(containerfile_url)
    except urllib.error.URLError as e:
        print(e.reason)
        print(f"The file download failed for {containerfile_url}; please check the URL for errors")
        print(f"Is `{args.revision}` a valid release tag?")
        exit(1)

    # pattern match to find container id & location
    container_re = re.compile(r'(?P<id>.+_CONTAINER)\s*=\s*([\'"])(?P<loc>.+)\2')
    for line in container_file:
        match = container_re.search(line.decode())
        if match:
            containers[match.group('id')] = match.group('loc')
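    # A matching line in containers.config might look like this (image path hypothetical),
    # yielding containers["SCPCATOOLS_CONTAINER"] = "ghcr.io/alexslemonade/scpca-tools:v0.2.0":
    #   SCPCATOOLS_CONTAINER = 'ghcr.io/alexslemonade/scpca-tools:v0.2.0'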

    # pull docker images if requested
    if args.docker:
        print("Pulling docker images...")
        for loc in containers.values():
            subprocess.run(["docker", "pull", loc])
        print("Done pulling docker images\n")

    # pull singularity images if requested (to optionally specified cache location)
    if args.singularity:
        print("Pulling singularity images...")
        if args.singularity_cache:
            os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(args.singularity_cache)
        for loc in containers.values():
            subprocess.run(
                ["singularity", "pull", "--force", f"docker://{loc}"],
                env = os.environ
            )
        print("Done pulling singularity images")
        if args.singularity_cache:
            print(f"Singularity images located at {os.environ['SINGULARITY_CACHEDIR']}")
        print()
1 change: 1 addition & 0 deletions modules/spaceranger.nf
@@ -16,6 +16,7 @@ process spaceranger{
script:
out_id = file(meta.spaceranger_results_dir).name
meta.cellranger_index = index.fileName
meta_json = Utils.makeJson(meta)
"""
spaceranger count \
--id=${out_id} \