Skip to content

Commit

Permalink
Merge pull request #779 from AlexsLemonade/development
Browse files Browse the repository at this point in the history
Sync changes from `development` into `main` for v0.8.3
  • Loading branch information
allyhawkins authored Aug 2, 2024
2 parents f215046 + 58b084d commit 3d42560
Show file tree
Hide file tree
Showing 9 changed files with 180 additions and 75 deletions.
55 changes: 0 additions & 55 deletions bin/move_counts_anndata.py

This file was deleted.

4 changes: 2 additions & 2 deletions bin/predict_cellassign.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
scvi.settings.num_threads = args.threads

# compile extension regex
file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)
file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.anndata_file):
Expand All @@ -78,7 +78,7 @@
raise FileExistsError("--reference file not found.")

# make sure output file path is tsv file
if not args.output_predictions.endswith(".tsv"):
if not args.output_predictions.casefold().endswith(".tsv"):
raise ValueError("--output_predictions must provide a file path ending in tsv")

# read in references as marker gene tables
Expand Down
118 changes: 118 additions & 0 deletions bin/reformat_anndata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
This script takes an AnnData object and checks for the `logcounts` in layers.
If present, `logcounts` is moved to `X` and `X` (which has the raw counts) is moved to `raw.X`
In addition, any DataFrames in `obsm` are conerted to ndarrays, highly variable genes are converted to a `var` column.
If a `pca_meta_file` is provided, PCA variance statistics and standard creation values are in the format expected by scanpy.
"""

import argparse
import os
import re

import anndata
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument(
"-i",
"--anndata_file",
dest="anndata_file",
required=True,
help="Path to HDF5 file with processed AnnData object",
)
parser.add_argument(
"--pca_meta_file",
dest="pca_meta_file",
required=False,
help="Path to file with a table of variance explained by each PCA component",
)
parser.add_argument(
"--pca_not_centered",
dest="pca_centered",
action="store_false",
help="Indicate that the PCA table is not zero-centered",
)
parser.add_argument(
"--hvg_name",
dest="hvg_name",
default="highly_variable_genes",
help=(
"Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object."
"Use the value 'none' if no highly variable genes were used."
),
)
parser.add_argument(
"-u",
"--uncompressed",
dest="compress",
action="store_false",
help="Output an uncompressed HDF5 file",
)

args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.anndata_file):
raise FileExistsError("`input_anndata` does not exist.")
elif not file_ext.search(args.anndata_file):
raise ValueError(
"input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData adata."
)

# if there is a pca_meta_file, check that it exists and has a tsv extension
if args.pca_meta_file:
if not os.path.exists(args.pca_meta_file):
raise FileExistsError("`pca_meta_file` does not exist.")
elif not args.pca_meta_file.casefold().endswith(".tsv"):
raise ValueError("`pca_meta_file` must end in .tsv.")

# read in anndata
adata = anndata.read_h5ad(args.anndata_file)

# if logcounts is present
if "logcounts" in adata.layers:
# move X to raw.X by creating the raw adata
adata.raw = adata
# move logcounts to X and rename
adata.X = adata.layers["logcounts"]
adata.uns["X_name"] = "logcounts"
del adata.layers["logcounts"]

# convert DataFrames in obsm to ndarrays
for key, value in adata.obsm.items():
if isinstance(value, pd.DataFrame):
adata.obsm[key] = value.to_numpy()

# convert highly variable genes to a column if given
use_hvg = args.hvg_name.casefold() != "none"
if use_hvg:
if args.hvg_name not in adata.uns.keys():
raise ValueError("`hvg_name` must be present in the `uns` data for the object")
adata.var["highly_variable"] = adata.var.gene_ids.isin(adata.uns[args.hvg_name])


# add pca adata to uns if pca_meta_file is provided in the format created by scanpy
if args.pca_meta_file:
pca_meta = pd.read_csv(args.pca_meta_file, sep="\t", index_col=0)
if "variance" not in pca_meta.columns or "variance_ratio" not in pca_meta.columns:
raise ValueError(
"`pca_meta_file` must contain columns `variance` and `variance_ratio`"
)
pca_object = {
"param": {
"zero_center": args.pca_centered,
"use_highly_variable": use_hvg,
"mask_var": ("highly_variable" if use_hvg else None),
},
"variance": pca_meta["variance"].to_numpy(),
"variance_ratio": pca_meta["variance_ratio"].to_numpy(),
}
adata.uns["pca"] = pca_object

# export adata
adata.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None)
30 changes: 27 additions & 3 deletions bin/sce_to_anndata.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ option_list <- list(
type = "character",
help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5"
),
make_option(
opt_str = c("--output_pca_tsv"),
default = NULL,
type = "character",
help = "path to output a table of variance explained by each principal component. Must end in .tsv"
),
make_option(
opt_str = c("--feature_name"),
type = "character",
Expand Down Expand Up @@ -62,12 +68,17 @@ if (!(stringr::str_ends(opt$output_rna_h5, ".hdf5|.h5|.h5ad"))) {
stop("output rna file name must end in .hdf5, .h5, or .h5ad")
}

# check that the pca file is a tsv
if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv")) {
stop("output pca file name must end in .tsv")
}

# Merged object function ------------------------------------------------------

# this function updates merged object formatting for anndata export
format_merged_sce <- function(sce) {
# paste X to any present reduced dim names
reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")
return(sce)
}

Expand Down Expand Up @@ -120,8 +131,8 @@ format_czi <- function(sce) {
# so everything gets set to false
rowData(sce)$feature_is_filtered <- FALSE

# paste X to any present reduced dim names
reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
# paste X to any present reduced dim names, converting to lower case
reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")

return(sce)
}
Expand Down Expand Up @@ -161,6 +172,18 @@ scpcaTools::sce_to_anndata(
compression = ifelse(opt$compress_output, "gzip", "none")
)

# Get PCA metadata for AnnData
if (!is.null(opt$output_pca_tsv) && "X_pca" %in% reducedDimNames(sce)) {
pca_meta_df <- data.frame(
PC = 1:ncol(reducedDims(sce)$X_pca),
variance = attr(reducedDims(sce)$X_pca, "varExplained"),
variance_ratio = attr(reducedDims(sce)$X_pca, "percentVar") / 100
)

# write pca to tsv
readr::write_tsv(pca_meta_df, opt$output_pca_tsv)
}

message("Exported RNA")

# AltExp to AnnData -----------------------------------------------------------
Expand Down Expand Up @@ -209,5 +232,6 @@ if (!is.null(opt$feature_name)) {
")
)
}
message("Exported alt")
}
}
6 changes: 3 additions & 3 deletions external-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,12 @@ Using the above command will run the workflow from the `main` branch of the work
To update to the latest released version you can run `nextflow pull AlexsLemonade/scpca-nf` before the `nextflow run` command.

To be sure that you are using a consistent version, you can specify use of a release tagged version of the workflow, set below with the `-r` flag.
The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.2` version.
The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.3` version.
Released versions can be found on the [`scpca-nf` repository releases page](https://github.com/AlexsLemonade/scpca-nf/releases).

```sh
nextflow run AlexsLemonade/scpca-nf \
-r v0.8.2 \
-r v0.8.3 \
-config <path to config file> \
-profile <name of profile>
```
Expand Down Expand Up @@ -325,7 +325,7 @@ If you will be analyzing spatial expression data, you will also need the Cell Ra

If your compute nodes do not have internet access, you will likely have to pre-pull the required container images as well.
When doing this, it is important to be sure that you also specify the revision (version tag) of the `scpca-nf` workflow that you are using.
For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.2`, then you will want to set `-r v0.8.2` for `get_refs.py` as well to be sure you have the correct containers.
For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.3`, then you will want to set `-r v0.8.3` for `get_refs.py` as well to be sure you have the correct containers.
By default, `get_refs.py` will download files and images associated with the latest release.

If your system uses Docker, you can add the `--docker` flag:
Expand Down
2 changes: 1 addition & 1 deletion internal-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ Please refer to our [`CONTRIBUTING.md`](CONTRIBUTING.md#stub-workflows) for more
When running the workflow for a project or group of samples that is ready to be released on ScPCA portal, please use the tag for the latest release:

```
nextflow run AlexsLemonade/scpca-nf -r v0.8.2 -profile ccdl,batch --project SCPCP000000
nextflow run AlexsLemonade/scpca-nf -r v0.8.3 -profile ccdl,batch --project SCPCP000000
```

### Processing example data
Expand Down
28 changes: 22 additions & 6 deletions merge.nf
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@ process export_anndata {
${has_adt ? "--feature_name adt" : ''}
# move normalized counts to X in AnnData
move_counts_anndata.py --anndata_file ${rna_h5ad_file}
${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
reformat_anndata.py --anndata_file ${rna_h5ad_file}
${has_adt ? "reformat_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
"""
stub:
rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"
Expand Down Expand Up @@ -144,11 +144,11 @@ workflow {
|| (it.scpca_sample_id in run_ids)
}
.map{[
seq_unit: it.seq_unit,
technology: it.technology,
project_id: it.scpca_project_id,
library_id: it.scpca_library_id,
sample_id: it.scpca_sample_id.split(";").sort().join(",")
sample_id: it.scpca_sample_id.split(";").sort().join(","),
seq_unit: it.seq_unit,
technology: it.technology
]}

// get all projects that contain at least one library with CITEseq
Expand All @@ -162,13 +162,23 @@ workflow {
.collect{it.project_id}
.map{it.unique()}

oversized_projects = libraries_ch
.map{[
it.project_id, // pull out project id for grouping
it
]}
.groupTuple(by: 0) // group by project id
.filter{it[1].size() > params.max_merge_libraries} // get projects with more samples than max merge
.collect{it[0]} // get project id

filtered_libraries_ch = libraries_ch
// only include single-cell/single-nuclei which ensures we don't try to merge libraries from spatial or bulk data
.filter{it.seq_unit in ['cell', 'nucleus']}
// remove any multiplexed projects
// remove any multiplexed projects or oversized projects
// future todo: only filter library ids that are multiplexed, but keep all other non-multiplexed libraries
.branch{
multiplexed: it.project_id in multiplex_projects.getVal()
oversized: it.project_id in oversized_projects.getVal()
single_sample: true
}

Expand All @@ -178,6 +188,12 @@ workflow {
log.warn("Not merging ${it.project_id} because it contains multiplexed libraries.")
}

filtered_libraries_ch.oversized
.unique{ it.project_id }
.subscribe{
log.warn("Not merging ${it.project_id} because it contains too many libraries.")
}

// print out warning message for any libraries not included in merging
filtered_libraries_ch.single_sample
.map{[
Expand Down
8 changes: 5 additions & 3 deletions modules/export-anndata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,23 @@ process export_anndata {
script:
rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
pca_meta_file = "${meta.library_id}_${file_type}_pca.tsv"
feature_present = meta.feature_type in ["adt"]
"""
sce_to_anndata.R \
--input_sce_file ${sce_file} \
--output_rna_h5 ${rna_h5ad_file} \
--output_feature_h5 ${feature_h5ad_file} \
--output_pca_tsv ${pca_meta_file} \
${feature_present ? "--feature_name ${meta.feature_type}" : ''} \
${file_type != "processed" ? "--compress_output" : ''}
# move any normalized counts to X in AnnData
# move any normalized counts to X in AnnData, convert matrices, and add PCA metadata
if [ "${file_type}" = "processed" ]; then
move_counts_anndata.py --anndata_file ${rna_h5ad_file}
reformat_anndata.py --anndata_file ${rna_h5ad_file} --pca_meta_file ${pca_meta_file}
# move counts in feature data, if the file exists
if [ -f "${feature_h5ad_file}" ]; then
move_counts_anndata.py --anndata_file ${feature_h5ad_file}
reformat_anndata.py --anndata_file ${feature_h5ad_file} --hvg_name "none"
fi
fi
Expand Down
Loading

0 comments on commit 3d42560

Please sign in to comment.