Merge pull request #728 from AlexsLemonade/development
Sync changes from `development` with `main` for v0.8.0
allyhawkins authored Mar 13, 2024
2 parents b185050 + 29efc4c commit 246c3d7
Showing 66 changed files with 1,949 additions and 426 deletions.
15 changes: 10 additions & 5 deletions .github/ISSUE_TEMPLATE/release-checklist.md
@@ -1,7 +1,7 @@
---
name: Release checklist
about: Prepare for a new release version of scpca-nf
title: Prepare for scpca-nf release vX.X.X
title: Prepare for scpca-nf release `vX.X.X`
labels: release
assignees: ''

@@ -13,15 +13,20 @@ assignees: ''

- [ ] Are all of the issues planned for this release resolved? If there are any issues that are unresolved, mark this issue as blocked by those on ZenHub.
- [ ] Update code and documentation with the latest version number in the `development` branch:
- [ ] [nextflow.config](https://github.com/AlexsLemonade/scpca-nf/blob/main/nextflow.config)
- [ ] [internal-instructions.md](https://github.com/AlexsLemonade/scpca-nf/blob/main/internal-instructions.md)
- [ ] [external-instructions.md](https://github.com/AlexsLemonade/scpca-nf/blob/main/external-instructions.md)
- [ ] [`nextflow.config`](https://github.com/AlexsLemonade/scpca-nf/blob/main/nextflow.config)
- [ ] [`internal-instructions.md`](https://github.com/AlexsLemonade/scpca-nf/blob/main/internal-instructions.md)
- [ ] [`external-instructions.md`](https://github.com/AlexsLemonade/scpca-nf/blob/main/external-instructions.md)
- [ ] Test that the workflow is in good working order with `nextflow run alexslemonade/scpca-nf -latest -r development`
- [ ] File a PR from the `development` branch to the `main` branch. This should include all of the changes that will be associated with the next release.
- [ ] (Optional) Generate new example `scpca-nf` output files.
If updating the example output is not necessary for this release, check these boxes off for free.
- [ ] [Re-process the example data](https://github.com/AlexsLemonade/scpca-nf/blob/main/internal-instructions.md#processing-example-data) through the `scpca-nf` workflow and ensure it looks correct.
- [ ] Compress the example output in `scpca_out` to create `scpca_out.zip`, as described in the instructions and sketched below, and ensure the file is set to public and read-only.
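
  A minimal sketch of that compression step (illustrative only; the canonical steps and any additional checks are in the internal instructions):

  ```sh
  # Create scpca_out.zip from the scpca_out directory of example output
  zip -r scpca_out.zip scpca_out
  ```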


### Creating a release
- [ ] On the [releases page](https://github.com/AlexsLemonade/scpca-nf/releases), choose `Draft a new release`.
- [ ] In `Choose a tag`, type a new release number using semantic versioning (vX.X.X) (you did update the title of this issue to match, right?), then click `Create a new tag: vX.X.X on publish`.
- [ ] In `Choose a tag`, type a new release number using semantic versioning (`vX.X.X`) (you did update the title of this issue to match, right?), then click `Create a new tag: vX.X.X on publish`.
- [ ] Write a description of the major changes in this release. You may want to start with the auto-generated release notes to save time.
- [ ] Optional: If not all issues have been addressed, save a draft to return to later.
- [ ] Publish the release!
10 changes: 5 additions & 5 deletions .github/workflows/nextflow-config-check.yaml
@@ -5,13 +5,13 @@ on:
branches:
- main
- development

jobs:
nf-config-check:
runs-on: ubuntu-22.04
container: nfcore/tools:2.13.1
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Check nextflow params
uses: docker://nextflow/nextflow:21.10.6
with:
args: nextflow config
run: nextflow config
- name: Check nextflow_schema file
run: nf-core schema lint nextflow_schema.json
7 changes: 6 additions & 1 deletion .github/workflows/nextflow-stub-check.yaml
@@ -28,9 +28,14 @@ jobs:
with:
args: nextflow -log celltype-ref-run.log run build-celltype-ref.nf -stub -profile stub -ansi-log false

- name: Check Nextflow workflow for merging objects
uses: docker://nextflow/nextflow:21.10.6
with:
args: nextflow -log merge-run.log run merge.nf -stub -profile stub -ansi-log false --project STUBP01

- name: Join log files
if: ${{ !cancelled() }}
run: cat stub-run.log checkpoint-run.log celltype-ref-run.log > nextflow-runs.log
run: cat stub-run.log checkpoint-run.log celltype-ref-run.log merge-run.log > nextflow-runs.log

- name: Upload nextflow log
if: ${{ !cancelled() }}
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ scpca-references/
# ignore template htmls
main_qc_report.html
celltypes_supplemental_report.html
merged-report.html
*_qc.html

# ignore hidden `DS_Store`
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -16,6 +16,8 @@ repos:
entry: Cannot commit .Rhistory, .RData, or .Rds files.
language: fail
files: '(?i)\.(Rhistory|RData|rds)$'
# `exclude` files here are _allowed_ data files.
# Here they are specified by a full path from the repository root in a multiline regex.
exclude: |
(?x)^(
test/references/celltype/singler_models/singler_model_file.rds|
@@ -30,7 +32,7 @@
args: [--update-only, --title=**Table of Contents**]
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff for linting and formatting python
rev: v0.2.1
rev: v0.3.2
hooks:
# Run the linter.
- id: ruff
19 changes: 19 additions & 0 deletions CONTRIBUTING.md
@@ -20,6 +20,25 @@ To allow for efficient review, please include in any pull request a concise and

When the changes in `development` merit a new release, a pull request will be filed to merge the current version of the `development` branch into `main`, followed by tagging a release on the `main` branch.

### Updating `nextflow_schema.json`

Any changes that affect the Nextflow configuration files should be reflected in the [`nextflow_schema.json` file](https://nf-co.re/tools#pipeline-schema).
This file can most easily be updated using the [`nf-core/tools` package](https://nf-co.re/tools), which can be installed with `conda install nf-core`.
Then run `nf-core schema build` in the `scpca-nf` directory to update the schema file to match the current config file.
You can also use the web editor that it launches to further customize the schema file.
Note that you may get warnings about any config parameters that include `${projectDir}`, as the build tool wants those to be replaced with absolute paths.
This is not necessary for the schema to be valid, so please keep those paths with the `${projectDir}` variable (enter `n` at the prompt).
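
As a rough sketch, the full schema-update sequence might look like the following (the working directory is an assumption, and exact prompts depend on your `nf-core/tools` version):

```sh
# One-time setup: install nf-core/tools (the bioconda channel may need to be configured)
conda install nf-core

# From the root of your scpca-nf clone, rebuild the schema against the current config
cd scpca-nf
nf-core schema build
# Answer `n` when asked to replace parameters containing ${projectDir} with absolute paths;
# the web editor it offers can be used to further customize parameter descriptions.
```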
### Continuous integration in pull requests

There are several automatic checks performed by GitHub Actions in all pull requests filed to `main` or `development`:

- [Check Nextflow config](.github/workflows/nextflow-config-check.yaml): This workflow ensures that there are no syntax errors in the Nextflow configuration files and that the `nextflow_schema.json` file is up to date. This check is required to pass before pull requests can be merged.
- [Check Nextflow stub](.github/workflows/nextflow-stub-check.yaml): This workflow ensures that the [stub workflow](#stub-workflows) runs without errors. This check is required to pass before pull requests can be merged.
- [Spell check R Markdown and Markdown files](.github/workflows/spell-check.yml): This workflow ensures there are no spelling errors in R Markdown and Markdown files. This check is not required to pass before pull requests can be merged.

There is also one additional `pre-commit ci` workflow which runs all [pre-commit hooks as described in this section](#pre-commit-hooks), except for the spell check pre-commit hook.
Although highly recommended, it is not required that this workflow passes before pull requests can be merged.

## Stub workflows

All Nextflow processes should include a [`stub` block](https://www.nextflow.io/docs/latest/process.html#stub) with a minimal script that can be run quickly to produce files in the expected output locations.
2 changes: 1 addition & 1 deletion README.md
@@ -7,7 +7,7 @@ Nextflow will also handle parallelizing sample processing as allowed by your env
The workflow processes fastq files from single-cell and single-nuclei RNA-seq samples using [alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) to create gene by cell matrices.
The workflow outputs gene expression data in two formats: as [`SingleCellExperiment` objects](https://www.bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) and as [`AnnData` objects](https://anndata.readthedocs.io/en/latest/).
Reads from samples are aligned using selective alignment, to an index with transcripts corresponding to spliced cDNA and to intronic regions, denoted by alevin-fry as `splici`.
These matrices are filtered and additional processing is performed to calculate quality control statistics, create reduced-dimension transformations, and create output reports.
These matrices are filtered and additional processing is performed to calculate quality control statistics, create reduced-dimension transformations, assign cell types using both [`SingleR`](https://bioconductor.org/packages/release/bioc/html/SingleR.html) and [`CellAssign`](https://docs.scvi-tools.org/en/stable/user_guide/models/cellassign.html), and create output reports.
`scpca-nf` can also process libraries with ADT tags (e.g., CITE-seq), multiplexed libraries (e.g., cell hashing), bulk RNA-seq, and spatial transcriptomics samples.

For more information on the contents of the output files and the processing of all modalities, please see the [ScPCA Portal docs](https://scpca.readthedocs.io/en/latest/).
2 changes: 1 addition & 1 deletion bin/add_celltypes_to_sce.R
@@ -247,4 +247,4 @@ if (!is.null(opt$cellassign_predictions)) {
}

# export annotated object with cellassign assignments
readr::write_rds(sce, opt$output_sce_file, compress = "gz")
readr::write_rds(sce, opt$output_sce_file, compress = "bz2")
2 changes: 1 addition & 1 deletion bin/add_demux_sce.R
@@ -125,4 +125,4 @@ if (length(cellhash_ids) > 1) {
}

# write filtered sce to output
readr::write_rds(sce, opt$output_sce_file, compress = "gz")
readr::write_rds(sce, opt$output_sce_file, compress = "bz2")
2 changes: 1 addition & 1 deletion bin/add_submitter_annotations.R
@@ -107,4 +107,4 @@ colData(sce) <- DataFrame(
metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "submitter")

# Write SCE back to file
readr::write_rds(sce, opt$sce_file, compress = "gz")
readr::write_rds(sce, opt$sce_file, compress = "bz2")
2 changes: 1 addition & 1 deletion bin/classify_SingleR.R
@@ -106,5 +106,5 @@ metadata(singler_results)$cell_ontology_df <- singler_model$cell_ontology_df
readr::write_rds(
singler_results,
opt$output_singler_results_file,
compress = "gz"
compress = "bz2"
)
2 changes: 1 addition & 1 deletion bin/cluster_sce.R
@@ -101,4 +101,4 @@ if (!opt$pca_name %in% reducedDimNames(sce)) {
}

# export -------------------
readr::write_rds(sce, opt$output_sce_file, compress = "gz")
readr::write_rds(sce, opt$output_sce_file, compress = "bz2")
2 changes: 1 addition & 1 deletion bin/filter_sce.R
@@ -202,4 +202,4 @@ if (!is.null(ambient_profile)) {


# write filtered sce to output
readr::write_rds(filtered_sce, opt$filtered_file, compress = "gz")
readr::write_rds(filtered_sce, opt$filtered_file, compress = "bz2")
17 changes: 16 additions & 1 deletion bin/generate_unfiltered_sce.R
@@ -152,6 +152,11 @@ if (opt$feature_dir != "") {
unfiltered_sce <- merge_altexp(unfiltered_sce, feature_sce, opt$feature_name)
# add alt experiment features stats
altExp(unfiltered_sce, opt$feature_name) <- scuttle::addPerFeatureQCMetrics(altExp(unfiltered_sce, opt$feature_name))

# if CITE, add `adt_id` column to rowData with rownames
if (opt$feature_name == "adt") {
rowData(altExp(unfiltered_sce, "adt"))$adt_id <- rownames(rowData(altExp(unfiltered_sce, "adt")))
}
}


@@ -179,13 +184,23 @@ unfiltered_sce <- unfiltered_sce |>
# `add_sample_metadata` will filter sample_metadata_df to the relevant sample ids
add_sample_metadata(metadata_df = sample_metadata_df)

# if columns with sample type info aren't provided, set to NA
if (!("is_xenograft" %in% colnames(sample_metadata_df))) {
sample_metadata_df$is_xenograft <- NA
}
if (!("is_cell_line" %in% colnames(sample_metadata_df))) {
sample_metadata_df$is_cell_line <- NA
}

# add explicit metadata field for the sample type
sample_type <- sample_metadata_df |>
dplyr::filter(sample_id %in% sample_ids) |>
dplyr::mutate(
sample_type = dplyr::case_when(
is_xenograft ~ "patient-derived xenograft",
is_cell_line ~ "cell line",
# if neither column was provided, note that
is.na(is_xenograft) && is.na(is_cell_line) ~ "Not provided",
.default = "patient tissue"
)
) |>
@@ -200,4 +215,4 @@ if (length(sample_type) == 1) {
metadata(unfiltered_sce)$sample_type <- sample_type

# write to rds
readr::write_rds(unfiltered_sce, opt$unfiltered_file, compress = "gz")
readr::write_rds(unfiltered_sce, opt$unfiltered_file, compress = "bz2")