Merge pull request #17 from ajmaurais/pdc
Add PDC as an input file source
mriffle authored Sep 11, 2024
2 parents 9faf68b + ef83492 commit f33acdf
Showing 7 changed files with 223 additions and 27 deletions.
1 change: 1 addition & 0 deletions container_images.config
@@ -4,6 +4,7 @@ params {
diann: 'quay.io/protio/diann:1.8.1',
bibliospec: 'quay.io/protio/bibliospec-linux:3.0',
panorama_client: 'quay.io/protio/panorama-client:1.1.0',
pdc_client: 'quay.io/mauraisa/pdc_client:0.15',
encyclopedia: 'quay.io/protio/encyclopedia:2.12.30-2',
encyclopedia3_mriffle: 'quay.io/protio/encyclopedia:3.0.0-MRIFFLE',
qc_pipeline: 'quay.io/mauraisa/dia_qc_report:2.2.4',
12 changes: 12 additions & 0 deletions docs/source/workflow_parameters.rst
@@ -77,6 +77,18 @@ The ``params`` Section
* -
- ``search_engine``
- Must be set to either ``'encyclopedia'`` or ``'diann'``. If set to ``'diann'``, ``chromatogram_library_spectra_dir``, ``chromatogram_library_spectra_glob``, and EncyclopeDIA-specific parameters will be ignored. Default: ``'encyclopedia'``.
* -
- ``pdc.study_id``
- When this option is set, raw files and metadata will be downloaded from the PDC (Proteomic Data Commons). Default: ``null``.
* -
- ``pdc.gene_level_data``
- A ``tsv`` file mapping gene names to NCBI gene IDs and gene metadata. Required for PDC gene reports. Default: ``null``.
* -
- ``pdc.n_raw_files``
- If this option is set, only ``n`` raw files are downloaded. This is useful for testing but should otherwise be ``null``. Default: ``null``.
* -
- ``pdc.client_args``
- Additional command line arguments passed to ``PDC_client``. Default: ``null``. An example configuration using these PDC options is sketched below.
* -
- ``skyline.skip``
- If set to ``true``, will skip the creation of a Skyline document. Default: ``false``.
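As a rough illustration of the PDC options documented above, a PDC-driven run could be configured with a ``params`` block along these lines. This is a sketch only: the study accession and file path are hypothetical placeholders, not values taken from this commit.

params {
    pdc.study_id        = 'PDC000000'                      // hypothetical PDC study accession
    pdc.n_raw_files     = 2                                 // optional: limit downloads while testing
    pdc.gene_level_data = '/path/to/gene_level_data.tsv'    // only needed for PDC gene reports
    pdc.client_args     = null                              // extra arguments forwarded to PDC_client
}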
76 changes: 49 additions & 27 deletions main.nf
@@ -16,13 +16,15 @@ include { generate_dia_qc_report } from "./workflows/generate_qc_report"
include { panorama_upload_results } from "./workflows/panorama_upload"
include { panorama_upload_mzmls } from "./workflows/panorama_upload"
include { save_run_details } from "./workflows/save_run_details"
include { get_pdc_files } from "./workflows/get_pdc_files"

// modules
include { ENCYCLOPEDIA_BLIB_TO_DLIB } from "./modules/encyclopedia"
include { ENCYCLOPEDIA_DLIB_TO_TSV } from "./modules/encyclopedia"
include { BLIB_BUILD_LIBRARY } from "./modules/diann"
include { GET_AWS_USER_ID } from "./modules/aws"
include { BUILD_AWS_SECRETS } from "./modules/aws"
include { EXPORT_GENE_REPORTS } from "./modules/qc_report"

// useful functions and variables
include { param_to_list } from "./workflows/get_input_files"
@@ -96,21 +98,34 @@ workflow {
aws_secret_id = Channel.of('none').collect() // ensure this is a value channel
}

// only perform msconvert and terminate
if(params.msconvert_only) {
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls
// get mzML files
if(params.pdc.study_id) {
get_pdc_files()
wide_mzml_ch = get_pdc_files.out.wide_mzml_ch
pdc_study_name = get_pdc_files.out.study_name
} else {
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id)
wide_mzml_ch = get_wide_mzmls.out.mzml_ch
}
narrow_mzml_ch = null
if(params.chromatogram_library_spectra_dir != null) {
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)

narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
} else {
all_mzml_ch = wide_mzml_ch
}

if(params.chromatogram_library_spectra_dir != null) {
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)

narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
} else {
all_mzml_ch = wide_mzml_ch
}
// only perform msconvert and terminate
if(params.msconvert_only) {
// save details about this run
input_files = all_mzml_ch.map{ it -> ['Spectra File', it.baseName] }
version_files = Channel.empty()
save_run_details(input_files.collect(), version_files.collect())
run_details_file = save_run_details.out.run_details

// save details about this run
input_files = all_mzml_ch.map{ it -> ['Spectra File', it.baseName] }
@@ -120,7 +135,6 @@

// if requested, upload mzMLs to panorama
if(params.panorama.upload) {

panorama_upload_mzmls(
params.panorama.upload_url,
all_mzml_ch,
@@ -134,19 +148,23 @@
}

get_input_files(aws_secret_id) // get input files
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls

// set up some convenience variables

if(params.spectral_library) {
spectral_library = get_input_files.out.spectral_library
} else {
spectral_library = Channel.empty()
}

if(params.pdc.study_id) {
if(params.replicate_metadata) {
log.warn "params.replicate_metadata will be overritten by PDC metadata"
}
replicate_metadata = get_pdc_files.out.annotations_csv
} else {
replicate_metadata = get_input_files.out.replicate_metadata
}
fasta = get_input_files.out.fasta
skyline_template_zipfile = get_input_files.out.skyline_template_zipfile
wide_mzml_ch = get_wide_mzmls.out.mzml_ch
skyr_file_ch = get_input_files.out.skyr_files

final_elib = null
@@ -174,13 +192,6 @@

// create elib if requested
if(params.chromatogram_library_spectra_dir != null) {
// get narrow windows mzmls
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)
narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch

all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)

// create chromatogram library
encyclopeda_export_elib(
@@ -326,16 +337,26 @@
// annotate skyline document if replicate_metadata was specified
if(params.replicate_metadata != null) {
skyline_annotate_doc(skyline_import.out.skyline_results,
get_input_files.out.replicate_metadata)
replicate_metadata)
final_skyline_file = skyline_annotate_doc.out.skyline_results
} else {
final_skyline_file = skyline_import.out.skyline_results
}

// generate QC report
if(!params.qc_report.skip) {
generate_dia_qc_report(final_skyline_file, get_input_files.out.replicate_metadata)
generate_dia_qc_report(final_skyline_file, replicate_metadata)
dia_qc_version = generate_dia_qc_report.out.dia_qc_version

// Export PDC gene tables
if(params.pdc.gene_level_data != null) {
EXPORT_GENE_REPORTS(generate_dia_qc_report.out.qc_report_db,
params.pdc.gene_level_data,
pdc_study_name)
EXPORT_GENE_REPORTS.out.gene_reports | flatten | set{ gene_reports }
} else {
gene_reports = Channel.empty()
}
} else {
dia_qc_version = Channel.empty()
}
@@ -360,6 +381,7 @@
qc_report_files = Channel.empty()
proteowizard_version = Channel.empty()
dia_qc_version = Channel.empty()
gene_reports = Channel.empty()
}

version_files = encyclopedia_version.concat(diann_version,
75 changes: 75 additions & 0 deletions modules/pdc.nf
@@ -0,0 +1,75 @@

def format_client_args(var) {
ret = (var == null ? "" : var)
return ret
}

process GET_STUDY_METADATA {
publishDir "${params.result_dir}/pdc", failOnError: true, mode: 'copy'
errorStrategy 'retry'
maxRetries 5
label 'process_low_constant'
container params.images.pdc_client

input:
val pdc_study_id

output:
path('study_metadata.tsv'), emit: metadata
path('study_metadata_annotations.csv'), emit: skyline_annotations
env(study_id), emit: study_id
env(study_name), emit: study_name
path('pdc_client_version.txt'), emit: version

shell:
n_files_arg = params.pdc.n_raw_files == null ? "" : "--nFiles ${params.pdc.n_raw_files}"
pdc_client_args = params.pdc.client_args == null ? "" : params.pdc.client_args

'''
study_id=$(PDC_client studyID !{pdc_client_args} !{pdc_study_id} | tee study_id.txt)
study_name=$(PDC_client studyName --normalize !{pdc_client_args} ${study_id} | tee study_name.txt)
PDC_client metadata !{pdc_client_args} -f tsv !{n_files_arg} --skylineAnnotations ${study_id}
echo "pdc_client_git_repo='$GIT_REPO - $GIT_BRANCH [$GIT_SHORT_HASH]'" > pdc_client_version.txt
'''
}

process METADATA_TO_SKY_ANNOTATIONS {
label 'process_low_constant'
container params.images.pdc_client

input:
path pdc_study_metadata

output:
path('skyline_annotations.csv'), emit: skyline_annotations

shell:
'''
PDC_client metadataToSky !{pdc_study_metadata}
'''
}

process GET_FILE {
storeDir "${params.panorama_cache_directory}"
label 'process_low_constant'
container params.images.pdc_client
errorStrategy 'retry'
maxRetries 1

input:
tuple val(url), val(file_name), val(md5)

output:
path(file_name), emit: downloaded_file

shell:
'''
PDC_client file -o '!{file_name}' -m '!{md5}' '!{url}'
'''

stub:
"""
touch ${file_name}
"""
}
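GET_STUDY_METADATA returns the study ID and study name through Nextflow's env output qualifier: a variable assigned in the shell block is captured after the task finishes and emitted on the matching output channel. A minimal, self-contained sketch of that pattern follows; it is not part of this commit, and the echoed value is a hypothetical placeholder.

// Minimal sketch of the env-output pattern used by GET_STUDY_METADATA above.
// 'PDC000000' is a made-up placeholder, not a real study ID.
process ENV_OUTPUT_DEMO {
    output:
    env(study_id), emit: study_id

    shell:
    '''
    study_id=$(echo 'PDC000000')
    '''
}

workflow {
    ENV_OUTPUT_DEMO()
    ENV_OUTPUT_DEMO.out.study_id.view()   // prints PDC000000
}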
28 changes: 28 additions & 0 deletions modules/qc_report.nf
@@ -151,3 +151,31 @@ process RENDER_QC_REPORT {
"""
}

process EXPORT_GENE_REPORTS {
publishDir "${params.result_dir}/gene_reports", failOnError: true, mode: 'copy'
label 'process_high_memory'
container params.images.qc_pipeline

input:
path batch_db
path gene_level_data
val file_prefix

output:
path("*.tsv"), emit: gene_reports
path("*.stdout"), emit: stdout
path("*.stderr"), emit: stderr

script:
"""
dia_qc export_gene_matrix --prefix=${file_prefix} --useAliquotId \
'${gene_level_data}' '${batch_db}' \
> >(tee "export_reports.stdout") 2> >(tee "export_reports.stderr" >&2)
"""

stub:
"""
touch stub.tsv
touch stub.stdout stub.stderr
"""
}
7 changes: 7 additions & 0 deletions nextflow.config
@@ -23,6 +23,13 @@ params {
skip_skyline = null
skyline_skyr_file = null

// Optional PDC study settings
pdc.client_args = ''
pdc.study_id = null
pdc.n_raw_files = null
pdc.metadata_tsv = null
pdc.gene_level_data = null

// The final skyline document will be named using this name. For example,
// if skyline_custom_name = 'human_dia' then the final Skyline document
// will be named "human_dia.sky.zip". When importing into PanoramaWeb--this
51 changes: 51 additions & 0 deletions workflows/get_pdc_files.nf
@@ -0,0 +1,51 @@

include { GET_STUDY_METADATA } from "../modules/pdc.nf"
include { METADATA_TO_SKY_ANNOTATIONS } from "../modules/pdc.nf"
include { GET_FILE } from "../modules/pdc.nf"
include { MSCONVERT } from "../modules/msconvert.nf"

workflow get_pdc_study_metadata {
emit:
study_name
metadata
annotations_csv

main:
if(params.pdc.metadata_tsv == null) {
GET_STUDY_METADATA(params.pdc.study_id)
metadata = GET_STUDY_METADATA.out.metadata
annotations_csv = GET_STUDY_METADATA.out.skyline_annotations
study_name = GET_STUDY_METADATA.out.study_name
} else {
metadata = Channel.fromPath(file(params.pdc.metadata_tsv, checkIfExists: true))
METADATA_TO_SKY_ANNOTATIONS(metadata)
annotations_csv = METADATA_TO_SKY_ANNOTATIONS.out
study_name = params.pdc.study_name
}
}

workflow get_pdc_files {
emit:
study_name
metadata
annotations_csv
wide_mzml_ch

main:
get_pdc_study_metadata()
metadata = get_pdc_study_metadata.out.metadata
annotations_csv = get_pdc_study_metadata.out.annotations_csv
study_name = get_pdc_study_metadata.out.study_name

metadata \
| splitCsv(header:true, sep:'\t') \
| map{row -> tuple(row.url, row.file_name, row.md5sum)} \
| GET_FILE

MSCONVERT(GET_FILE.out.downloaded_file,
params.msconvert.do_demultiplex,
params.msconvert.do_simasspectra)

wide_mzml_ch = MSCONVERT.out.mzml_file
}
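The splitCsv/map stage above implies that the study metadata TSV provides at least url, file_name, and md5sum columns; any other columns are ignored at this point. The self-contained sketch below is illustrative only (the URL, file name, and checksum are made up) and shows the shape of the tuples that would be handed to GET_FILE.

// Standalone illustration of the splitCsv/map stage in get_pdc_files.
// The inline string stands in for the metadata TSV; all values are fake.
workflow {
    Channel.of(
            'url\tfile_name\tmd5sum\n' +
            'https://example.org/raw/sample_01.raw\tsample_01.raw\td41d8cd98f00b204e9800998ecf8427e'
        )
        .splitCsv(header: true, sep: '\t')
        .map { row -> tuple(row.url, row.file_name, row.md5sum) }
        .view()   // emits (url, file_name, md5) tuples, one per metadata row
}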
