diff --git a/container_images.config b/container_images.config index ab3b68b..762ed4a 100644 --- a/container_images.config +++ b/container_images.config @@ -4,6 +4,7 @@ params { diann: 'quay.io/protio/diann:1.8.1', bibliospec: 'quay.io/protio/bibliospec-linux:3.0', panorama_client: 'quay.io/protio/panorama-client:1.1.0', + pdc_client: 'quay.io/mauraisa/pdc_client:0.15', encyclopedia: 'quay.io/protio/encyclopedia:2.12.30-2', encyclopedia3_mriffle: 'quay.io/protio/encyclopedia:3.0.0-MRIFFLE', qc_pipeline: 'quay.io/mauraisa/dia_qc_report:2.2.4', diff --git a/docs/source/workflow_parameters.rst b/docs/source/workflow_parameters.rst index 49c5e84..c88bee9 100644 --- a/docs/source/workflow_parameters.rst +++ b/docs/source/workflow_parameters.rst @@ -77,6 +77,18 @@ The ``params`` Section * - - ``search_engine`` - Must be set to either ``'encyclopedia'`` or ``'diann'``. If set to ``'diann'``, ``chromatogram_library_spectra_dir``, ``chromatogram_library_spectra_glob``, and EncyclopeDIA-specific parameters will be ignored. Default: ``'encyclopedia'``. + * - + - ``pdc.study_id`` + - When this option is set, raw files and metadata will be downloaded from the PDC. Default: ``null``. + * - + - ``pdc.gene_level_data`` + - A ``tsv`` file mapping gene names to NCBI gene IDs and gene metadata. Required for PDC gene reports. Default: ``null``. + * - + - ``pdc.n_raw_files`` + - If this option is set, only ``n`` raw files are downloaded. This is useful for testing but otherwise should be left as ``null``. Default: ``null``. + * - + - ``pdc.client_args`` + - Additional command line arguments passed to ``PDC_client``. Default: ``''`` (empty string). * - - ``skyline.skip`` - If set to ``true``, will skip the creation of a Skyline document. Default: ``false``. 
diff --git a/main.nf b/main.nf
index 7164e81..3a64b09 100644
--- a/main.nf
+++ b/main.nf
@@ -16,6 +16,7 @@ include { generate_dia_qc_report } from "./workflows/generate_qc_report"
 include { panorama_upload_results } from "./workflows/panorama_upload"
 include { panorama_upload_mzmls } from "./workflows/panorama_upload"
 include { save_run_details } from "./workflows/save_run_details"
+include { get_pdc_files } from "./workflows/get_pdc_files"
 
 // modules
 include { ENCYCLOPEDIA_BLIB_TO_DLIB } from "./modules/encyclopedia"
@@ -23,6 +24,7 @@ include { ENCYCLOPEDIA_DLIB_TO_TSV } from "./modules/encyclopedia"
 include { BLIB_BUILD_LIBRARY } from "./modules/diann"
 include { GET_AWS_USER_ID } from "./modules/aws"
 include { BUILD_AWS_SECRETS } from "./modules/aws"
+include { EXPORT_GENE_REPORTS } from "./modules/qc_report"
 
 // useful functions and variables
 include { param_to_list } from "./workflows/get_input_files"
@@ -96,21 +98,30 @@ workflow {
         aws_secret_id = Channel.of('none').collect() // ensure this is a value channel
     }
 
-    // only perform msconvert and terminate
-    if(params.msconvert_only) {
-        get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls
+    // get mzML files: from a PDC study when pdc.study_id is set, otherwise from quant_spectra_dir
+    if(params.pdc.study_id) {
+        get_pdc_files()
+        wide_mzml_ch = get_pdc_files.out.wide_mzml_ch
+        pdc_study_name = get_pdc_files.out.study_name
+    } else{
+        get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id)
         wide_mzml_ch = get_wide_mzmls.out.mzml_ch
+    }
+    // narrow-window mzMLs are optional; narrow_mzml_ch stays null without a chromatogram library dir
+    narrow_mzml_ch = null
+    if(params.chromatogram_library_spectra_dir != null) {
+        get_narrow_mzmls(params.chromatogram_library_spectra_dir,
+                         params.chromatogram_library_spectra_glob,
+                         aws_secret_id)
+
+        narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
+        all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
+    } else {
+        all_mzml_ch = wide_mzml_ch
+    }
 
-        if(params.chromatogram_library_spectra_dir != null) {
-            get_narrow_mzmls(params.chromatogram_library_spectra_dir,
-                             params.chromatogram_library_spectra_glob,
-                             aws_secret_id)
-
-            narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
-            all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
-        } else {
-            all_mzml_ch = wide_mzml_ch
-        }
+    // only perform msconvert and terminate
+    if(params.msconvert_only) {
 
         // save details about this run
         input_files = all_mzml_ch.map{ it -> ['Spectra File', it.baseName] }
@@ -120,7 +135,6 @@ workflow {
 
         // if requested, upload mzMLs to panorama
         if(params.panorama.upload) {
-
             panorama_upload_mzmls(
                 params.panorama.upload_url,
                 all_mzml_ch,
@@ -134,19 +148,24 @@ workflow {
     }
 
     get_input_files(aws_secret_id) // get input files
-    get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls
 
     // set up some convenience variables
-
     if(params.spectral_library) {
         spectral_library = get_input_files.out.spectral_library
     } else {
         spectral_library = Channel.empty()
     }
-
+    // a PDC study supplies its own annotations csv, which replaces params.replicate_metadata
+    if(params.pdc.study_id) {
+        if(params.replicate_metadata) {
+            log.warn "params.replicate_metadata will be overwritten by PDC metadata"
+        }
+        replicate_metadata = get_pdc_files.out.annotations_csv
+    } else {
+        replicate_metadata = get_input_files.out.replicate_metadata
+    }
     fasta = get_input_files.out.fasta
     skyline_template_zipfile = get_input_files.out.skyline_template_zipfile
-    wide_mzml_ch = get_wide_mzmls.out.mzml_ch
     skyr_file_ch = get_input_files.out.skyr_files
 
     final_elib = null
@@ -174,13 +192,6 @@ workflow {
 
         // create elib if requested
         if(params.chromatogram_library_spectra_dir != null) {
-            // get narrow windows mzmls
-            get_narrow_mzmls(params.chromatogram_library_spectra_dir,
-                             params.chromatogram_library_spectra_glob,
-                             aws_secret_id)
-            narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
-
-            all_mzml_ch =
wide_mzml_ch.concat(narrow_mzml_ch)
 
             // create chromatogram library
             encyclopeda_export_elib(
@@ -326,7 +337,7 @@ workflow {
         // annotate skyline document if replicate_metadata was specified
-        if(params.replicate_metadata != null) {
+        if(params.replicate_metadata != null || params.pdc.study_id) {
             skyline_annotate_doc(skyline_import.out.skyline_results,
-                                 get_input_files.out.replicate_metadata)
+                                 replicate_metadata)
             final_skyline_file = skyline_annotate_doc.out.skyline_results
         } else {
             final_skyline_file = skyline_import.out.skyline_results
@@ -334,8 +345,20 @@ workflow {
 
         // generate QC report
         if(!params.qc_report.skip) {
-            generate_dia_qc_report(final_skyline_file, get_input_files.out.replicate_metadata)
+            generate_dia_qc_report(final_skyline_file, replicate_metadata)
             dia_qc_version = generate_dia_qc_report.out.dia_qc_version
+
+            // Export PDC gene tables. pdc_study_name is only assigned when
+            // params.pdc.study_id is set, so both parameters are required.
+            if(params.pdc.study_id && params.pdc.gene_level_data != null) {
+                EXPORT_GENE_REPORTS(generate_dia_qc_report.out.qc_report_db,
+                                    file(params.pdc.gene_level_data, checkIfExists: true),
+                                    pdc_study_name)
+                EXPORT_GENE_REPORTS.out.gene_reports | flatten | set{ gene_reports }
+            } else {
+                gene_reports = Channel.empty()
+            }
         } else {
             dia_qc_version = Channel.empty()
+            gene_reports = Channel.empty()
         }
@@ -360,6 +381,7 @@ workflow {
         qc_report_files = Channel.empty()
         proteowizard_version = Channel.empty()
         dia_qc_version = Channel.empty()
+        gene_reports = Channel.empty()
     }
 
     version_files = encyclopedia_version.concat(diann_version,
diff --git a/modules/pdc.nf b/modules/pdc.nf
new file mode 100644
index 0000000..508a019
--- /dev/null
+++ b/modules/pdc.nf
@@ -0,0 +1,83 @@
+
+// Return `var` unchanged, or an empty string when `var` is null, so that an optional
+// argument string can always be interpolated into a command line.
+def format_client_args(var) {
+    return var == null ? "" : var
+}
+
+// Look up a PDC study: saves its metadata table and Skyline annotations csv and
+// exports the study id and normalized study name as environment outputs.
+process GET_STUDY_METADATA {
+    publishDir "${params.result_dir}/pdc", failOnError: true, mode: 'copy'
+    errorStrategy 'retry'
+    maxRetries 5
+    label 'process_low_constant'
+    container params.images.pdc_client
+
+    input:
+        val pdc_study_id
+
+    output:
+        path('study_metadata.tsv'), emit: metadata
+        path('study_metadata_annotations.csv'), emit: skyline_annotations
+        env(study_id), emit: study_id
+        env(study_name), emit: study_name
+        path('pdc_client_version.txt'), emit: version
+
+    shell:
+    // NOTE(review): these assignments intentionally have no `def`; `!{...}` placeholders
+    // presumably resolve against the task context only -- confirm before adding `def`.
+    n_files_arg = params.pdc.n_raw_files == null ? "" : "--nFiles ${params.pdc.n_raw_files}"
+    pdc_client_args = format_client_args(params.pdc.client_args)
+
+    '''
+    study_id=$(PDC_client studyID !{pdc_client_args} !{pdc_study_id} | tee study_id.txt)
+    study_name=$(PDC_client studyName --normalize !{pdc_client_args} ${study_id} | tee study_name.txt)
+    PDC_client metadata !{pdc_client_args} -f tsv !{n_files_arg} --skylineAnnotations ${study_id}
+
+    echo "pdc_client_git_repo='$GIT_REPO - $GIT_BRANCH [$GIT_SHORT_HASH]'" > pdc_client_version.txt
+    '''
+}
+
+// Run `PDC_client metadataToSky` on an existing study metadata file; emits skyline_annotations.csv.
+process METADATA_TO_SKY_ANNOTATIONS {
+    label 'process_low_constant'
+    container params.images.pdc_client
+
+    input:
+        path pdc_study_metadata
+
+    output:
+        path('skyline_annotations.csv'), emit: skyline_annotations
+
+    shell:
+    '''
+    PDC_client metadataToSky !{pdc_study_metadata}
+    '''
+}
+
+// Download one file with `PDC_client file`; the md5 from the input tuple is passed via -m.
+// Results are kept in params.panorama_cache_directory through the storeDir directive.
+process GET_FILE {
+    storeDir "${params.panorama_cache_directory}"
+    label 'process_low_constant'
+    container params.images.pdc_client
+    errorStrategy 'retry'
+    maxRetries 1
+
+    input:
+        tuple val(url), val(file_name), val(md5)
+
+    output:
+        path(file_name), emit: downloaded_file
+
+    shell:
+    '''
+    PDC_client file -o '!{file_name}' -m '!{md5}' '!{url}'
+    '''
+
+    stub:
+    """
+    touch '${file_name}'
+    """
+}
diff --git a/modules/qc_report.nf b/modules/qc_report.nf
index 493e2ff..92845bc 100644
--- a/modules/qc_report.nf
+++ b/modules/qc_report.nf
@@ -151,3 +151,31 @@ process RENDER_QC_REPORT {
     """
 }
 
+// Export gene-level report tables from the QC report database via `dia_qc export_gene_matrix`.
+process EXPORT_GENE_REPORTS {
+    publishDir "${params.result_dir}/gene_reports", failOnError: true, mode: 'copy'
+    label 'process_high_memory'
+    container params.images.qc_pipeline
+
+    input:
+        path batch_db           // database file, passed as the last positional argument
+        path gene_level_data    // gene table, passed as the first positional argument
+        val file_prefix         // value given to --prefix (quoted like the other arguments)
+
+    output:
+        path("*.tsv"), emit: gene_reports
+        path("*.stdout"), emit: stdout
+        path("*.stderr"), emit: stderr
+
+    script:
+    """
+    dia_qc export_gene_matrix --prefix='${file_prefix}' --useAliquotId \
+        '${gene_level_data}' '${batch_db}' \
+        > >(tee "export_reports.stdout") 2> >(tee "export_reports.stderr" >&2)
+    """
+
+    stub:
+    """
+    touch stub.tsv stub.stdout stub.stderr
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index 6fe5fa9..2f80c0d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -23,6 +23,14 @@ params {
     skip_skyline = null
     skyline_skyr_file = null
 
+    // Optional PDC study settings
+    pdc.client_args = ''
+    pdc.study_id = null
+    pdc.study_name = null       // read by workflows/get_pdc_files.nf when pdc.metadata_tsv is set
+    pdc.n_raw_files = null
+    pdc.metadata_tsv = null
+    pdc.gene_level_data = null
+
     // The final skyline document will be named using this name. For example,
     // if skyline_custom_name = 'human_dia' then the final Skyline document
     // will be named "human_dia.sky.zip".
When importing into PanoramaWeb--this
diff --git a/workflows/get_pdc_files.nf b/workflows/get_pdc_files.nf
new file mode 100644
index 0000000..d039204
--- /dev/null
+++ b/workflows/get_pdc_files.nf
@@ -0,0 +1,57 @@
+
+include { GET_STUDY_METADATA } from "../modules/pdc.nf"
+include { METADATA_TO_SKY_ANNOTATIONS } from "../modules/pdc.nf"
+include { GET_FILE } from "../modules/pdc.nf"
+include { MSCONVERT } from "../modules/msconvert.nf"
+
+// Resolve the PDC study metadata, Skyline annotations csv and study name, either
+// by querying the PDC or from a user-supplied metadata tsv (params.pdc.metadata_tsv).
+workflow get_pdc_study_metadata {
+    main:
+        if(params.pdc.metadata_tsv == null) {
+            GET_STUDY_METADATA(params.pdc.study_id)
+            metadata = GET_STUDY_METADATA.out.metadata
+            annotations_csv = GET_STUDY_METADATA.out.skyline_annotations
+            study_name = GET_STUDY_METADATA.out.study_name
+        } else {
+            metadata = Channel.fromPath(file(params.pdc.metadata_tsv, checkIfExists: true))
+            METADATA_TO_SKY_ANNOTATIONS(metadata)
+            annotations_csv = METADATA_TO_SKY_ANNOTATIONS.out.skyline_annotations
+            // GET_STUDY_METADATA is not run in this branch, so the name comes from the
+            // parameters; fall back to the study id when pdc.study_name is unset.
+            study_name = params.pdc.study_name ?: params.pdc.study_id
+        }
+
+    emit:
+        study_name
+        metadata
+        annotations_csv
+}
+
+// Download every file listed in the study metadata and convert it with MSCONVERT.
+workflow get_pdc_files {
+    main:
+        get_pdc_study_metadata()
+        metadata = get_pdc_study_metadata.out.metadata
+        annotations_csv = get_pdc_study_metadata.out.annotations_csv
+        study_name = get_pdc_study_metadata.out.study_name
+
+        // one (url, file_name, md5sum) tuple per row of the tab-separated metadata table
+        metadata \
+            | splitCsv(header:true, sep:'\t') \
+            | map{row -> tuple(row.url, row.file_name, row.md5sum)} \
+            | GET_FILE
+
+        MSCONVERT(GET_FILE.out.downloaded_file,
+                  params.msconvert.do_demultiplex,
+                  params.msconvert.do_simasspectra)
+
+        wide_mzml_ch = MSCONVERT.out.mzml_file
+
+    emit:
+        study_name
+        metadata
+        annotations_csv
+        wide_mzml_ch
+}
+