Merge pull request #17 from ajmaurais/pdc
Add PDC as an input file source
mriffle authored Sep 11, 2024
2 parents 9faf68b + ef83492 commit f33acdf
Showing 7 changed files with 223 additions and 27 deletions.
1 change: 1 addition & 0 deletions container_images.config
@@ -4,6 +4,7 @@ params {
diann: 'quay.io/protio/diann:1.8.1',
bibliospec: 'quay.io/protio/bibliospec-linux:3.0',
panorama_client: 'quay.io/protio/panorama-client:1.1.0',
pdc_client: 'quay.io/mauraisa/pdc_client:0.15',
encyclopedia: 'quay.io/protio/encyclopedia:2.12.30-2',
encyclopedia3_mriffle: 'quay.io/protio/encyclopedia:3.0.0-MRIFFLE',
qc_pipeline: 'quay.io/mauraisa/dia_qc_report:2.2.4',
12 changes: 12 additions & 0 deletions docs/source/workflow_parameters.rst
@@ -77,6 +77,18 @@ The ``params`` Section
* -
- ``search_engine``
- Must be set to either ``'encyclopedia'`` or ``'diann'``. If set to ``'diann'``, ``chromatogram_library_spectra_dir``, ``chromatogram_library_spectra_glob``, and EncyclopeDIA-specific parameters will be ignored. Default: ``'encyclopedia'``.
* -
- ``pdc.study_id``
- When this option is set, raw files and metadata will be downloaded from the PDC (Proteomic Data Commons). Default: ``null``.
* -
- ``pdc.gene_level_data``
- A ``tsv`` file mapping gene names to NCBI gene IDs and gene metadata. Required for PDC gene reports. Default: ``null``.
* -
- ``pdc.n_raw_files``
- If this option is set, only ``n`` raw files are downloaded. This is useful for testing but should otherwise be ``null``. Default: ``null``.
* -
- ``pdc.client_args``
- Additional command line arguments passed to ``PDC_client``. Default: ``null``. An example configuration using these PDC options is sketched below.
* -
- ``skyline.skip``
- If set to ``true``, will skip the creation of a Skyline document. Default: ``false``.
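As a rough illustration of the PDC options documented above, a PDC-driven run could be configured with a ``params`` block along these lines. This is a sketch only: the study accession and file path are hypothetical placeholders, not values taken from this commit.

params {
    pdc.study_id        = 'PDC000000'                      // hypothetical PDC study accession
    pdc.n_raw_files     = 2                                 // optional: limit downloads while testing
    pdc.gene_level_data = '/path/to/gene_level_data.tsv'    // only needed for PDC gene reports
    pdc.client_args     = null                              // extra arguments forwarded to PDC_client
}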
76 changes: 49 additions & 27 deletions main.nf
@@ -16,13 +16,15 @@ include { generate_dia_qc_report } from "./workflows/generate_qc_report"
include { panorama_upload_results } from "./workflows/panorama_upload"
include { panorama_upload_mzmls } from "./workflows/panorama_upload"
include { save_run_details } from "./workflows/save_run_details"
include { get_pdc_files } from "./workflows/get_pdc_files"

// modules
include { ENCYCLOPEDIA_BLIB_TO_DLIB } from "./modules/encyclopedia"
include { ENCYCLOPEDIA_DLIB_TO_TSV } from "./modules/encyclopedia"
include { BLIB_BUILD_LIBRARY } from "./modules/diann"
include { GET_AWS_USER_ID } from "./modules/aws"
include { BUILD_AWS_SECRETS } from "./modules/aws"
include { EXPORT_GENE_REPORTS } from "./modules/qc_report"

// useful functions and variables
include { param_to_list } from "./workflows/get_input_files"
@@ -96,21 +98,34 @@ workflow {
aws_secret_id = Channel.of('none').collect() // ensure this is a value channel
}

// only perform msconvert and terminate
if(params.msconvert_only) {
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls
// get mzML files
if(params.pdc.study_id) {
get_pdc_files()
wide_mzml_ch = get_pdc_files.out.wide_mzml_ch
pdc_study_name = get_pdc_files.out.study_name
} else {
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id)
wide_mzml_ch = get_wide_mzmls.out.mzml_ch
}
narrow_mzml_ch = null
if(params.chromatogram_library_spectra_dir != null) {
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)

narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
} else {
all_mzml_ch = wide_mzml_ch
}

if(params.chromatogram_library_spectra_dir != null) {
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)

narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
} else {
all_mzml_ch = wide_mzml_ch
}
// only perform msconvert and terminate
if(params.msconvert_only) {
// save details about this run
input_files = all_mzml_ch.map{ it -> ['Spectra File', it.baseName] }
version_files = Channel.empty()
save_run_details(input_files.collect(), version_files.collect())
run_details_file = save_run_details.out.run_details

// save details about this run
input_files = all_mzml_ch.map{ it -> ['Spectra File', it.baseName] }
@@ -120,7 +135,6 @@

// if requested, upload mzMLs to panorama
if(params.panorama.upload) {

panorama_upload_mzmls(
params.panorama.upload_url,
all_mzml_ch,
@@ -134,19 +148,23 @@
}

get_input_files(aws_secret_id) // get input files
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls

// set up some convenience variables

if(params.spectral_library) {
spectral_library = get_input_files.out.spectral_library
} else {
spectral_library = Channel.empty()
}

if(params.pdc.study_id) {
if(params.replicate_metadata) {
log.warn "params.replicate_metadata will be overritten by PDC metadata"
}
replicate_metadata = get_pdc_files.out.annotations_csv
} else {
replicate_metadata = get_input_files.out.replicate_metadata
}
fasta = get_input_files.out.fasta
skyline_template_zipfile = get_input_files.out.skyline_template_zipfile
wide_mzml_ch = get_wide_mzmls.out.mzml_ch
skyr_file_ch = get_input_files.out.skyr_files

final_elib = null
@@ -174,13 +192,6 @@

// create elib if requested
if(params.chromatogram_library_spectra_dir != null) {
// get narrow windows mzmls
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)
narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch

all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)

// create chromatogram library
encyclopeda_export_elib(
@@ -326,16 +337,26 @@
// annotate skyline document if replicate_metadata was specified
if(params.replicate_metadata != null) {
skyline_annotate_doc(skyline_import.out.skyline_results,
get_input_files.out.replicate_metadata)
replicate_metadata)
final_skyline_file = skyline_annotate_doc.out.skyline_results
} else {
final_skyline_file = skyline_import.out.skyline_results
}

// generate QC report
if(!params.qc_report.skip) {
generate_dia_qc_report(final_skyline_file, get_input_files.out.replicate_metadata)
generate_dia_qc_report(final_skyline_file, replicate_metadata)
dia_qc_version = generate_dia_qc_report.out.dia_qc_version

// Export PDC gene tables
if(params.pdc.gene_level_data != null) {
EXPORT_GENE_REPORTS(generate_dia_qc_report.out.qc_report_db,
params.pdc.gene_level_data,
pdc_study_name)
EXPORT_GENE_REPORTS.out.gene_reports | flatten | set{ gene_reports }
} else {
gene_reports = Channel.empty()
}
} else {
dia_qc_version = Channel.empty()
}
@@ -360,6 +381,7 @@
qc_report_files = Channel.empty()
proteowizard_version = Channel.empty()
dia_qc_version = Channel.empty()
gene_reports = Channel.empty()
}

version_files = encyclopedia_version.concat(diann_version,
75 changes: 75 additions & 0 deletions modules/pdc.nf
@@ -0,0 +1,75 @@

def format_client_args(var) {
ret = (var == null ? "" : var)
return ret
}

process GET_STUDY_METADATA {
publishDir "${params.result_dir}/pdc", failOnError: true, mode: 'copy'
errorStrategy 'retry'
maxRetries 5
label 'process_low_constant'
container params.images.pdc_client

input:
val pdc_study_id

output:
path('study_metadata.tsv'), emit: metadata
path('study_metadata_annotations.csv'), emit: skyline_annotations
env(study_id), emit: study_id
env(study_name), emit: study_name
path('pdc_client_version.txt'), emit: version

shell:
n_files_arg = params.pdc.n_raw_files == null ? "" : "--nFiles ${params.pdc.n_raw_files}"
pdc_client_args = params.pdc.client_args == null ? "" : params.pdc.client_args

'''
study_id=$(PDC_client studyID !{pdc_client_args} !{pdc_study_id} | tee study_id.txt)
study_name=$(PDC_client studyName --normalize !{pdc_client_args} ${study_id} | tee study_name.txt)
PDC_client metadata !{pdc_client_args} -f tsv !{n_files_arg} --skylineAnnotations ${study_id}
echo "pdc_client_git_repo='$GIT_REPO - $GIT_BRANCH [$GIT_SHORT_HASH]'" > pdc_client_version.txt
'''
}

process METADATA_TO_SKY_ANNOTATIONS {
label 'process_low_constant'
container params.images.pdc_client

input:
path pdc_study_metadata

output:
path('skyline_annotations.csv'), emit: skyline_annotations

shell:
'''
PDC_client metadataToSky !{pdc_study_metadata}
'''
}

process GET_FILE {
storeDir "${params.panorama_cache_directory}"
label 'process_low_constant'
container params.images.pdc_client
errorStrategy 'retry'
maxRetries 1

input:
tuple val(url), val(file_name), val(md5)

output:
path(file_name), emit: downloaded_file

shell:
'''
PDC_client file -o '!{file_name}' -m '!{md5}' '!{url}'
'''

stub:
"""
touch ${file_name}
"""
}
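GET_STUDY_METADATA returns the study ID and study name through Nextflow's env output qualifier: a variable assigned in the shell block is captured after the task finishes and emitted on the matching output channel. A minimal, self-contained sketch of that pattern follows; it is not part of this commit, and the echoed value is a hypothetical placeholder.

// Minimal sketch of the env-output pattern used by GET_STUDY_METADATA above.
// 'PDC000000' is a made-up placeholder, not a real study ID.
process ENV_OUTPUT_DEMO {
    output:
    env(study_id), emit: study_id

    shell:
    '''
    study_id=$(echo 'PDC000000')
    '''
}

workflow {
    ENV_OUTPUT_DEMO()
    ENV_OUTPUT_DEMO.out.study_id.view()   // prints PDC000000
}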
28 changes: 28 additions & 0 deletions modules/qc_report.nf
@@ -151,3 +151,31 @@ process RENDER_QC_REPORT {
"""
}

process EXPORT_GENE_REPORTS {
publishDir "${params.result_dir}/gene_reports", failOnError: true, mode: 'copy'
label 'process_high_memory'
container params.images.qc_pipeline

input:
path batch_db
path gene_level_data
val file_prefix

output:
path("*.tsv"), emit: gene_reports
path("*.stdout"), emit: stdout
path("*.stderr"), emit: stderr

script:
"""
dia_qc export_gene_matrix --prefix=${file_prefix} --useAliquotId \
'${gene_level_data}' '${batch_db}' \
> >(tee "export_reports.stdout") 2> >(tee "export_reports.stderr" >&2)
"""

stub:
"""
touch stub.tsv
touch stub.stdout stub.stderr
"""
}
7 changes: 7 additions & 0 deletions nextflow.config
@@ -23,6 +23,13 @@ params {
skip_skyline = null
skyline_skyr_file = null

// Optional PDC study settings
pdc.client_args = ''
pdc.study_id = null
pdc.n_raw_files = null
pdc.metadata_tsv = null
pdc.gene_level_data = null

// The final skyline document will be named using this name. For example,
// if skyline_custom_name = 'human_dia' then the final Skyline document
// will be named "human_dia.sky.zip". When importing into PanoramaWeb--this
51 changes: 51 additions & 0 deletions workflows/get_pdc_files.nf
@@ -0,0 +1,51 @@

include { GET_STUDY_METADATA } from "../modules/pdc.nf"
include { METADATA_TO_SKY_ANNOTATIONS } from "../modules/pdc.nf"
include { GET_FILE } from "../modules/pdc.nf"
include { MSCONVERT } from "../modules/msconvert.nf"

workflow get_pdc_study_metadata {
emit:
study_name
metadata
annotations_csv

main:
if(params.pdc.metadata_tsv == null) {
GET_STUDY_METADATA(params.pdc.study_id)
metadata = GET_STUDY_METADATA.out.metadata
annotations_csv = GET_STUDY_METADATA.out.skyline_annotations
study_name = GET_STUDY_METADATA.out.study_name
} else {
metadata = Channel.fromPath(file(params.pdc.metadata_tsv, checkIfExists: true))
METADATA_TO_SKY_ANNOTATIONS(metadata)
annotations_csv = METADATA_TO_SKY_ANNOTATIONS.out
study_name = params.pdc.study_name
}
}

workflow get_pdc_files {
emit:
study_name
metadata
annotations_csv
wide_mzml_ch

main:
get_pdc_study_metadata()
metadata = get_pdc_study_metadata.out.metadata
annotations_csv = get_pdc_study_metadata.out.annotations_csv
study_name = get_pdc_study_metadata.out.study_name

metadata \
| splitCsv(header:true, sep:'\t') \
| map{row -> tuple(row.url, row.file_name, row.md5sum)} \
| GET_FILE

MSCONVERT(GET_FILE.out.downloaded_file,
params.msconvert.do_demultiplex,
params.msconvert.do_simasspectra)

wide_mzml_ch = MSCONVERT.out.mzml_file
}
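The splitCsv/map stage above implies that the study metadata TSV provides at least url, file_name, and md5sum columns; any other columns are ignored at this point. The self-contained sketch below is illustrative only (the URL, file name, and checksum are made up) and shows the shape of the tuples that would be handed to GET_FILE.

// Standalone illustration of the splitCsv/map stage in get_pdc_files.
// The inline string stands in for the metadata TSV; all values are fake.
workflow {
    Channel.of(
            'url\tfile_name\tmd5sum\n' +
            'https://example.org/raw/sample_01.raw\tsample_01.raw\td41d8cd98f00b204e9800998ecf8427e'
        )
        .splitCsv(header: true, sep: '\t')
        .map { row -> tuple(row.url, row.file_name, row.md5sum) }
        .view()   // emits (url, file_name, md5) tuples, one per metadata row
}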
