stjudecloud · a-frantz · Nov 1, 2024 · Aug 13, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/tools/htseq.wdl b/tools/htseq.wdl
@@ -6,12 +6,12 @@ task count {
     meta {
         description: "Performs read counting for a set of features in the input BAM file"
         outputs: {
-            feature_counts: "A two column headerless TSV file. First column is feature names and second column is counts."
+            feature_counts: "A two column TSV file. First column is feature names and second column is counts. Presence of a header is determined by the `include_custom_header` parameter."
         }
     }
 
     parameter_meta {
-        bam: "Input BAM format file to generate coverage for"
+        bam: "Input BAM format file to generate feature counts for"
         gtf: "Input genomic features in gzipped GTF format to count reads for"
         strandedness: {
             description: "Strandedness protocol of the RNA-Seq experiment",
@@ -41,11 +41,11 @@ task count {
             ]
         }
         include_custom_header: {
-            description: "Include a custom header for the output file? This is not an official feature of HTSeq. If true, the first line of the output file will be `feature\t~{prefix}`. This may break downstream tools that expect the typical headerless HTSeq output format.",
+            description: "Include a custom header for the output file? This is not an official feature of HTSeq. If true, the first line of the output file will be `~{idattr}\t~{prefix}`. This may break downstream tools that expect the typical headerless HTSeq output format.",
             common: true
         }
         pos_sorted: {
-            description: "Is the BAM position sorted (true) or name sorted (false)?",
+            description: "Is the BAM position sorted (true) or name sorted (false)? It is **highly** recommended to use a name sorted BAM file. This is because HTSeq will re-sort position-sorted BAMs with an inefficient algorithm, causing very large memory and disk space allocations (especially for large BAMs).",
             common: true
         }
         nonunique: {
@@ -76,8 +76,8 @@ task count {
         String feature_type = "exon"
         String idattr = "gene_name"
         String mode = "union"
-        Boolean include_custom_header = false
-        Boolean pos_sorted = true
+        Boolean include_custom_header = true
+        Boolean pos_sorted = false
         Boolean nonunique = false
         Boolean secondary_alignments = false
         Boolean supplementary_alignments = false
@@ -91,15 +91,15 @@ task count {
     Float bam_size = size(bam, "GiB")
     Float gtf_size = size(gtf, "GiB")
 
-    Int memory_gb = ceil(bam_size) + 5 + modify_memory_gb
+    Int memory_gb = (if pos_sorted then ceil(bam_size) + 4 else 4) + modify_memory_gb
 
-    Int disk_size_gb = ceil((bam_size + gtf_size) * 4) + 10 + modify_disk_size_gb
+    Int disk_size_gb = ceil((bam_size + gtf_size) * if pos_sorted then 4 else 1) + 10 + modify_disk_size_gb
 
     command <<<
         set -euo pipefail
 
         if ~{include_custom_header}; then
-            echo -e "feature\t~{prefix}" > ~{outfile_name}
+            echo -e "~{idattr}\t~{prefix}" > ~{outfile_name}
         else
             true > ~{outfile_name}  # ensure file is empty
         fi
@@ -146,15 +146,17 @@ task calc_tpm {
     }
 
     parameter_meta {
-        counts: "A two column headerless TSV file with gene names in the first column and counts (as integers) in the second column. Entries starting with '__' will be discarded. Can be generated with the `count` task."
+        counts: "A two column TSV file with gene names in the first column and counts (as integers) in the second column. Entries starting with '__' will be discarded. Can be generated with the `count` task."
         gene_lengths: "A two column headered TSV file with gene names (matching those in the `counts` file) in the first column and feature lengths (as integers) in the second column. Can be generated with the `calc_gene_lengths` task in `util.wdl`."
         prefix: "Prefix for the TPM file. The extension `.TPM.txt` will be added."
+        has_header: "Does the `counts` file have a header line? If true, the first line will be ignored."
     }
 
     input {
         File counts
         File gene_lengths
         String prefix = basename(counts, ".feature-counts.txt")
+        Boolean has_header = true
     }
 
     String outfile_name = prefix + ".TPM.txt"
@@ -166,6 +168,8 @@ task calc_tpm {
 
         counts_file = open(os.environ['COUNTS'], 'r')
         counts = {}
+        if "~{has_header}" == "true":
+            counts_file.readline()
         for line in counts_file:
             gene, count = line.split('\t')
             if gene[0:2] == '__':

diff --git a/tools/star.wdl b/tools/star.wdl
@@ -626,8 +626,6 @@ task alignment {
         ) + 10 + modify_disk_size_gb
     )
 
-    Array[File] empty_array = []  # odd construction forced by WDL v1.1 spec
-
     #@ except: LineWidth
     command <<<
         set -euo pipefail
@@ -643,12 +641,8 @@ task alignment {
         # and limitations of the WDL v1.1 spec
         python3 /home/sort_star_input.py \
             --read-one-fastqs "~{sep(",", read_one_fastqs_gz)}" \
-            ~{if (read_two_fastqs_gz != empty_array) then "--read-two-fastqs" else ""} "~{
-                sep(",", (
-                    if (read_two_fastqs_gz != empty_array)
-                    then read_two_fastqs_gz
-                    else []
-                ))
+            ~{if (length(read_two_fastqs_gz) != 0) then "--read-two-fastqs" else ""} "~{
+                sep(",", (read_two_fastqs_gz))
             }" \
             ~{if defined(read_groups) then "--read-groups" else ""} "~{
                 if defined(read_groups)

diff --git a/workflows/rnaseq/rnaseq-core.wdl b/workflows/rnaseq/rnaseq-core.wdl
@@ -207,6 +207,7 @@ workflow rnaseq_core {
                 then ngsderive_strandedness.strandedness_string
                 else provided_strandedness
             ),
+        pos_sorted = true,
     }
 
     output {