From 12284033d48d4163d115c0a337643da2e66e24d2 Mon Sep 17 00:00:00 2001
From: "Stephanie J. Spielman" <spielman@rowan.edu>
Date: Fri, 13 Dec 2019 11:45:55 -0500
Subject: [PATCH 01/11] Started a template markdown file for contributors to
 provide and track the source and description of data files

---
 DATA-DESCRIPTION.md | 63 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 DATA-DESCRIPTION.md

diff --git a/DATA-DESCRIPTION.md b/DATA-DESCRIPTION.md
new file mode 100644
index 0000000000..e4f2aae2f6
--- /dev/null
+++ b/DATA-DESCRIPTION.md
@@ -0,0 +1,63 @@
+## Data file descriptions
+
+This document contains information about all data files associated with this project. Each file should have the following association information:
+
++ **File type** should be one of..
+	+ *Reference file*: Obtained from an external source/database. When known, the obtained data and a link to the external source should be included.
+	+ *Modified reference file*: Obtained from an external source/database but modified for OpenPBTA use. 	
+	+ *External data file*: Data directly obtained from the cancer samples databases that be. When known, the specific database and download date should be included.
+	+ *Analysis file*: Any file created by a script in `analyses/*`. 
++ **Associated analyses**
+	+ A relative link to the specific analyses that use the data
+		+ For any files which are generally applicable to many/most/all analyses, please write *Universal* in this field
+	+ When applicable, a link to the specific *script* that produced (or modified, for *Modified reference file* types) the data
++ **File description**
+	+ A *brief* one sentence description of what the file contains (e.g., bed files contain coordinates for features XYZ).
+
+
+
+### current release (release-v11-20191126)
+
+| **File name** |  **File Type** | **Associated analysis** | **File Description** |
+|---------------|----------------|------------------------|-----------------------|
+|`GRCh38.primary_assembly.genome.fa.gz` | | |  |
+|`StrexomeLite_Targets_CrossMap_hg38_filtered_chr_prefixed.bed` | | |
+|`StrexomeLite_hg38_liftover_100bp_padded.bed`| | |
+|`WGS.hg38.lancet.300bp_padded.bed` | | |
+|`WGS.hg38.lancet.unpadded.bed` | | |
+|`WGS.hg38.mutect2.unpadded.bed` | | |
+|`WGS.hg38.strelka2.unpadded.bed` | ||
+|`WGS.hg38.vardict.100bp_padded.bed` | ||
+|`WXS.hg38.100bp_padded.bed` | ||
+|`gencode.v27.primary_assembly.annotation.gtf.gz` | ||
+|`independent-specimens.wgs.primary-plus.tsv` | ||
+|`independent-specimens.wgs.primary.tsv` | ||
+|`independent-specimens.wgswxs.primary-plus.tsv` | ||
+|`independent-specimens.wgswxs.primary.tsv` | ||
+|`pbta-cnv-cnvkit.seg.gz` | ||
+|`pbta-cnv-controlfreec.tsv.gz` | ||
+|`pbta-fusion-arriba.tsv.gz` | ||
+|`pbta-fusion-putative-oncogenic.tsv` | ||
+|`pbta-fusion-starfusion.tsv.gz` | ||
+|`pbta-gene-counts-rsem-expected_count.polya.rds` | ||
+|`pbta-gene-counts-rsem-expected_count.stranded.rds` || |
+|`pbta-gene-expression-kallisto.polya.rds` | ||
+|`pbta-gene-expression-kallisto.stranded.rds` || |
+|`pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` | ||
+|`pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` || |
+|`pbta-gene-expression-rsem-fpkm.polya.rds` | ||
+|`pbta-gene-expression-rsem-fpkm.stranded.rds` || |
+|`pbta-gene-expression-rsem-tpm.polya.rds` | ||
+|`pbta-gene-expression-rsem-tpm.stranded.rds` || |
+|`pbta-histologies.tsv` | ||
+|`pbta-isoform-counts-rsem-expected_count.polya.rds` | ||
+|`pbta-isoform-counts-rsem-expected_count.stranded.rds` || |
+|`pbta-isoform-expression-rsem-tpm.polya.rds` | ||
+|`pbta-isoform-expression-rsem-tpm.stranded.rds` || |
+|`pbta-snv-consensus-mutation-tmb.tsv` | ||
+|`pbta-snv-consensus-mutation.maf.tsv.gz` || |
+|`pbta-snv-lancet.vep.maf.gz` | ||
+|`pbta-snv-mutect2.vep.maf.gz` | ||
+|`pbta-snv-strelka2.vep.maf.gz` | ||
+|`pbta-snv-vardict.vep.maf.gz` | ||
+|`pbta-sv-manta.tsv.gz`| ||
\ No newline at end of file

From e56299c2160c392daa77ce464307e1ffb3bcf80f Mon Sep 17 00:00:00 2001
From: Stephanie <spielman@rowan.edu>
Date: Fri, 13 Dec 2019 14:49:31 -0500
Subject: [PATCH 02/11] Update DATA-DESCRIPTION.md

Co-Authored-By: Jo Lynne <jharenza@gmail.com>
---
 DATA-DESCRIPTION.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DATA-DESCRIPTION.md b/DATA-DESCRIPTION.md
index e4f2aae2f6..3810457343 100644
--- a/DATA-DESCRIPTION.md
+++ b/DATA-DESCRIPTION.md
@@ -8,7 +8,7 @@ This document contains information about all data files associated with this pro
 	+ *External data file*: Data directly obtained from the cancer samples databases that be. When known, the specific database and download date should be included.
 	+ *Analysis file*: Any file created by a script in `analyses/*`. 
 + **Associated analyses**
-	+ A relative link to the specific analyses that use the data
+	+ A relative link to the specific analysis from which the file was generated.
 		+ For any files which are generally applicable to many/most/all analyses, please write *Universal* in this field
 	+ When applicable, a link to the specific *script* that produced (or modified, for *Modified reference file* types) the data
 + **File description**
@@ -60,4 +60,4 @@ This document contains information about all data files associated with this pro
 |`pbta-snv-mutect2.vep.maf.gz` | ||
 |`pbta-snv-strelka2.vep.maf.gz` | ||
 |`pbta-snv-vardict.vep.maf.gz` | ||
-|`pbta-sv-manta.tsv.gz`| ||
\ No newline at end of file
+|`pbta-sv-manta.tsv.gz`| ||

From c0716f212f1852f5a2e2e31d9c5359125cc93b75 Mon Sep 17 00:00:00 2001
From: Stephanie <spielman@rowan.edu>
Date: Fri, 13 Dec 2019 14:49:38 -0500
Subject: [PATCH 03/11] Update DATA-DESCRIPTION.md

Co-Authored-By: Jo Lynne <jharenza@gmail.com>
---
 DATA-DESCRIPTION.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DATA-DESCRIPTION.md b/DATA-DESCRIPTION.md
index 3810457343..cf20a8aafb 100644
--- a/DATA-DESCRIPTION.md
+++ b/DATA-DESCRIPTION.md
@@ -7,7 +7,7 @@ This document contains information about all data files associated with this pro
 	+ *Modified reference file*: Obtained from an external source/database but modified for OpenPBTA use. 	
 	+ *External data file*: Data directly obtained from the cancer samples databases that be. When known, the specific database and download date should be included.
 	+ *Analysis file*: Any file created by a script in `analyses/*`. 
-+ **Associated analyses**
++ **Origin**
 	+ A relative link to the specific analysis from which the file was generated.
 		+ For any files which are generally applicable to many/most/all analyses, please write *Universal* in this field
 	+ When applicable, a link to the specific *script* that produced (or modified, for *Modified reference file* types) the data

From 01dfc465ef9512c7b8e98502266e531cc53ef687 Mon Sep 17 00:00:00 2001
From: Stephanie <spielman@rowan.edu>
Date: Fri, 13 Dec 2019 14:49:43 -0500
Subject: [PATCH 04/11] Update DATA-DESCRIPTION.md

Co-Authored-By: Jo Lynne <jharenza@gmail.com>
---
 DATA-DESCRIPTION.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DATA-DESCRIPTION.md b/DATA-DESCRIPTION.md
index cf20a8aafb..a6049317a9 100644
--- a/DATA-DESCRIPTION.md
+++ b/DATA-DESCRIPTION.md
@@ -18,7 +18,7 @@ This document contains information about all data files associated with this pro
 
 ### current release (release-v11-20191126)
 
-| **File name** |  **File Type** | **Associated analysis** | **File Description** |
+| **File name** |  **File Type** | **Origin** | **File Description** |
 |---------------|----------------|------------------------|-----------------------|
 |`GRCh38.primary_assembly.genome.fa.gz` | | |  |
 |`StrexomeLite_Targets_CrossMap_hg38_filtered_chr_prefixed.bed` | | |

From d1ccaa884c76900b040a6e7cf4692c21dcd0dae2 Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <jaclyn.n.taroni@gmail.com>
Date: Sun, 15 Dec 2019 18:14:56 -0500
Subject: [PATCH 05/11] Move file to doc

---
 DATA-DESCRIPTION.md => doc/data-description.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename DATA-DESCRIPTION.md => doc/data-description.md (100%)

diff --git a/DATA-DESCRIPTION.md b/doc/data-description.md
similarity index 100%
rename from DATA-DESCRIPTION.md
rename to doc/data-description.md

From e32f5c39d9ddc06b1ecc51d7908368af36445ed7 Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <jaclyn.n.taroni@gmail.com>
Date: Sun, 15 Dec 2019 18:15:51 -0500
Subject: [PATCH 06/11] @jharenza suggested change

---
 doc/data-description.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/data-description.md b/doc/data-description.md
index a6049317a9..d2b9535d38 100644
--- a/doc/data-description.md
+++ b/doc/data-description.md
@@ -9,7 +9,6 @@ This document contains information about all data files associated with this pro
 	+ *Analysis file*: Any file created by a script in `analyses/*`. 
 + **Origin**
 	+ A relative link to the specific analysis from which the file was generated.
-		+ For any files which are generally applicable to many/most/all analyses, please write *Universal* in this field
 	+ When applicable, a link to the specific *script* that produced (or modified, for *Modified reference file* types) the data
 + **File description**
 	+ A *brief* one sentence description of what the file contains (e.g., bed files contain coordinates for features XYZ).

From 87a4f17bccfcfea2822cfbb86ddd31aada71865a Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <jaclyn.n.taroni@gmail.com>
Date: Sun, 15 Dec 2019 21:21:34 -0500
Subject: [PATCH 07/11] Introduce PBTA data concept

Change tense
---
 doc/data-description.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/doc/data-description.md b/doc/data-description.md
index d2b9535d38..52c1c5af0a 100644
--- a/doc/data-description.md
+++ b/doc/data-description.md
@@ -1,15 +1,15 @@
 ## Data file descriptions
 
-This document contains information about all data files associated with this project. Each file should have the following association information:
+This document contains information about all data files associated with this project. Each file will have the following associated information:
 
-+ **File type** should be one of..
-	+ *Reference file*: Obtained from an external source/database. When known, the obtained data and a link to the external source should be included.
-	+ *Modified reference file*: Obtained from an external source/database but modified for OpenPBTA use. 	
-	+ *External data file*: Data directly obtained from the cancer samples databases that be. When known, the specific database and download date should be included.
++ **File type** will be one of:
+	+ *Reference file*: Obtained from an external source/database. When known, the date obtained and a link to the external source are included.
+	+ *Modified reference file*: Obtained from an external source/database but modified for OpenPBTA use. 
+	+ *PBTA data file*: Pediatric Brain Tumor Atlas data that are processed upstream of the OpenPBTA project, e.g., the output of a somatic single nucleotide variant method. Links to the relevant D3B Center or Kids First workflow (and version where applicable) are included in **Origin**.
 	+ *Analysis file*: Any file created by a script in `analyses/*`. 
 + **Origin**
-	+ A relative link to the specific analysis from which the file was generated.
-	+ When applicable, a link to the specific *script* that produced (or modified, for *Modified reference file* types) the data
+	+ For _PBTA data files_, a link to the relevant D3B Center or Kids First workflow (and version where applicable).
+	+ When applicable, a link to the specific *script* that produced (or modified, for *Modified reference file* types) the data.
 + **File description**
 	+ A *brief* one sentence description of what the file contains (e.g., bed files contain coordinates for features XYZ).
 

From 9e0f962e9eb72665c2f13d6dd3f24a234938c76e Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <jaclyn.n.taroni@gmail.com>
Date: Sun, 15 Dec 2019 21:21:50 -0500
Subject: [PATCH 08/11] First pass at filling in the table

---
 doc/data-description.md | 66 ++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/doc/data-description.md b/doc/data-description.md
index 52c1c5af0a..d16fbfb226 100644
--- a/doc/data-description.md
+++ b/doc/data-description.md
@@ -19,7 +19,7 @@ This document contains information about all data files associated with this pro
 
 | **File name** |  **File Type** | **Origin** | **File Description** |
 |---------------|----------------|------------------------|-----------------------|
-|`GRCh38.primary_assembly.genome.fa.gz` | | |  |
+|`GRCh38.primary_assembly.genome.fa.gz` | Reference file | GENCODE v27 | hg38 primary assembly genome sequence FASTA file
 |`StrexomeLite_Targets_CrossMap_hg38_filtered_chr_prefixed.bed` | | |
 |`StrexomeLite_hg38_liftover_100bp_padded.bed`| | |
 |`WGS.hg38.lancet.300bp_padded.bed` | | |
@@ -28,35 +28,35 @@ This document contains information about all data files associated with this pro
 |`WGS.hg38.strelka2.unpadded.bed` | ||
 |`WGS.hg38.vardict.100bp_padded.bed` | ||
 |`WXS.hg38.100bp_padded.bed` | ||
-|`gencode.v27.primary_assembly.annotation.gtf.gz` | ||
-|`independent-specimens.wgs.primary-plus.tsv` | ||
-|`independent-specimens.wgs.primary.tsv` | ||
-|`independent-specimens.wgswxs.primary-plus.tsv` | ||
-|`independent-specimens.wgswxs.primary.tsv` | ||
-|`pbta-cnv-cnvkit.seg.gz` | ||
-|`pbta-cnv-controlfreec.tsv.gz` | ||
-|`pbta-fusion-arriba.tsv.gz` | ||
-|`pbta-fusion-putative-oncogenic.tsv` | ||
-|`pbta-fusion-starfusion.tsv.gz` | ||
-|`pbta-gene-counts-rsem-expected_count.polya.rds` | ||
-|`pbta-gene-counts-rsem-expected_count.stranded.rds` || |
-|`pbta-gene-expression-kallisto.polya.rds` | ||
-|`pbta-gene-expression-kallisto.stranded.rds` || |
-|`pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` | ||
-|`pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` || |
-|`pbta-gene-expression-rsem-fpkm.polya.rds` | ||
-|`pbta-gene-expression-rsem-fpkm.stranded.rds` || |
-|`pbta-gene-expression-rsem-tpm.polya.rds` | ||
-|`pbta-gene-expression-rsem-tpm.stranded.rds` || |
-|`pbta-histologies.tsv` | ||
-|`pbta-isoform-counts-rsem-expected_count.polya.rds` | ||
-|`pbta-isoform-counts-rsem-expected_count.stranded.rds` || |
-|`pbta-isoform-expression-rsem-tpm.polya.rds` | ||
-|`pbta-isoform-expression-rsem-tpm.stranded.rds` || |
-|`pbta-snv-consensus-mutation-tmb.tsv` | ||
-|`pbta-snv-consensus-mutation.maf.tsv.gz` || |
-|`pbta-snv-lancet.vep.maf.gz` | ||
-|`pbta-snv-mutect2.vep.maf.gz` | ||
-|`pbta-snv-strelka2.vep.maf.gz` | ||
-|`pbta-snv-vardict.vep.maf.gz` | ||
-|`pbta-sv-manta.tsv.gz`| ||
+|`gencode.v27.primary_assembly.annotation.gtf.gz` | Reference file | GENCODE v27 | hg38 gene annotation on primary assembly (reference chromosomes and scaffolds)
+|`independent-specimens.wgs.primary-plus.tsv` | Analysis file |[`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples)| Independent specimens list for WGS sample, primary + non-primary when no primary sample is available
+|`independent-specimens.wgs.primary.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS samples, primary only
+|`independent-specimens.wgswxs.primary-plus.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS and WXS samples, primary + non-primary when no primary sample is available
+|`independent-specimens.wgswxs.primary.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS and WXS samples, primary only
+|`pbta-cnv-cnvkit.seg.gz` | PBTA data file || Somatic Copy Number Variant - CNVkit [SEG file](https://cnvkit.readthedocs.io/en/stable/fileformats.html#seg)
+|`pbta-cnv-controlfreec.tsv.gz` | PBTA data file || Somatic Copy Number Variant - TSV file that is a merge of [ControlFreeC `*_CNVs` files](http://boevalab.inf.ethz.ch/FREEC/tutorial.html#OUTPUT)
+|`pbta-fusion-arriba.tsv.gz` | PBTA data file || Fusion - [Arriba TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/arriba-tsv-header.md)
+|`pbta-fusion-putative-oncogenic.tsv` | Analysis file | [`analyses/fusion_filtering`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/fusion_filtering) | Filtered and prioritized fusions 
+|`pbta-fusion-starfusion.tsv.gz` | PBTA data file || Fusion - [STARFusion TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/starfusion-tsv-header.md)
+|`pbta-gene-counts-rsem-expected_count.polya.rds` | PBTA data file || Gene expression - RSEM expected counts for poly-A samples (gene-level)
+|`pbta-gene-counts-rsem-expected_count.stranded.rds` | PBTA data file | | Gene expression - RSEM  expected counts for stranded samples (gene-level)
+|`pbta-gene-expression-kallisto.polya.rds` | PBTA data file || Gene expression - kallisto TPM for poly-A samples (transcript-level)
+|`pbta-gene-expression-kallisto.stranded.rds` | PBTA data file | | Gene expression - kallisto TPM for stranded samples (transcript-level)
+|`pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` | Analysis file | [`analyses/collapse-rnaseq`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/collapse-rnaseq) | Gene expression - RSEM FPKM for poly-A samples collapsed to gene symbol (gene-level)
+|`pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` | Analysis file | [`analyses/collapse-rnaseq`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/collapse-rnaseq) | Gene expression - RSEM FPKM for stranded samples collapsed to gene symbol (gene-level)
+|`pbta-gene-expression-rsem-fpkm.polya.rds` | PBTA data file || Gene expression - RSEM FPKM for poly-A samples (gene-level)
+|`pbta-gene-expression-rsem-fpkm.stranded.rds` | PBTA data file | | Gene expression - RSEM FPKM for stranded samples (gene-level)
+|`pbta-gene-expression-rsem-tpm.polya.rds` | PBTA data file || Gene expression - RSEM TPM for poly-A samples (gene-level)
+|`pbta-gene-expression-rsem-tpm.stranded.rds` | PBTA data file | | Gene expression -RSEM TPM for stranded samples (gene-level)
+|`pbta-histologies.tsv` | PBTA data file || Harmonized clinical metadata file (see data dictionary [here](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization))
+|`pbta-isoform-counts-rsem-expected_count.polya.rds` | PBTA data file || Gene expression -RSEM expected counts for poly-A samples (transcript-level)
+|`pbta-isoform-counts-rsem-expected_count.stranded.rds` | PBTA data file | |Gene expression - RSEM expected counts for stranded samples (transcript-level)
+|`pbta-isoform-expression-rsem-tpm.polya.rds` | PBTA data file || Gene expression - RSEM TPM for poly-A samples (transcript-level)
+|`pbta-isoform-expression-rsem-tpm.stranded.rds` | PBTA data file | | Gene expression - RSEM TPM for stranded samples (transcript-level)
+|`pbta-snv-consensus-mutation-tmb.tsv` | Analysis file | [`analyses/snv-callers`](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/analyses/snv-callers/) | Tumor mutation burden statistics calculated from consensus SNV, using Strelka2 counts and BED window sizes
+|`pbta-snv-consensus-mutation.maf.tsv.gz` | Analysis file | [`analyses/snv-callers`](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/analyses/snv-callers/)  | Consensus calls for SNVs and small indels; columns in the included file are derived from the Strelka2.
+|`pbta-snv-lancet.vep.maf.gz` | PBTA data file | | Somatic SNV - Lancet [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-mutect2.vep.maf.gz` | PBTA data file || Somatic SNV - Mutect2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-strelka2.vep.maf.gz` | PBTA data file || Somatic SNV - Strelka2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-vardict.vep.maf.gz` | PBTA data file || Somatic SNV - VarDict [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-sv-manta.tsv.gz`| PBTA data file || Somatic Structural Variant - Manta output, annotated with AnnotSV

From 3202804b0dbb81673c8310db0d4e1bf778dd3bc7 Mon Sep 17 00:00:00 2001
From: Jo Lynne <jolynnerokita@d3b.center>
Date: Mon, 16 Dec 2019 09:36:38 -0500
Subject: [PATCH 09/11] update data descriptions

Add missing fields
---
 doc/data-description.md | 62 ++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/doc/data-description.md b/doc/data-description.md
index d16fbfb226..1ec96e524c 100644
--- a/doc/data-description.md
+++ b/doc/data-description.md
@@ -19,44 +19,44 @@ This document contains information about all data files associated with this pro
 
 | **File name** |  **File Type** | **Origin** | **File Description** |
 |---------------|----------------|------------------------|-----------------------|
-|`GRCh38.primary_assembly.genome.fa.gz` | Reference file | GENCODE v27 | hg38 primary assembly genome sequence FASTA file
-|`StrexomeLite_Targets_CrossMap_hg38_filtered_chr_prefixed.bed` | | |
-|`StrexomeLite_hg38_liftover_100bp_padded.bed`| | |
-|`WGS.hg38.lancet.300bp_padded.bed` | | |
-|`WGS.hg38.lancet.unpadded.bed` | | |
-|`WGS.hg38.mutect2.unpadded.bed` | | |
-|`WGS.hg38.strelka2.unpadded.bed` | ||
-|`WGS.hg38.vardict.100bp_padded.bed` | ||
-|`WXS.hg38.100bp_padded.bed` | ||
+|`GRCh38.primary_assembly.genome.fa.gz` | Reference Genome file | GENCODE v27 | hg38 primary assembly genome sequence FASTA file
+|`StrexomeLite_Targets_CrossMap_hg38_filtered_chr_prefixed.bed` | Target/Baits File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 targeted DNA panel bait capture regions provided by the kit manufacturer
+|`StrexomeLite_hg38_liftover_100bp_padded.bed`| Reference Target/Baits File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 targeted panel regions used for all variant callers, each region padded by 100 bp
+|`WGS.hg38.lancet.300bp_padded.bed` | Reference Target/Baits File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | WGS.hg38.lancet.unpadded.bed file with each region padded by 300 bp
+|`WGS.hg38.lancet.unpadded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) |  hg38 WGS regions created using UTR, exome, and start/stop codon features of the GENCODE 31 reference, augmented with PASS variant calls from Strelka2 and Mutect2
+|`WGS.hg38.mutect2.unpadded.bed` | Reference Regions File  | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) |  hg38 BROAD Institute interval calling list used for Mutect2 variant caller
+|`WGS.hg38.strelka2.unpadded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 BROAD Institute interval calling list used for Strelka2 variant caller
+|`WGS.hg38.vardict.100bp_padded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 BROAD Institute interval calling list used for VarDict variant caller with each region padded by 100 bp
+|`WXS.hg38.100bp_padded.bed` | Reference Target/Baits File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 WXS regions provided by the kit manufacturer used for all variant callers with each region padded by 100 bp
 |`gencode.v27.primary_assembly.annotation.gtf.gz` | Reference file | GENCODE v27 | hg38 gene annotation on primary assembly (reference chromosomes and scaffolds)
 |`independent-specimens.wgs.primary-plus.tsv` | Analysis file |[`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples)| Independent specimens list for WGS sample, primary + non-primary when no primary sample is available
 |`independent-specimens.wgs.primary.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS samples, primary only
 |`independent-specimens.wgswxs.primary-plus.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS and WXS samples, primary + non-primary when no primary sample is available
 |`independent-specimens.wgswxs.primary.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS and WXS samples, primary only
-|`pbta-cnv-cnvkit.seg.gz` | PBTA data file || Somatic Copy Number Variant - CNVkit [SEG file](https://cnvkit.readthedocs.io/en/stable/fileformats.html#seg)
-|`pbta-cnv-controlfreec.tsv.gz` | PBTA data file || Somatic Copy Number Variant - TSV file that is a merge of [ControlFreeC `*_CNVs` files](http://boevalab.inf.ethz.ch/FREEC/tutorial.html#OUTPUT)
-|`pbta-fusion-arriba.tsv.gz` | PBTA data file || Fusion - [Arriba TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/arriba-tsv-header.md)
+|`pbta-cnv-cnvkit.seg.gz` | PBTA data file | [Copy number variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-copy-number-variant-calling) | Somatic Copy Number Variant - CNVkit [SEG file](https://cnvkit.readthedocs.io/en/stable/fileformats.html#seg)
+|`pbta-cnv-controlfreec.tsv.gz` | PBTA data file | [Copy number variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-copy-number-variant-calling) | Somatic Copy Number Variant - TSV file that is a merge of [ControlFreeC `*_CNVs` files](http://boevalab.inf.ethz.ch/FREEC/tutorial.html#OUTPUT)
+|`pbta-fusion-arriba.tsv.gz` | PBTA data file | [Gene fusion detection](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-fusion-detection) | Fusion - [Arriba TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/arriba-tsv-header.md)
 |`pbta-fusion-putative-oncogenic.tsv` | Analysis file | [`analyses/fusion_filtering`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/fusion_filtering) | Filtered and prioritized fusions 
-|`pbta-fusion-starfusion.tsv.gz` | PBTA data file || Fusion - [STARFusion TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/starfusion-tsv-header.md)
-|`pbta-gene-counts-rsem-expected_count.polya.rds` | PBTA data file || Gene expression - RSEM expected counts for poly-A samples (gene-level)
-|`pbta-gene-counts-rsem-expected_count.stranded.rds` | PBTA data file | | Gene expression - RSEM  expected counts for stranded samples (gene-level)
-|`pbta-gene-expression-kallisto.polya.rds` | PBTA data file || Gene expression - kallisto TPM for poly-A samples (transcript-level)
-|`pbta-gene-expression-kallisto.stranded.rds` | PBTA data file | | Gene expression - kallisto TPM for stranded samples (transcript-level)
+|`pbta-fusion-starfusion.tsv.gz` | PBTA data file | [Gene fusion detection](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-fusion-detection) | Fusion - [STARFusion TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/starfusion-tsv-header.md)
+|`pbta-gene-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM expected counts for poly-A samples (gene-level)
+|`pbta-gene-counts-rsem-expected_count.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - RSEM expected counts for stranded samples (gene-level)
+|`pbta-gene-expression-kallisto.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - kallisto TPM for poly-A samples (transcript-level)
+|`pbta-gene-expression-kallisto.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - kallisto TPM for stranded samples (transcript-level)
 |`pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` | Analysis file | [`analyses/collapse-rnaseq`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/collapse-rnaseq) | Gene expression - RSEM FPKM for poly-A samples collapsed to gene symbol (gene-level)
 |`pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` | Analysis file | [`analyses/collapse-rnaseq`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/collapse-rnaseq) | Gene expression - RSEM FPKM for stranded samples collapsed to gene symbol (gene-level)
-|`pbta-gene-expression-rsem-fpkm.polya.rds` | PBTA data file || Gene expression - RSEM FPKM for poly-A samples (gene-level)
-|`pbta-gene-expression-rsem-fpkm.stranded.rds` | PBTA data file | | Gene expression - RSEM FPKM for stranded samples (gene-level)
-|`pbta-gene-expression-rsem-tpm.polya.rds` | PBTA data file || Gene expression - RSEM TPM for poly-A samples (gene-level)
-|`pbta-gene-expression-rsem-tpm.stranded.rds` | PBTA data file | | Gene expression -RSEM TPM for stranded samples (gene-level)
-|`pbta-histologies.tsv` | PBTA data file || Harmonized clinical metadata file (see data dictionary [here](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization))
-|`pbta-isoform-counts-rsem-expected_count.polya.rds` | PBTA data file || Gene expression -RSEM expected counts for poly-A samples (transcript-level)
-|`pbta-isoform-counts-rsem-expected_count.stranded.rds` | PBTA data file | |Gene expression - RSEM expected counts for stranded samples (transcript-level)
-|`pbta-isoform-expression-rsem-tpm.polya.rds` | PBTA data file || Gene expression - RSEM TPM for poly-A samples (transcript-level)
-|`pbta-isoform-expression-rsem-tpm.stranded.rds` | PBTA data file | | Gene expression - RSEM TPM for stranded samples (transcript-level)
+|`pbta-gene-expression-rsem-fpkm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM FPKM for poly-A samples (gene-level)
+|`pbta-gene-expression-rsem-fpkm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - RSEM FPKM for stranded samples (gene-level)
+|`pbta-gene-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for poly-A samples (gene-level)
+|`pbta-gene-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression -RSEM TPM for stranded samples (gene-level)
+|`pbta-histologies.tsv` | PBTA data file | [Clinical data harmonization](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization) | Harmonized clinical metadata file (see data dictionary [here](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization))
+|`pbta-isoform-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression -RSEM expected counts for poly-A samples (transcript-level)
+|`pbta-isoform-counts-rsem-expected_count.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) |Gene expression - RSEM expected counts for stranded samples (transcript-level)
+|`pbta-isoform-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for poly-A samples (transcript-level)
+|`pbta-isoform-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for stranded samples (transcript-level)
 |`pbta-snv-consensus-mutation-tmb.tsv` | Analysis file | [`analyses/snv-callers`](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/analyses/snv-callers/) | Tumor mutation burden statistics calculated from consensus SNV, using Strelka2 counts and BED window sizes
 |`pbta-snv-consensus-mutation.maf.tsv.gz` | Analysis file | [`analyses/snv-callers`](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/analyses/snv-callers/)  | Consensus calls for SNVs and small indels; columns in the included file are derived from the Strelka2.
-|`pbta-snv-lancet.vep.maf.gz` | PBTA data file | | Somatic SNV - Lancet [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-snv-mutect2.vep.maf.gz` | PBTA data file || Somatic SNV - Mutect2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-snv-strelka2.vep.maf.gz` | PBTA data file || Somatic SNV - Strelka2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-snv-vardict.vep.maf.gz` | PBTA data file || Somatic SNV - VarDict [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-sv-manta.tsv.gz`| PBTA data file || Somatic Structural Variant - Manta output, annotated with AnnotSV
+|`pbta-snv-lancet.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - Lancet [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-mutect2.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - Mutect2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-strelka2.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - Strelka2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-vardict.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - VarDict [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-sv-manta.tsv.gz`| PBTA data file | [Structural variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-structural-variant-calling) | Somatic Structural Variant - Manta output, annotated with AnnotSV

From 22db28dcca9a8c22ec022d92b724da42521f9e7b Mon Sep 17 00:00:00 2001
From: Jaclyn Taroni <jaclyn.n.taroni@gmail.com>
Date: Mon, 16 Dec 2019 10:21:04 -0500
Subject: [PATCH 10/11] Fix some description spacing

---
 doc/data-description.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/data-description.md b/doc/data-description.md
index 1ec96e524c..88f76ba382 100644
--- a/doc/data-description.md
+++ b/doc/data-description.md
@@ -47,9 +47,9 @@ This document contains information about all data files associated with this pro
 |`pbta-gene-expression-rsem-fpkm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM FPKM for poly-A samples (gene-level)
 |`pbta-gene-expression-rsem-fpkm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - RSEM FPKM for stranded samples (gene-level)
 |`pbta-gene-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for poly-A samples (gene-level)
-|`pbta-gene-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression -RSEM TPM for stranded samples (gene-level)
+|`pbta-gene-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - RSEM TPM for stranded samples (gene-level)
 |`pbta-histologies.tsv` | PBTA data file | [Clinical data harmonization](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization) | Harmonized clinical metadata file (see data dictionary [here](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization))
-|`pbta-isoform-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression -RSEM expected counts for poly-A samples (transcript-level)
+|`pbta-isoform-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM expected counts for poly-A samples (transcript-level)
 |`pbta-isoform-counts-rsem-expected_count.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) |Gene expression - RSEM expected counts for stranded samples (transcript-level)
 |`pbta-isoform-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for poly-A samples (transcript-level)
 |`pbta-isoform-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for stranded samples (transcript-level)

From 88abd4b799ccd465304910709b370e5167259b99 Mon Sep 17 00:00:00 2001
From: Jo Lynne <jolynnerokita@d3b.center>
Date: Mon, 16 Dec 2019 10:41:31 -0500
Subject: [PATCH 11/11] Update data-description.md

-add workflows
-note: `WGS.hg38.mutect2.unpadded.bed` should be renamed to `WGS.hg38.mutect2.vardict.unpadded.bed` in the next release, but kept as is for now since this description is for v11 files
---
 doc/data-description.md | 48 ++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/doc/data-description.md b/doc/data-description.md
index 88f76ba382..f5e385e770 100644
--- a/doc/data-description.md
+++ b/doc/data-description.md
@@ -24,39 +24,39 @@ This document contains information about all data files associated with this pro
 |`StrexomeLite_hg38_liftover_100bp_padded.bed`| Reference Target/Baits File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 targeted panel regions used for all variant callers, each region padded by 100 bp
 |`WGS.hg38.lancet.300bp_padded.bed` | Reference Target/Baits File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | WGS.hg38.lancet.unpadded.bed file with each region padded by 300 bp
 |`WGS.hg38.lancet.unpadded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) |  hg38 WGS regions created using UTR, exome, and start/stop codon features of the GENCODE 31 reference, augmented with PASS variant calls from Strelka2 and Mutect2
-|`WGS.hg38.mutect2.unpadded.bed` | Reference Regions File  | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) |  hg38 BROAD Institute interval calling list used for Mutect2 variant caller
-|`WGS.hg38.strelka2.unpadded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 BROAD Institute interval calling list used for Strelka2 variant caller
-|`WGS.hg38.vardict.100bp_padded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 BROAD Institute interval calling list used for VarDict variant caller with each region padded by 100 bp
+|`WGS.hg38.mutect2.unpadded.bed` | Reference Regions File  | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 Broad Institute interval calling list (restricted to Chr1-22,X,Y,M and non-N regions) used for Mutect2 and VarDict variant callers
+|`WGS.hg38.strelka2.unpadded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 Broad Institute interval calling list (restricted to Chr1-22,X,Y,M) used for Strelka2 variant caller
+|`WGS.hg38.vardict.100bp_padded.bed` | Reference Regions File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | WGS.hg38.mutect2.unpadded.bed file with each region padded by 100 bp, used for the VarDict variant caller
 |`WXS.hg38.100bp_padded.bed` | Reference Target/Baits File | [SNV and INDEL calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#snv-and-indel-calling) | hg38 WXS regions provided by the kit manufacturer used for all variant callers with each region padded by 100 bp
 |`gencode.v27.primary_assembly.annotation.gtf.gz` | Reference file | GENCODE v27 | hg38 gene annotation on primary assembly (reference chromosomes and scaffolds)
 |`independent-specimens.wgs.primary-plus.tsv` | Analysis file |[`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples)| Independent specimens list for WGS sample, primary + non-primary when no primary sample is available
 |`independent-specimens.wgs.primary.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS samples, primary only
 |`independent-specimens.wgswxs.primary-plus.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS and WXS samples, primary + non-primary when no primary sample is available
 |`independent-specimens.wgswxs.primary.tsv` | Analysis file | [`analyses/independent-samples`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/independent-samples) | Independent specimens list for WGS and WXS samples, primary only
-|`pbta-cnv-cnvkit.seg.gz` | PBTA data file | [Copy number variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-copy-number-variant-calling) | Somatic Copy Number Variant - CNVkit [SEG file](https://cnvkit.readthedocs.io/en/stable/fileformats.html#seg)
-|`pbta-cnv-controlfreec.tsv.gz` | PBTA data file | [Copy number variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-copy-number-variant-calling) | Somatic Copy Number Variant - TSV file that is a merge of [ControlFreeC `*_CNVs` files](http://boevalab.inf.ethz.ch/FREEC/tutorial.html#OUTPUT)
-|`pbta-fusion-arriba.tsv.gz` | PBTA data file | [Gene fusion detection](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-fusion-detection) | Fusion - [Arriba TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/arriba-tsv-header.md)
+|`pbta-cnv-cnvkit.seg.gz` | PBTA data file | [Copy number variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-copy-number-variant-calling); [Workflow](https://github.com/d3b-center/publication_workflows/blob/master/openPBTA/kfdrc_combined_somatic_wgs_cnv_wf.cwl) | Somatic Copy Number Variant - CNVkit [SEG file](https://cnvkit.readthedocs.io/en/stable/fileformats.html#seg)
+|`pbta-cnv-controlfreec.tsv.gz` | PBTA data file | [Copy number variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-copy-number-variant-calling); [Workflow](https://github.com/d3b-center/publication_workflows/blob/master/openPBTA/kfdrc_combined_somatic_wgs_cnv_wf.cwl) | Somatic Copy Number Variant - TSV file that is a merge of [ControlFreeC `*_CNVs` files](http://boevalab.inf.ethz.ch/FREEC/tutorial.html#OUTPUT)
+|`pbta-fusion-arriba.tsv.gz` | PBTA data file | [Gene fusion detection](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-fusion-detection); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Fusion - [Arriba TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/arriba-tsv-header.md)
 |`pbta-fusion-putative-oncogenic.tsv` | Analysis file | [`analyses/fusion_filtering`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/fusion_filtering) | Filtered and prioritized fusions 
-|`pbta-fusion-starfusion.tsv.gz` | PBTA data file | [Gene fusion detection](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-fusion-detection) | Fusion - [STARFusion TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/starfusion-tsv-header.md)
-|`pbta-gene-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM expected counts for poly-A samples (gene-level)
-|`pbta-gene-counts-rsem-expected_count.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - RSEM  expected counts for stranded samples (gene-level)
-|`pbta-gene-expression-kallisto.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - kallisto TPM for poly-A samples (transcript-level)
-|`pbta-gene-expression-kallisto.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - kallisto TPM for stranded samples (transcript-level)
+|`pbta-fusion-starfusion.tsv.gz` | PBTA data file | [Gene fusion detection](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-fusion-detection); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Fusion - [STARFusion TSV](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/starfusion-tsv-header.md)
+|`pbta-gene-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - RSEM expected counts for poly-A samples (gene-level)
+|`pbta-gene-counts-rsem-expected_count.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl)  | Gene expression - RSEM expected counts for stranded samples (gene-level)
+|`pbta-gene-expression-kallisto.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - kallisto TPM for poly-A samples (transcript-level)
+|`pbta-gene-expression-kallisto.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl)  | Gene expression - kallisto TPM for stranded samples (transcript-level)
 |`pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` | Analysis file | [`analyses/collapse-rnaseq`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/collapse-rnaseq) | Gene expression - RSEM FPKM for poly-A samples collapsed to gene symbol (gene-level)
 |`pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` | Analysis file | [`analyses/collapse-rnaseq`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/collapse-rnaseq) | Gene expression - RSEM FPKM for stranded samples collapsed to gene symbol (gene-level)
-|`pbta-gene-expression-rsem-fpkm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM FPKM for poly-A samples (gene-level)
-|`pbta-gene-expression-rsem-fpkm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - RSEM FPKM for stranded samples (gene-level)
-|`pbta-gene-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for poly-A samples (gene-level)
-|`pbta-gene-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation)  | Gene expression - RSEM TPM for stranded samples (gene-level)
+|`pbta-gene-expression-rsem-fpkm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - RSEM FPKM for poly-A samples (gene-level)
+|`pbta-gene-expression-rsem-fpkm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl)  | Gene expression - RSEM FPKM for stranded samples (gene-level)
+|`pbta-gene-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - RSEM TPM for poly-A samples (gene-level)
+|`pbta-gene-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl)  | Gene expression - RSEM TPM for stranded samples (gene-level)
 |`pbta-histologies.tsv` | PBTA data file | [Clinical data harmonization](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization) | Harmonized clinical metadata file (see data dictionary [here](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#clinical-data-harmonization))
-|`pbta-isoform-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM expected counts for poly-A samples (transcript-level)
-|`pbta-isoform-counts-rsem-expected_count.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) |Gene expression - RSEM expected counts for stranded samples (transcript-level)
-|`pbta-isoform-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for poly-A samples (transcript-level)
-|`pbta-isoform-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation) | Gene expression - RSEM TPM for stranded samples (transcript-level)
+|`pbta-isoform-counts-rsem-expected_count.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - RSEM expected counts for poly-A samples (transcript-level)
+|`pbta-isoform-counts-rsem-expected_count.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - RSEM expected counts for stranded samples (transcript-level)
+|`pbta-isoform-expression-rsem-tpm.polya.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - RSEM TPM for poly-A samples (transcript-level)
+|`pbta-isoform-expression-rsem-tpm.stranded.rds` | PBTA data file | [Gene expression abundance estimation](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#gene-expression-abundance-estimation); [Workflow](https://github.com/kids-first/kf-rnaseq-workflow/blob/master/workflow/kfdrc_RNAseq_workflow.cwl) | Gene expression - RSEM TPM for stranded samples (transcript-level)
 |`pbta-snv-consensus-mutation-tmb.tsv` | Analysis file | [`analyses/snv-callers`](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/analyses/snv-callers/) | Tumor mutation burden statistics calculated from consensus SNV, using Strelka2 counts and BED window sizes
 |`pbta-snv-consensus-mutation.maf.tsv.gz` | Analysis file | [`analyses/snv-callers`](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/analyses/snv-callers/)  | Consensus calls for SNVs and small indels; columns in the included file are derived from the Strelka2.
-|`pbta-snv-lancet.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - Lancet [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-snv-mutect2.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - Mutect2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-snv-strelka2.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - Strelka2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-snv-vardict.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling) | Somatic SNV - VarDict [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
-|`pbta-sv-manta.tsv.gz`| PBTA data file | [Structural variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-structural-variant-calling) | Somatic Structural Variant - Manta output, annotated with AnnotSV
+|`pbta-snv-lancet.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling); [Workflow](https://github.com/d3b-center/publication_workflows/blob/master/openPBTA/kfdrc-lancet-wf.cwl) | Somatic SNV - Lancet [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-mutect2.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling); [Workflow](https://github.com/d3b-center/publication_workflows/blob/master/openPBTA/kfdrc_strelka2_mutect2_manta_workflow.cwl) | Somatic SNV - Mutect2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-strelka2.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling); [Workflow](https://github.com/d3b-center/publication_workflows/blob/master/openPBTA/kfdrc_strelka2_mutect2_manta_workflow.cwl) | Somatic SNV - Strelka2 [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-snv-vardict.vep.maf.gz` | PBTA data file | [Somatic mutation calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-mutation-calling); [Workflow](https://github.com/d3b-center/publication_workflows/blob/master/openPBTA/kfdrc-vardict-wf.cwl) | Somatic SNV - VarDict [annotated MAF file](https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/master/doc/format/vep-maf.md)
+|`pbta-sv-manta.tsv.gz`| PBTA data file | [Structural variant calling](https://github.com/AlexsLemonade/OpenPBTA-manuscript/blob/master/content/03.methods.md#somatic-structural-variant-calling); [Workflow](https://github.com/d3b-center/publication_workflows/blob/master/openPBTA/kfdrc_strelka2_mutect2_manta_workflow.cwl) | Somatic Structural Variant - Manta output, annotated with AnnotSV