diff --git a/analyses/create-subset-files/01-get_biospecimen_identifiers.R b/analyses/create-subset-files/01-get_biospecimen_identifiers.R index f2db7e856d..65559133e8 100644 --- a/analyses/create-subset-files/01-get_biospecimen_identifiers.R +++ b/analyses/create-subset-files/01-get_biospecimen_identifiers.R @@ -57,7 +57,7 @@ get_biospecimen_ids <- function(filename, id_mapping_df) { # 'Tumor_Sample_Barcode' # if the files have consensus in the name, the first line of the file does # not contain MAF version information - if (grepl("consensus", filename)) { + if (grepl("consensus|hotspots", filename)) { snv_file <- data.table::fread(filename, data.table = FALSE) } else { snv_file <- data.table::fread(filename, diff --git a/analyses/create-subset-files/02-subset_files.R b/analyses/create-subset-files/02-subset_files.R index 5f366b49dd..b5f7415772 100644 --- a/analyses/create-subset-files/02-subset_files.R +++ b/analyses/create-subset-files/02-subset_files.R @@ -69,7 +69,7 @@ subset_files <- function(filename, biospecimen_ids, output_directory) { # filtering strategy depends on the file type, mostly because how the sample # IDs change based on the file type -- that's why this logic is required if (grepl("pbta-snv", filename)) { - if (grepl("consensus-mutation", filename)) { + if (grepl("consensus-mutation|hotspots", filename)) { snv_file <- data.table::fread(filename, data.table = FALSE) snv_file %>% dplyr::filter(Tumor_Sample_Barcode %in% biospecimen_ids) %>% diff --git a/analyses/create-subset-files/biospecimen_ids_for_subset.RDS b/analyses/create-subset-files/biospecimen_ids_for_subset.RDS index 25d0c2a3c5..947a3ee3e4 100644 Binary files a/analyses/create-subset-files/biospecimen_ids_for_subset.RDS and b/analyses/create-subset-files/biospecimen_ids_for_subset.RDS differ diff --git a/analyses/create-subset-files/create_subset_files.sh b/analyses/create-subset-files/create_subset_files.sh index eff24f3fa6..6bf64ad3b2 100755 --- a/analyses/create-subset-files/create_subset_files.sh +++ b/analyses/create-subset-files/create_subset_files.sh @@ -7,7 +7,7 @@ set -o pipefail # Set defaults for release and biospecimen file name BIOSPECIMEN_FILE=${BIOSPECIMEN_FILE:-biospecimen_ids_for_subset.RDS} -RELEASE=${RELEASE:-release-v19-20210423} +RELEASE=${RELEASE:-release-v20-20210726} NUM_MATCHED=${NUM_MATCHED:-15} # This option controls whether or not the two larger MAF files are skipped as @@ -87,6 +87,9 @@ cp $FULL_DIRECTORY/pbta-mend* $SUBSET_DIRECTORY # fusion summary files cp $FULL_DIRECTORY/fusion_summary* $SUBSET_DIRECTORY +# MB pathology subtypes +cp $FULL_DIRECTORY/pbta-mb-pathology-subtypes.tsv $SUBSET_DIRECTORY + # if the md5sum.txt file already exists, get rid of it cd $SUBSET_DIRECTORY rm -f md5sum.txt