From 702b120e7024fb179f6112540da55f684124d282 Mon Sep 17 00:00:00 2001
From: BillClifford
Date: Mon, 26 Jun 2023 11:53:46 -0700
Subject: [PATCH] ~More completion of v15 ETL, cleanup

---
 .../publish_idc_pdp_staging_datasets.py | 4 +-
 .../copy_apollo_blobs.py | 48 ---
 .../copy_pathology_blobs.py | 49 ---
 .../pathology_collections.py | 17 -
 gcs/{copy_bucket_mp => }/copy_bucket_mp.py | 2 -
 ..._to_staging.py => copy_premerge_to_dev.py} | 0
 gcs/misc/README.md | 1 -
 .../delete_idc_dev_defaced_pathology.py | 43 ---
 .../delete_list_of_blobs.py | 95 -----
 gcs/misc/move_blobs_between_buckets.py | 178 ---------
 gcs/obsolete/copy_blobs.py | 265 -------------
 .../collection_list.txt | 1 -
 .../copy_collections_bq.py | 268 -------------
 .../copy_collections_nlst.py | 75 ----
 .../copy_collections_psql.py | 263 -------------
 .../copy_collections_v5.py | 66 ----
 .../copy_prestaging_to_staging.py | 124 ------
 .../copy_staging_buckets.py | 94 -----
 .../copy_staging_buckets_v5.py | 38 --
 .../README.md | 2 -
 .../depopulate_collections_from_bucket.py | 346 -----------------
 ...ced_collections_from_open_bucket.v5.dev.py | 70 ----
 ...ced_collections_from_open_bucket.v5.pdp.py | 70 ----
 ...s_from_dicomstore_staging_bucket.v5.dev.py | 81 ----
 ...ted_collections_from_open_bucket.v5.dev.py | 69 ----
 ...ted_collections_from_open_bucket.v5.pdp.py | 69 ----
 .../depopulate_version_from_bucket.v5.dev.py | 89 -----
 .../depopulate_versions_from_bucket.py | 243 ------------
 gcs/obsolete/empty_and_delete_bucket.py | 51 ---
 gcs/obsolete/empty_bucket.py | 48 ---
 gcs/obsolete/empty_idc_dev_etl_v2_buckets.py | 107 ------
 .../move_collection/copy_collection.py | 0
 .../move_collection/delete_collection.py | 0
 .../move_apollo/copy_apollo_collections.py | 0
 .../move_apollo/delete_apollo_collections.py | 0
 .../copy_cptac_cm_and_lscc_collection.py | 0
 .../delete_collection.py | 0
 .../copy_prostate_diagnosis_collection.py | 0
 .../README.md | 2 -
 .../populate_bucket_with_collections.py | 361 ------------------
 .../populate_bucket_with_cr_collections.v5.py | 69 ----
 ..._bucket_with_defaced_collections.v5.dev.py | 70 ----
 ...bucket_with_defaced_collections.v5.prod.py | 70 ----
 ...ate_bucket_with_excluded_collections.v5.py | 70 ----
 ...opulate_bucket_with_open_collections.v5.py | 77 ----
 ...bucket_with_redacted_collections.v5.dev.py | 70 ----
 ...ucket_with_redacted_collections.v5.prod.py | 70 ----
 gcs/obsolete/validate_opens_bucket.py | 66 ----
 .../copy_staging_buckets_to_public_buckets.py | 49 +++
 .../validate_bucket/validate_bucket_mp.py | 0
 .../validate_bucket/validate_idc_dev_cr.py | 0
 .../validate_idc_dev_defaced.py | 0
 .../validate_idc_dev_excluded.py | 0
 .../validate_bucket/validate_idc_dev_open.py | 0
 .../validate_idc_dev_redacted.py | 0
 .../validate_bucket/validate_idc_open_cr.py | 0
 .../validate_idc_open_cr_staging.py | 0
 .../validate_bucket/validate_idc_open_idc.py | 0
 .../validate_bucket/validate_idc_open_idc1.py | 0
 .../validate_idc_open_idc1_staging.py | 0
 .../validate_public_datasets_idc.py | 0
 .../validate_public_datasets_idc_staging.py | 0
 validation/compare_hashes.py | 2 +-
 63 files changed, 52 insertions(+), 3800 deletions(-)
 delete mode 100644 gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py
 delete mode 100644 gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py
 delete mode 100644 gcs/copy_blobs_using_BQ_query/pathology_collections.py
 rename gcs/{copy_bucket_mp => }/copy_bucket_mp.py (99%)
 rename gcs/copy_premerge_to_dev_buckets/{copy_premerge_to_staging.py => copy_premerge_to_dev.py} (100%)
 delete mode 100644 gcs/misc/README.md
 delete mode 100644 gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py
 delete mode 100644 gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py
 delete mode 100644 gcs/misc/move_blobs_between_buckets.py
 delete mode 100644 gcs/obsolete/copy_blobs.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/collection_list.txt
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py
 delete mode 100644 gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py
 delete mode 100644 gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py
 delete mode 100644 gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/README.md
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py
 delete mode 100644 gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py
 delete mode 100644 gcs/obsolete/empty_and_delete_bucket.py
 delete mode 100644 gcs/obsolete/empty_bucket.py
 delete mode 100644 gcs/obsolete/empty_idc_dev_etl_v2_buckets.py
 rename gcs/{ => obsolete}/move_collection/copy_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/delete_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_apollo/copy_apollo_collections.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_apollo/delete_apollo_collections.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py (100%)
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/README.md
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py
 delete mode 100644 gcs/obsolete/validate_opens_bucket.py
 create mode 100644 gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_bucket_mp.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_cr.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_defaced.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_excluded.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_open.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_redacted.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_cr.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_cr_staging.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_idc.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_idc1.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_idc1_staging.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_public_datasets_idc.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_public_datasets_idc_staging.py (100%)

diff --git a/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py b/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py
index 63520ed..4b85190 100644
--- a/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py
+++ b/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py
@@ -33,7 +33,7 @@
 progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}')

 for src_dataset in (
-    'idc_v14',
-    'idc_v14_clinical',
+    f'idc_v{settings.CURRENT_VERSION}',
+    f'idc_v{settings.CURRENT_VERSION}_clinical',
 ):
     publish_dataset(args)
diff --git a/gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py b/gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py
deleted file mode 100644
index 6da1c0a..0000000
--- a/gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#
-# Copyright 2015-2021, Institute for Systems Biology
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# One time script to copy all APOLLOxx blobs from idc-dev-redacted to idc-dev-open
-import json
-import os
-import argparse
-
-from copy_blobs_mp import copy_all_blobs
-from utilities.logging_config import successlogger, progresslogger, errlogger
-
-# Copy the blobs that are new to a version from dev pre-staging buckets
-# to dev staging buckets.
-import settings - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=settings.CURRENT_VERSION, help='Version to work on') - parser.add_argument('--src_bucket', default="idc-dev-redacted") - parser.add_argument('--dst_bucket', default="idc-dev-open") - parser.add_argument('--batch', default=1000) - parser.add_argument('--processes', default=1) - args = parser.parse_args() - args.id = 0 # Default process ID - - progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}') - - query=f""" - SELECT DISTINCT concat(i_uuid,'.dcm') blob - FROM `idc-dev-etl.idc_v{args.version}_dev.all_joined` aj - WHERE collection_id LIKE 'APOLLO%' - """ - - copy_all_blobs(args, query) \ No newline at end of file diff --git a/gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py b/gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py deleted file mode 100644 index 308d19c..0000000 --- a/gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import json -import os -import argparse - -from copy_blobs_mp import copy_all_blobs -from pathology_collections import collection_list -from utilities.logging_config import successlogger, progresslogger, errlogger - -# Copy the blobs that are new to a version from dev pre-staging buckets -# to dev staging buckets. -import settings - - - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=12, help='Version to work on') - parser.add_argument('--src_bucket', default="idc-dev-open") - parser.add_argument('--dst_bucket', default="pathology_blobs_whc") - parser.add_argument('--batch', default=1000) - parser.add_argument('--processes', default=1) - args = parser.parse_args() - args.id = 0 # Default process ID - - progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}') - - query=f""" - select concat(i_uuid,'.dcm') blob - from `idc-dev-etl.idc_v{args.version}_dev.all_joined` aj - where idc_version={args.version} and i_source='path' - and collection_id in {tuple(collection_list)} """ - - copy_all_blobs(args, query) \ No newline at end of file diff --git a/gcs/copy_blobs_using_BQ_query/pathology_collections.py b/gcs/copy_blobs_using_BQ_query/pathology_collections.py deleted file mode 100644 index 780c49a..0000000 --- a/gcs/copy_blobs_using_BQ_query/pathology_collections.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -collection_list = ['CPTAC-LSCC', 'CPTAC-LSCC'] \ No newline at end of file diff --git a/gcs/copy_bucket_mp/copy_bucket_mp.py b/gcs/copy_bucket_mp.py similarity index 99% rename from gcs/copy_bucket_mp/copy_bucket_mp.py rename to gcs/copy_bucket_mp.py index fd18a46..55cb86d 100644 --- a/gcs/copy_bucket_mp/copy_bucket_mp.py +++ b/gcs/copy_bucket_mp.py @@ -90,8 +90,6 @@ def worker(input, args, dones): def copy_all_instances(args, dones): client = storage.Client() src_bucket = storage.Bucket(client, args.src_bucket) - - n=len(dones) progresslogger.info(f"{len(dones)} blobs previously copied") diff --git a/gcs/copy_premerge_to_dev_buckets/copy_premerge_to_staging.py b/gcs/copy_premerge_to_dev_buckets/copy_premerge_to_dev.py similarity index 100% rename from gcs/copy_premerge_to_dev_buckets/copy_premerge_to_staging.py rename to gcs/copy_premerge_to_dev_buckets/copy_premerge_to_dev.py diff --git a/gcs/misc/README.md b/gcs/misc/README.md deleted file mode 100644 index 055dae6..0000000 --- a/gcs/misc/README.md +++ /dev/null @@ -1 +0,0 @@ -Mostly replaced by newer scripts/ \ No newline at end of file diff --git a/gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py b/gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py deleted file mode 100644 index 4f73a0b..0000000 --- a/gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py +++ /dev/null @@ -1,43 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# One time script to delete CPTAC pathology from the idc-open-idc1 bucket -# It was previously moved from the idc-dev-defaced bucket to idc-dev-open -# and idc-open-pdpn-staging. 
-import argparse -from gcs.misc.delete_list_of_blobs.delete_list_of_blobs import del_all_instances -from google.cloud import bigquery -from utilities.logging_config import successlogger, progresslogger, errlogger -from python_settings import settings - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--bucket', default='idc-dev-defaced') - parser.add_argument('--processes', default=16, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--project', default='canceridc-data') - args = parser.parse_args() - - client = bigquery.Client() - query = f""" - SELECT distinct i_uuid FROM `idc-dev-etl.idc_v12_dev.all_joined_included` - where collection_id in ('CPTAC-CM', 'CPTAC-LSCC') - and i_source='idc' - and i_rev_idc_version<10 - order by i_uuid - """ - instances = [f'{row.i_uuid}.dcm' for row in client.query(query)] - del_all_instances (args, instances) diff --git a/gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py b/gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py deleted file mode 100644 index 0aab39f..0000000 --- a/gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py +++ /dev/null @@ -1,95 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from utilities.logging_config import successlogger, progresslogger, errlogger - -import time -from multiprocessing import Process, Queue -from google.cloud import storage -from google.api_core.exceptions import ServiceUnavailable, NotFound - -def delete_instances(args, client, bucket, blobs, n): - try: - # with client.batch(): - # for blob in blobs: - # bucket.blob(blob).delete() - # # bucket.blob(blob[0], generation=blob[1]).delete() - for blob in blobs: - bucket.blob(blob).delete() - # bucket.blob(blob[0], generation=blob[1]).delete() - successlogger.info(f'{blob}') - except ServiceUnavailable: - errlogger.error('p%s Delete %s blob %s failed', args.id, args.bucket, blob) - except NotFound: - errlogger.error('p%s Delete %s blobs % failed, not found', args.id, args.bucket, blob) - except Exception as exc: - errlogger.error('p%s Exception on %s blob %s: %s', args.id, args.bucket, blob, exc) - - -def worker(input, args): - client = storage.Client() - bucket = storage.Bucket(client, args.bucket) - for blobs, n in iter(input.get, 'STOP'): - delete_instances(args, client, bucket, blobs, n) - - -def del_all_instances(args, instance_list): - bucket = args.bucket - client = storage.Client() - bucket = storage.Bucket(client, args.bucket) - - dones = set(open(successlogger.handlers[0].baseFilename).read().splitlines()) - - num_processes = args.processes - processes = [] - - task_queue = Queue() - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args))) - processes[-1].start() - - - # Distribute the work across the task_queues - n = 0 - n=0 - # Submit args.batch size chunks to process - while instance_list: - some_instances= list(set(instance_list[0:args.batch]) - dones) - instance_list = instance_list[args.batch:] - if some_instances: - task_queue.put((some_instances,n)) - n += args.batch - progresslogger.info('Primary work distribution complete; {} blobs'.format(n)) - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - - # Wait for process to terminate - for process in processes: - # print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (n)/delta - progresslogger.info(f'Completed bucket {args.bucket}, {rate} instances/sec, {num_processes} processes') diff --git a/gcs/misc/move_blobs_between_buckets.py b/gcs/misc/move_blobs_between_buckets.py deleted file mode 100644 index f41bd2a..0000000 --- a/gcs/misc/move_blobs_between_buckets.py +++ /dev/null @@ -1,178 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import json -import os -import argparse -import logging -from logging import INFO -from google.cloud import bigquery, storage -import time -from multiprocessing import Process, Queue -from utilities.logging_config import successlogger, progresslogger, errlogger - -# Copy the blobs that are new to a version from dev pre-staging buckets -# to dev staging buckets. -import settings - - -# Get a the dev_url and pub_url of all new instances. The dev_url is the url of the -# premerge bucket or staging bucket holding the new instance. The pub_url is the -# url of the bucket to which to copy it -def get_urls(args): - client = bigquery.Client() - query = f""" - SELECT - instance_uuid uuid - FROM - `idc-dev-etl.idc_v{args.version}_pub.auxiliary_metadata` - WHERE - instance_revised_idc_version = {args.version} - AND tcia_api_collection_id = '{args.collection}' - """ - # urls = list(client.query(query)) - query_job = client.query(query) # Make an API request. - query_job.result() # Wait for the query to complete. - destination = query_job.destination - destination = client.get_table(destination) - return destination - -def move_some_blobs(args, client, urls, n, dones): - done = 0 - copied = 0 - for blob_name in urls: - if not blob_name in dones: - src_bucket = client.bucket(args.src_bucket) - src_blob = src_bucket.blob(blob_name) - trg_bucket = client.bucket(args.trg_bucket) - trg_blob = trg_bucket.blob(blob_name) - for attempt in range(3): - try: - rewrite_token = False - while True: - rewrite_token, bytes_rewritten, bytes_to_rewrite = trg_blob.rewrite( - src_blob, token=rewrite_token - ) - if not rewrite_token: - break - src_blob.delete() - - successlogger.info('%s', blob_name) - progresslogger.info(f'p{args.id}: {done+n}of{len(urls)+n}: {args.src_bucket}/{blob_name} --> {args.trg_bucket}/{blob_name}') - break - except Exception as exc: - errlogger.error('p%s: Blob: %s, attempt: %s; %s', args.id, blob_name, attempt, exc) - - done += 1 - if copied == 0: - progresslogger.info(f'p{args.id}: Skipped {n}:{n+done-1}') - - -def worker(input, args, dones): - # proglogger.info('p%s: Worker starting: args: %s', args.id, args ) - # print(f'p{args.id}: Worker starting: args: {args}') - - # RETRIES = 3 - # try: - # dones = set(open(f'{successlogger.handlers[0].baseFilename}').read().splitlines()) - # except: - # dones = [] - - client = storage.Client() - for urls, n in iter(input.get, 'STOP'): - move_some_blobs(args, client, urls, n, dones) - - -def copy_all_blobs(args): - bq_client = bigquery.Client() - destination = get_urls(args) - - num_processes = args.processes - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - strt = time.time() - dones = set(open(f'{successlogger.handlers[0].baseFilename}').read().splitlines()) - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, dones))) - processes[-1].start() - - # Distribute the work across the task_queues - n = 0 - for page in bq_client.list_rows(destination, page_size=args.batch).pages: - uuids = [f'{row.uuid}.dcm' for row in page] - task_queue.put((uuids, n)) - # print(f'Queued {n}:{n+args.batch-1}') - n += page.num_items - print('Primary work distribution complete; {} blobs'.format(n)) - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, 
{process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (n)/delta - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=settings.CURRENT_VERSION, help='Version to work on') - # parser.add_argument('--log_dir', default=f'{settings.LOGGING_BASE}/{settings.BASE_NAME}') - parser.add_argument('--batch', default=100) - parser.add_argument('--processes', default=16) - parser.add_argument('--collection', default = 'CPTAC-LSCC') - parser.add_argument('--src_bucket', default = 'idc-open-idc1') - parser.add_argument('--trg_bucket', default = 'idc-open-pdp-staging') - args = parser.parse_args() - args.id = 0 # Default process ID - - progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}') - - # if not os.path.exists(settings.LOGGING_BASE): - # os.mkdir(settings.LOGGING_BASE) - # if not os.path.exists(args.log_dir): - # os.mkdir(args.log_dir) - # - # successlogger = logging.getLogger('root.success') - # successlogger.setLevel(INFO) - # for hdlr in successlogger.handlers[:]: - # successlogger.removeHandler(hdlr) - # success_fh = logging.FileHandler('{}/success.log'.format(args.log_dir)) - # successlogger.addHandler(success_fh) - # successformatter = logging.Formatter('%(message)s') - # success_fh.setFormatter(successformatter) - # - # errlogger = logging.getLogger('root.err') - # for hdlr in errlogger.handlers[:]: - # errlogger.removeHandler(hdlr) - # err_fh = logging.FileHandler('{}/error.log'.format(args.log_dir)) - # errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - # errlogger.addHandler(err_fh) - # err_fh.setFormatter(errformatter) - - - copy_all_blobs(args) \ No newline at end of file diff --git a/gcs/obsolete/copy_blobs.py b/gcs/obsolete/copy_blobs.py deleted file mode 100644 index 5157993..0000000 --- a/gcs/obsolete/copy_blobs.py +++ /dev/null @@ -1,265 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy all blobs named in the DB to some bucket. -# This is specifically to copy blobs from the dev bucket -# to the open bucket. -# Since we multiprocess by collection, this depends on the -# a table that is the join of the version, collection,..., instance tables. 
- -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage - - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured -import psycopg2 -from psycopg2.extras import DictCursor - -TRIES=3 - -def val_collection(cur, args, dones, collection_index, tcia_api_collection_id): - if not tcia_api_collection_id in dones: - - # src_client = storage.Client(project=args.src_project) - # dst_client = storage.Client(project=args.dst_project) - # src_bucket = src_client.bucket(args.src_bucket) - # dst_bucket = dst_client.bucket(args.dst_bucket, user_project=args.dst_project) - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - n = 1 - - try: - done_instances = set(open(f'./logs/cb_{tcia_api_collection_id}_success.log').read().splitlines()) - except: - done_instances = [] - - increment = 5000 - query= f""" - SELECT * - FROM {args.all_table} - WHERE tcia_api_collection_id = '{tcia_api_collection_id}' - order by sop_instance_uid - """ - cur.execute(query) - rowcount=cur.rowcount - successes = open(f'./logs/cb_{tcia_api_collection_id}_success.log', 'a') - failures = open(f'./logs/cb_{tcia_api_collection_id}_failures.log', 'a') - failure_count=0 - while True: - rows = cur.fetchmany(increment) - if len(rows) == 0: - break - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row["instance_uuid"]}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - blob_copy = src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, tcia_api_collection_id, blob_name) - successes.write(f'{blob_name}\n') - break - except Exception as exc: - errlogger.error('%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, tcia_api_collection_id, - blob_name, retries, exc) - if retries == TRIES: - failures.write(f'{blob_name}; {exc}\n') - failure_count += 1 - break - time.sleep(retries) - retries += 1 - else: - if n % 10000 == 0: - rootlogger.info('%s %s: %s: skipping blob %s ', args.id, index, tcia_api_collection_id, blob_name) - n += 1 - - if failure_count == 0: - # with open(args.dones, 'a') as f: - # f.write(f"{tcia_api_collection_id}\n") - donelogger.info('%s', tcia_api_collection_id) - rootlogger.info('%s: Completed collection %s ', args.id, tcia_api_collection_id) - else: - errlogger.error('%s: Failed collection %s; %s failures ', args.id, tcia_api_collection_id, failure_count) - - else: - rootlogger.info("p%s: Collection %s, %s, previously built", args.id, tcia_api_collection_id, collection_index) - - - -def worker(input, output, args, dones): - rootlogger.debug('p%s: Worker starting: args: %s', args.id, args) - conn = psycopg2.connect(dbname=settings.DATABASE_NAME, user=settings.DATABASE_USERNAME, - password=settings.DATABASE_PASSWORD, host=settings.DATABASE_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - - for more_args in iter(input.get, 'STOP'): - validated = 0 - for attempt in range(TRIES): - try: - collection_index, tcia_api_collection_id = more_args - # copy_collection(args, dones, collection_index, tcia_api_collection_id) - val_collection(cur, args, dones, collection_index, 
tcia_api_collection_id) - break - except Exception as exc: - errlogger.error("p%s, exception %s; reattempt %s on collection %s", args.id, exc, attempt, tcia_api_collection_id) - - - if attempt == TRIES: - errlogger.error("p%s, Failed to process collection: %s", args.id, tcia_api_collection_id) - - output.put((tcia_api_collection_id)) - -def copy_collections(cur, args, version): - # Session = sessionmaker(bind= sql_engine) - # version = version_is_done(sess, args.version) - try: - skips = open(args.skips).read().splitlines() - except: - skips = [] - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - begin = time.time() - cur.execute(""" - SELECT * FROM collection - WHERE version_id = (%s)""", (version['id'],)) - collections = cur.fetchall() - - rootlogger.info("Version %s; %s collections", version['idc_version_number'], len(collections)) - if args.processes == 0: - args.id=0 - for collection in collections: - if not collection['tcia_api_collection_id'] in skips: - collection_index = f'{collections.index(collection)+1} of {len(collections)}' - val_collection(cur, args, dones, collection_index, collection['tcia_api_collection_id']) - - else: - processes = [] - # Create queues - task_queue = Queue() - done_queue = Queue() - - # List of patients enqueued - enqueued_collections = [] - - # Start worker processes - for process in range(args.processes): - args.id = process + 1 - processes.append( - Process(target=worker, args=(task_queue, done_queue, args, dones))) - processes[-1].start() - - # Enqueue each patient in the the task queue - for collection in collections: - if not collection['tcia_api_collection_id'] in skips: - collection_index = f'{collections.index(collection) + 1} of {len(collections)}' - task_queue.put((collection_index, collection['tcia_api_collection_id'])) - enqueued_collections.append(collection['tcia_api_collection_id']) - - # Collect the results for each patient - try: - while not enqueued_collections == []: - # Timeout if waiting too long - tcia_api_collection_id = done_queue.get(True) - enqueued_collections.remove(tcia_api_collection_id) - - # Tell child processes to stop - for process in processes: - task_queue.put('STOP') - - # Wait for them to stop - for process in processes: - process.join() - - duration = str(timedelta(seconds=(time.time() - begin))) - rootlogger.info("Collection %s, %s, completed in %s", collection['tcia_api_collection_id'], collection_index, - duration) - - - except Empty as e: - errlogger.error("Exception copy_collections__obsolete ") - for process in processes: - process.terminate() - process.join() - duration = str(timedelta(seconds=(time.time() - begin))) - rootlogger.info("Collection copying NOT completed") - - -def precopy(args): - conn = psycopg2.connect(dbname=settings.DATABASE_NAME, user=settings.DATABASE_USERNAME, - password=settings.DATABASE_PASSWORD, host=settings.DATABASE_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - cur.execute(""" - SELECT * - FROM version - WHERE idc_version_number = (%s)""", (args.version,)) - - version = cur.fetchone() - copy_collections(cur, args, version) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=2, help='Next version to generate') - parser.add_argument('--src_bucket', default='idc_dev', help='Bucket to validate') - parser.add_argument('--dst_bucket', default='idc-open', help='Bucket to validate') - parser.add_argument('--all_table', default='all_v2') - parser.add_argument('--processes', 
default=16, help="Number of concurrent processes") - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='canceridc-data') - parser.add_argument('--skips', default='./logs/copy_blobs_skips.log' ) - parser.add_argument('--dones', default='./logs/copy_blobs__dones.log' ) - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_blobs_log.log'.format(os.environ['PWD'])) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - donelogger = logging.getLogger('done') - done_fh = logging.FileHandler(args.dones) - doneformatter = logging.Formatter('%(message)s') - donelogger.addHandler(done_fh) - done_fh.setFormatter(doneformatter) - donelogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - err_fh = logging.FileHandler('{}/logs/copy_blobs_err.log'.format(os.environ['PWD'])) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - precopy(args) diff --git a/gcs/obsolete/copy_collections__obsolete/collection_list.txt b/gcs/obsolete/copy_collections__obsolete/collection_list.txt deleted file mode 100644 index 8e6c556..0000000 --- a/gcs/obsolete/copy_collections__obsolete/collection_list.txt +++ /dev/null @@ -1 +0,0 @@ -NLST \ No newline at end of file diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py deleted file mode 100644 index e5e34df..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py +++ /dev/null @@ -1,268 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all included collections (not in the excluded_collections table) to another bucket. -This is generally used to populate a bucket that can then be imported -into a DICOM store. 
-""" - -import argparse -import os -import queue -import time -from subprocess import run, PIPE -import logging -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage, bigquery - - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured -import psycopg2 -from psycopg2.extras import DictCursor - -TRIES=3 - -# Get all collections in some version that are not excluded -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.* - FROM `{args.src_project}.{args.bqdataset_name}.{args.bq_collection_table}` as c - LEFT JOIN `{args.src_project}.{args.bqdataset_name}.{args.bq_excluded_collections}` as ex - ON LOWER(c.collection_id) = LOWER(ex.tcia_api_collection_id) - WHERE ex.tcia_api_collection_id is NULL - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - return collection_ids - - -def copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - blob_copy = src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.info(f'{blob_name}') - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, args.collection, - blob_name, retries, exc) - break - time.sleep(retries) - retries += 1 - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - print(f'p{args.id}: Worker starting: args: {args}') - - conn = psycopg2.connect(dbname=args.db, user=settings.CLOUD_USERNAME, port=settings.CLOUD_PORT, - password=settings.CLOUD_PASSWORD, host=settings.CLOUD_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket) - # output.put(n) - - -def copy_all_instances(args, query): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/cc_{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. 
- destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - rowcount = destination.num_rows - print(f'Copying collection {args.collection}; {rowcount} instances') - - num_processes = max(1,min(args.processes, int(rowcount/increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - # List of patients enqueued - enqueued_batches = [] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n-1)] - if len(rows) == 0: - break - task_queue.put((rows, n, rowcount)) - # task_queues[q%num_processes].put((rows, n, rowcount)) - enqueued_batches.append(n) - # print(f'Enqueue {n} on queue {q%num_processes}') - n += increment - print('Work distribution complete') - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - # print(f'Stop queue {i}') - - # # Wait until all work is complete - # q = 0 - # while not enqueued_batches == []: - # # Timeout if waiting too long - # try: - # results = done_queues[q%num_processes].get(timeout=1) - # enqueued_batches.remove(results) - # except queue.Empty: - # pass - # q += 1 - # - # Close all the queues - # for q in task_queues: - # q.close() - # for q in done_queues: - # q.close() - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - # if process.is_alive(): - # rootlogger.info('Collection: %s, terminating process %s',args.collection, process.name) - # process.kill() - # print(f'Joined process {process.name.split("-")[-1]}, exitcode: {process.exitcode}') - - - delta = time.time() - strt - rate = rowcount/delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - - -def precopy(args): - client = bigquery.Client() - collections = get_collections_in_version(args) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/cc_{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/cc_{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN `idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - - copy_all_instances(args, query) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat('{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size==0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py deleted file mode 100644 index ea18038..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py +++ /dev/null @@ -1,75 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs named in some collection from the dev bucket to some other bucket. -This is/was used, among other things, for the initial population of the idc_gch_staging -bucket from which Google Healthcare ingests our data. 
-""" - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage - - -from python_settings import settings -import settings as etl_settings - -# settings.configure(etl_settings) -# assert settings.configured -# import psycopg2 -# from psycopg2.extras import DictCursor -from gcs.copy_collections__obsolete.copy_collections_bq import precopy - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=4, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--src_bucket', default='idc_v5_nlst') - parser.add_argument('--dst_bucket', default='idc_dev') - parser.add_argument('--processes', default=96, help="Number of concurrent processes") - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default='/mnt/disks/idc-etl/logs/copy_collections__obsolete') - parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default='./logs/dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_collections__obsolete.log'.format(os.environ['PWD'])) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py deleted file mode 100644 index e2b76b7..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py +++ /dev/null @@ -1,263 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all included collections (not in the excluded_collections table) to another bucket. -This is generally used to populate a bucket that can then be imported -into a DICOM store. 
-""" - -import argparse -import os -import queue -import time -from subprocess import run, PIPE -import logging -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage, bigquery - - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured -import psycopg2 -from psycopg2.extras import DictCursor - -TRIES=3 - -# Get all collections in some version that are not excluded -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.* - FROM `{args.src_project}.{args.bqdataset_name}.{args.bq_collection_table}` as c - LEFT JOIN `{args.src_project}.{args.bqdataset_name}.{args.bq_excluded_collections}` as ex - ON LOWER(c.collection_id) = LOWER(ex.tcia_api_collection_id) - WHERE ex.tcia_api_collection_id is NULL - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - return collection_ids - - -def copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row["uuid"]}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - blob_copy = src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.info(f'{blob_name}') - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, args.collection, - blob_name, retries, exc) - break - time.sleep(retries) - retries += 1 - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - print(f'p{args.id}: Worker starting: args: {args}') - - conn = psycopg2.connect(dbname=args.db, user=settings.CLOUD_USERNAME, port=settings.CLOUD_PORT, - password=settings.CLOUD_PASSWORD, host=settings.CLOUD_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket) - # output.put(n) - - -def copy_all_instances(args, cur, query): - - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/cc_{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - increment = args.batch - cur.execute(query) - rowcount = cur.rowcount - print(f'Copying collection {args.collection}; {rowcount} instances') - - strt = time.time() - num_processes = max(1,min(args.processes, int(rowcount/increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in 
range(num_processes)] - - # List of patients enqueued - enqueued_batches = [] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - q=0 - while True: - rows = cur.fetchmany(increment) - if len(rows) == 0: - break - task_queue.put((rows, n, rowcount)) - # task_queues[q%num_processes].put((rows, n, rowcount)) - enqueued_batches.append(n) - # print(f'Enqueue {n} on queue {q%num_processes}') - n += increment - q+=1 - print('Work distribution complete') - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - # print(f'Stop queue {i}') - - # # Wait until all work is complete - # q = 0 - # while not enqueued_batches == []: - # # Timeout if waiting too long - # try: - # results = done_queues[q%num_processes].get(timeout=1) - # enqueued_batches.remove(results) - # except queue.Empty: - # pass - # q += 1 - # - # Close all the queues - # for q in task_queues: - # q.close() - # for q in done_queues: - # q.close() - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - # if process.is_alive(): - # rootlogger.info('Collection: %s, terminating process %s',args.collection, process.name) - # process.kill() - # print(f'Joined process {process.name.split("-")[-1]}, exitcode: {process.exitcode}') - - - delta = time.time() - strt - rate = rowcount/delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - - -def precopy(args): - conn = psycopg2.connect(dbname=args.db, user=settings.CLOUD_USERNAME, port=settings.CLOUD_PORT, - password=settings.CLOUD_PASSWORD, host=settings.CLOUD_HOST) - - # Get excluded collections - - # collections = open(args.collection_list).read().splitlines() - - collections = get_collections_in_version(args) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - with conn: - if os.path.exists('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/cc_{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/cc_{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - # Query to get the instances in the collection - with conn.cursor(cursor_factory=DictCursor) as cur: - query = f""" - SELECT i.uuid - FROM collection as c - JOIN patient as p - ON c.collection_id = p.collection_id - JOIN study as st - ON p.submitter_case_id = st.submitter_case_id - JOIN series as se - ON st.study_instance_uid = se.study_instance_uid - JOIN instance as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - copy_all_instances(args, cur, query) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat('{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size==0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py deleted file mode 100644 index 1038682..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py +++ /dev/null @@ -1,66 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs named in some collections from the dev bucket to some other bucket. -This is/was used, among other things, for the initial population of the idc_gch_staging -bucket from which Google Healthcare ingests our data. 
-""" - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.copy_collections__obsolete.copy_collections_bq import precopy - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collection_table', default='collection') - parser.add_argument('--bq_excluded_collections', default='excluded_collections') - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc_dev_v{args.version}_dicomstore_staging') - parser.add_argument('--processes', default=96, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default='/mnt/disks/idc-etl/logs/copy_collections_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/copy_collections_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_collections__obsolete.log'.format(os.environ['PWD'])) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py b/gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py deleted file mode 100644 index dcadfbd..0000000 --- a/gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py +++ /dev/null @@ -1,124 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy pre-staging buckets populated by ingestion to staging buckets: -# Ingestion copies data into prestaging buckets named by version, -# collection, and source e.g. idc_v8_path_tcga_brca. The data in these buckets must be -# copied to one of the idc-dev-etl staging buckets: -# idc-dev-open, idc-dev-cr, idc-dev-defaced, idc-dev-redacted, idc-dev-excluded. 
- -import os -import argparse -import logging -from logging import INFO - -from idc.models import Base, Collection, CR_Collections, Defaced_Collections, Excluded_Collections, Open_Collections, Redacted_Collections -import settings as etl_settings -from python_settings import settings -settings.configure(etl_settings) -from google.cloud import storage -from gcs.copy_bucket_mp.copy_bucket_mp import copy_all_instances - -from sqlalchemy import create_engine -from sqlalchemy_utils import register_composites -from sqlalchemy.orm import Session - - -def get_collection_groups(sess): - dev_staging_buckets = {} - pub_staging_buckets = {} - collections = sess.query(CR_Collections.tcia_api_collection_id, CR_Collections.dev_url, CR_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - collections = sess.query(Defaced_Collections.tcia_api_collection_id, Defaced_Collections.dev_url, Defaced_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - collections = sess.query(Excluded_Collections.tcia_api_collection_id) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = 'idc-dev-excluded' - collections = sess.query(Open_Collections.tcia_api_collection_id, Open_Collections.dev_url, Open_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - collections = sess.query(Redacted_Collections.tcia_api_collection_id, Redacted_Collections.dev_url, Redacted_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - return dev_staging_buckets, pub_staging_buckets - - -def copy_prestaging_to_staging(args, prestaging_bucket, staging_bucket): - print(f'Copying {prestaging_bucket} to {staging_bucket}') - args.src_bucket = prestaging_bucket - args.dst_bucket = staging_bucket - copy_all_instances(args) - - -def copy_dev_buckets(args): - sql_uri = f'postgresql+psycopg2://{settings.CLOUD_USERNAME}:{settings.CLOUD_PASSWORD}@{settings.CLOUD_HOST}:{settings.CLOUD_PORT}/{args.db}' - # sql_engine = create_engine(sql_uri, echo=True) # Use this to see the SQL being sent to PSQL - sql_engine = create_engine(sql_uri) - args.sql_uri = sql_uri # The subprocesses need this uri to create their own SQL engine - - # Create the tables if they do not already exist - Base.metadata.create_all(sql_engine) - - # Enable the underlying psycopg2 to deal with composites - conn = sql_engine.connect() - register_composites(conn) - - with Session(sql_engine) as sess: - dev_staging_buckets, pub_staging_buckets = get_collection_groups(sess) - pass - revised_collection_ids = sorted([row.collection_id for row in sess.query(Collection).filter(Collection.rev_idc_version == args.version).all()]) - for collection_id in revised_collection_ids: - prestaging_collection_id = collection_id.lower().replace('-','_').replace(' ','_') - prestaging_bucket = f"{args.prestaging_bucket_prefix}{prestaging_collection_id}" - staging_bucket = f'{args.staging_bucket_prefix}{dev_staging_buckets[collection_id]}' - 
copy_prestaging_to_staging(args, prestaging_bucket, staging_bucket) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=8, help='Version to work on') - parser.add_argument('--client', default=storage.Client()) - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v8', help='Database on which to operate') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--prestaging_bucket_prefix', default=f'idc_v{args.version}_', help='Copy instances here before forwarding to --staging_bucket') - parser.add_argument('--staging_bucket_prefix', default=f'', help='Copy instances here before forwarding to --staging_bucket') - parser.add_argument('--processes', default=8, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/copy_prestaging_to_staging_bucket_mp') - args = parser.parse_args() - args.id = 0 # Default process ID - - proglogger = logging.getLogger('root.prog') - prog_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/bucket.log') - progformatter = logging.Formatter('%(levelname)s:prog:%(message)s') - proglogger.addHandler(prog_fh) - prog_fh.setFormatter(progformatter) - proglogger.setLevel(INFO) - - successlogger = logging.getLogger('root.success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - copy_dev_buckets(args) \ No newline at end of file diff --git a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py b/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py deleted file mode 100644 index 545323a..0000000 --- a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy some set of BQ tables from one dataset to another. Used to populate public dataset -# Uses gsutil -m cp. Not continuable or performant. 
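# The bulk copy described above shells out to gsutil rather than using the GCS client
# library; a minimal sketch of that pattern, with placeholder bucket names, mirroring
# the copy_bucket() function below.
from subprocess import run

def copy_bucket_with_gsutil(src_bucket: str, dst_bucket: str) -> int:
    # 'gsutil -m cp' parallelizes the transfer but cannot resume a partial copy.
    result = run(['gsutil', '-m', 'cp', f'gs://{src_bucket}/*', f'gs://{dst_bucket}'])
    return result.returncode  # non-zero indicates the copy failed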
-import argparse -import sys -import os -import logging -from logging import INFO, DEBUG -from subprocess import run - -from idc.models import Version, Collection, Patient, Study, Series, Instance, Retired, WSI_metadata, instance_source -from sqlalchemy import select,delete -from sqlalchemy.orm import Session - -from python_settings import settings -import settings as etl_settings -settings.configure(etl_settings) - -from sqlalchemy import create_engine -from sqlalchemy_utils import register_composites - -rootlogger = logging.getLogger('root') -errlogger = logging.getLogger('root.err') - - -def copy_bucket(args, src_bucket): - print("Copying {}".format(src_bucket), flush=True) - try: - result = run(['gsutil', '-m', 'cp', f'gs://{src_bucket}/*', - f'gs://{args.dst_bucket}']) - print(" {} copied, results: {}".format(src_bucket, result), flush=True) - if result.returncode: - errlogger.error('Copy %s failed: %s', src_bucket, result.stderr) - return {"bucket": src_bucket, "status": -1} - rootlogger.info('%s',src_bucket) - return 0 - except: - errlogger.error("Error copying {}: {},{},{}".format(src_bucket, sys.exc_info()[0],sys.exc_info()[1],sys.exc_info()[2]), file=sys.stdout, flush=True) - raise - - -def copy_buckets(args): - # rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_bucket_v{}_log.log'.format(os.environ['PWD'], args.version)) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - # errlogger = logging.getLogger('root.err') - err_fh = logging.FileHandler('{}/logs/copy_bucket_v{}_err.log'.format(os.environ['PWD'], args.version)) - errformatter = logging.Formatter('{%(pathname)s:%(lineno)d} %(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - rootlogger.debug('Args: %s', args) - - sql_uri = f'postgresql+psycopg2://{settings.CLOUD_USERNAME}:{settings.CLOUD_PASSWORD}@{settings.CLOUD_HOST}:{settings.CLOUD_PORT}/{args.db}' - # sql_engine = create_engine(sql_uri, echo=True) - sql_engine = create_engine(sql_uri) - args.sql_engine = sql_engine - - conn = sql_engine.connect() - register_composites(conn) - - dones = open('{}/logs/copy_bucket_v{}_log.log'.format(os.environ['PWD'], args.version)).read().splitlines() - - # Add a new Version with idc_version_number args.version, if it does not already exist - with Session(sql_engine) as sess: - idc_collections = [c.collection_id for c in sess.query(Collection).\ - filter(Collection.rev_idc_version==5 and Collection.done == True ).order_by('collection_id')] - for c in idc_collections: - src_bucket = f"{args.src_bucket_prefix}{c.lower().replace('-', '_').replace(' ', '_')}" - if not c in dones: - result = copy_bucket(args, src_bucket) - - - - diff --git a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py b/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py deleted file mode 100644 index 990c5f7..0000000 --- a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy some set of BQ tables from one dataset to another. Used to populate public dataset -import argparse -import sys -from gcs.copy_staging_buckets__obsolete.copy_staging_buckets import copy_buckets -import logging - -if __name__ == '__main__': - - parser =argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='IDC version for which to build the table') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}', help='Database to access') - parser.add_argument('--src_bucket_prefix', default=f'idc_v{args.version}_') - parser.add_argument('--dst_bucket', default=f'idc_dev', help='Destination BQ dataset') - - args = parser.parse_args() - print("{}".format(args), file=sys.stdout) - - rootlogger = logging.getLogger('root') - errlogger = logging.getLogger('root.err') - - copy_buckets(args) \ No newline at end of file diff --git a/gcs/obsolete/depopulate_collections_from_bucket/README.md b/gcs/obsolete/depopulate_collections_from_bucket/README.md deleted file mode 100644 index d7eaf69..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/README.md +++ /dev/null @@ -1,2 +0,0 @@ -The scripts in this directory were used to depopulate to-be-redacted collextions from various buckets. -The base script, depopulated_collections_from_bucket.py, might be useful at some future time, but the calling scripts are probably not useful. diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py deleted file mode 100644 index c1985b2..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py +++ /dev/null @@ -1,346 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Note: This script and scripts which call it should be restructured such -that the calling script passes in the list of collections to be copied. -""" - -""" -NOTE: The get_collections_in_version function is hardcoded for v5. -Also, probably shoud remove the test for 'where v1=False and v2=True' -in the SQL. That was a special case that should have been parameterized -""" - -""" -General purpose multiprocessing routine to the instances of some set of -collections from one bucket to another. - -A parameter, args.retired, controls whether retired instances (as listed -in the retired table) are also deleted. 
-In general args.retired should be False when depopulating a bucket that will -imported into a DICOM store, where only current instances are wanted. -args.retired should be True when populating a bucket that will be public -(even if having Limited access) as well as the dev counterparts of these -buckets. - -""" - -import argparse -import os -import logging - -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -import time -from multiprocessing import Process, Queue -from google.cloud import storage, bigquery -from google.cloud.exceptions import NotFound - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured - -TRIES = 3 -""" -args paramaters -bqdataset_name: bq datas et from which to access tables -bq_collections_table': BQ table listing group of collections to be populate -retired: Copy retired instances in collection if True -src_bucket: Bucket from which to delete blobs -processes: Number of concurrent processes -batch: Size of batch of blobs to be copied -src_project: Project of destination bucket -dst_project: Project of source bucket -log_dir: Directory in which some log files are kept. -dones: File listing collections that have been copied -""" - - -def get_collections_in_version(args): - client = bigquery.Client() - query = "" - if 'excluded_tables' in args.__dict__ and args.excluded_tables: - query = f""" - WITH ex as - ({query} - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{args.excluded_tables[0]}` - """ - for table in args.excluded_tables[1:]: - query = f""" - {query} - UNION ALL - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{table}` - """ - query = f""" - {query}) - """ - query = f""" - {query} - SELECT c.* - FROM `idc-dev-etl.idc_v5.collection` AS c - LEFT JOIN ex - ON c.collection_id =ex.tcia_api_collection_id - where ex.tcia_api_collection_id is NULL - - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - else: - query = f""" - SELECT c.* - FROM `idc-dev-etl.idc_v5.{args.bq_collections_table}` AS c - where v1=False and v2=True - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - -def delete_instances(args, rows, n, rowcount, done_instances, src_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - src_bucket.delete_blob(blob_name) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.info(f'{blob_name}') - break - except NotFound: - errlogger.error('p%s %s: %s: Failed, not found %s\n', args.id, - index, args.collection, - blob_name) - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: Failed %s\n: %s', args.id, - index, args.collection, - blob_name, exc) - break - retries += 1 - - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - # 
print(f'p{args.id}: Worker starting: args: {args}') - - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - delete_instances(args, rows, n, rowcount, done_instances, src_bucket) - # output.put(n) - - -def delete_all_instances(args): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - # We first delete the instances in the current IDC version, - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN `idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - prowcount = destination.num_rows - print(f'Copying collection {args.collection}; primary {prowcount} instances') - - num_processes = max(1, min(args.processes, int(prowcount / increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, prowcount)) - n += increment - print('Primary work distribution complete') - - # Next we delete retired instances - # Query to get the instances from the retired table - if args.retired: - query = f""" - SELECT r.instance_uuid - FROM `idc-dev-etl.idc_v{args.version}.retired` as r - WHERE r.collection_id = '{args.collection}' - ORDER by r.instance_uuid - """ - - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. 
If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - rrowcount = destination.num_rows - if rrowcount: - print(f'Copying retired {args.collection}; primary {rrowcount} instances') - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.instance_uuid for r in - client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, rrowcount)) - n += increment - print('Retired work distribution complete') - else: - print(f'No retired instances in collection {args.collection}') - else: - rrowcount = 0 - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (prowcount + rrowcount) / delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - -def predelete(args, collections=None): - client = bigquery.Client() - if not collections: - collections = get_collections_in_version(args) - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - st = os.stat('{}'.format(args.log_dir)) - os.chmod('{}'.format(args.log_dir), st.st_mode | 0o222) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - delete_all_instances(args) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat( - '{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size == 0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py deleted file mode 100644 index 19401c6..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--processes', default=50, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py deleted file mode 100644 index d0355e8..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-open-pdp-staging') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-pdp-staging') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py deleted file mode 100644 index bd5d633..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Delete collections from the idc-dev-v5-dicomstore-staging bucket. -This is/was used for generating a bucket for dicom store import tocreate a dicomstore -without redacted collections. -Buckets for import into a dicomstore do not have retired instances, thus retired -instances are not deleted. 
-""" - - -import argparse -import os -from google.cloud import bigquery -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.* - FROM `idc-dev-etl.idc_v5.{args.bq_collections_table}` AS c - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - - -if __name__ == '__main__': - group = 'redacted' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=False, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-v5-dicomstore-staging', help='Bucket from which to delete instances') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_dicomstore_staging_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_dicomstore_staging_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - collections = get_collections_in_version(args) - - predelete(args, collections) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py deleted file mode 100644 index 1f03a9b..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. 
-This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" -""" -Note: This script should be restructured such to pass in the list of collections to be copied. -""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'redacted' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py deleted file mode 100644 index 5c51902..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'redacted' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-open-pdp-staging') - parser.add_argument('--processes', default=96, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-pdp-staging') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py b/gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py deleted file mode 100644 index 856244c..0000000 --- a/gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py +++ /dev/null @@ -1,89 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import argparse -import os -from google.cloud import bigquery -import logging -from logging import INFO, DEBUG - -from gcs.depopulate_version_from_bucket.depopulate_versions_from_bucket import predelete - -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT o.tcia_api_collection_id as tcia_api_collection_id - FROM `idc-dev-etl.idc_v5.open_collections` AS o --- ORDER BY o.tcia_api_collection_id - UNION ALL - SELECT c.tcia_api_collection_id - FROM `idc-dev-etl.idc_v5.cr_collections` AS c --- ORDER BY c.tcia_api_collection_id - UNION ALL - SELECT d.tcia_api_collection_id - FROM `idc-dev-etl.idc_v5.defaced_collections` AS d --- UNION ALL --- SELECT d.tcia_api_collection_id --- FROM `idc-dev-etl.idc_v5.redacted_collections` AS d - ORDER BY tcia_api_collection_id --- ORDER BY d.tcia_api_collection_id - - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - - -if __name__ == '__main__': - bucket = 'idc-open' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bucket', default=f'{bucket}', help='Bucket from which to delete instances') - parser.add_argument('--processes', default=8, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--project', default='canceridc-data') - parser.add_argument('--deleted_version', default=3, help='Version whose instances are to be deleted') - args = parser.parse_args() - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_version_{args.deleted_version}_from_{bucket}') - parser.add_argument('--dones', default=f'./logs/depopulate_v{args.deleted_version}_dones.txt') - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/log.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - proglogger = logging.getLogger('root.prog') - prog_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/prog.log') - progformatter = logging.Formatter('%(levelname)s:prog:%(message)s') - proglogger.addHandler(prog_fh) - prog_fh.setFormatter(progformatter) - proglogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(DEBUG) - - errlogger = logging.getLogger('root.err') - - collections = get_collections_in_version(args) - - predelete(args, collections) - diff --git a/gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py b/gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py deleted file mode 100644 index 41a4e9a..0000000 --- a/gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py +++ /dev/null @@ -1,243 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -""" -General purpose multiprocessing routine to delete all instances added -in some version from a specified bucker -""" - -import argparse -import os -import logging - -rootlogger = logging.getLogger('root') -proglogger = logging.getLogger('root.prog') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -import time -from multiprocessing import Process, Queue -from google.cloud import storage, bigquery -from google.cloud.exceptions import NotFound - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured - -TRIES = 3 -""" -args paramaters -bqdataset_name: bq datas et from which to access tables -bq_collections_table': BQ table listing group of collections to be populate -retired: Copy retired instances in collection if True -src_bucket: Bucket from which to delete blobs -processes: Number of concurrent processes -batch: Size of batch of blobs to be copied -src_project: Project of destination bucket -dst_project: Project of source bucket -log_dir: Directory in which some log files are kept. -dones: File listing collections that have been copied -""" - -def delete_instances(args, rows, n, rowcount, done_instances, bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - bucket.delete_blob(blob_name) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.debug(f'{blob_name}') - break - except NotFound: - errlogger.error('p%s %s: %s: Failed, not found %s\n', args.id, - index, args.collection, - blob_name) - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: Failed %s\n: %s', args.id, - index, args.collection, - blob_name, exc) - break - retries += 1 - - if n % args.batch == 0: - proglogger.info('p%s %s: %s', args.id, index, args.collection) - # print('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - proglogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - # print(f'p{args.id}: Worker starting: args: {args}') - - client = storage.Client() - bucket = client.bucket(args.bucket, user_project=args.project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - delete_instances(args, rows, n, rowcount, done_instances, bucket) - # output.put(n) - - -def delete_all_instances(args): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - # We first delete the instances in the current IDC version, - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN 
`idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - AND i.rev_idc_version = {args.deleted_version} - ORDER by i.uuid - """ - args.id = 0 - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - prowcount = destination.num_rows - if prowcount: - print(f'Deleting collection {args.collection}; primary {prowcount} instances') - - num_processes = max(1, min(args.processes, int(prowcount / increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, prowcount)) - n += increment - print('Primary work distribution complete') - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = prowcount / delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - else: - print(f'Collection {args.collection} has no new instances in version {args.deleted_version}') - with open(f'{args.dones}', 'a') as f: - f.write(f'{args.collection}\n') - # - # with open(f'{args.log_dir}/{args.collection}_success.log', 'w') as f: - # f.write(f'{args.collection}\n') - - -def predelete(args, collections): - client = bigquery.Client() - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - st = os.stat('{}'.format(args.log_dir)) - os.chmod('{}'.format(args.log_dir), st.st_mode | 0o222) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - delete_all_instances(args) - - - - - diff --git a/gcs/obsolete/empty_and_delete_bucket.py b/gcs/obsolete/empty_and_delete_bucket.py deleted file mode 100644 index 7be83e3..0000000 --- a/gcs/obsolete/empty_and_delete_bucket.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Delete all blobs from some bucket, delete the bucket -# empty_bucket_mp is faster. - -from google.cloud import storage -from subprocess import run, PIPE -from google.api_core.exceptions import Conflict -import sys -import argparse -from utilities.gcs_helpers import list_buckets - -def empty_and_delete_bucket(args): - try: - result = run(['gsutil', '-m', '-u', f'{args.project}', 'rm', '-r', f'gs://{args.bucket}']) - print(" {} emptied, results: {}".format(args.bucket, result), flush=True) - if result.returncode: - print('Copy {} failed: {}'.format(result.stderr), flush=True) - return {"bucket": args.src_bucket_name, "status": -1} - return {"bucket": args.bucket, "status": 0} - except Exception as exc: - print("Error in deleting {}: {}".format(args.bucket, exc)) - # raise - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--project', default='idc-dev-etl') - parser.add_argument('--bucket', default='idc-dev-v5-dicomstore-staging') - args = parser.parse_args() - print("{}".format(args), file=sys.stdout) - - if args.bucket == 'idc-open': - print("Not allowed") - exit - - empty_and_delete_bucket(args) diff --git a/gcs/obsolete/empty_bucket.py b/gcs/obsolete/empty_bucket.py deleted file mode 100644 index 00bb878..0000000 --- a/gcs/obsolete/empty_bucket.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Delete all blobs from some bucket -# empty_bucket_mp.py is faster - -from subprocess import run, PIPE -import sys -import argparse - -def empty_bucket(args): - try: - result = run(['gsutil', '-m', '-u', f'{args.project}', 'rm', f'gs://{args.bucket}/*']) - print(" {} emptied, results: {}".format(args.bucket, result), flush=True) - if result.returncode: - print('Copy {} failed: {}'.format(result.stderr), flush=True) - return {"bucket": args.src_bucket_name, "status": -1} - return {"bucket": args.bucket, "status": 0} - except Exception as exc: - print("Error in deleting {}: {}".format(args.bucket, exc)) - # raise - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--project', default='canceridc-data') - parser.add_argument('--bucket', default='idc-nlst-open') - args = parser.parse_args() - print("{}".format(args), file=sys.stdout) - - if args.bucket == 'idc-open': - print("Not allowed") - exit - - empty_bucket(args) diff --git a/gcs/obsolete/empty_idc_dev_etl_v2_buckets.py b/gcs/obsolete/empty_idc_dev_etl_v2_buckets.py deleted file mode 100644 index ce0955b..0000000 --- a/gcs/obsolete/empty_idc_dev_etl_v2_buckets.py +++ /dev/null @@ -1,107 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# One time script to delete the idc-tcia-v2-xxx buckets in idc-dev-etl. -# Could be adapted tp remove idc-tcia- buckets in canceridc-data. 
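# The deleted empty_bucket.py / empty_and_delete_bucket.py above shell out to gsutil.
# Note that their error path calls 'Copy {} failed: {}'.format(result.stderr) with one
# argument for two placeholders, returns args.src_bucket_name (never defined), and the
# bare `exit` in the main guard does not actually terminate the script. A corrected
# sketch of the same gsutil pattern, with hypothetical project/bucket values:

import sys
from subprocess import run

def empty_bucket(project, bucket):
    # Requester-pays ('-u') bulk delete of every object in the bucket.
    result = run(['gsutil', '-m', '-u', project, 'rm', f'gs://{bucket}/*'])
    if result.returncode:
        print(f'Emptying {bucket} failed with return code {result.returncode}', flush=True)
        return {'bucket': bucket, 'status': -1}
    print(f'{bucket} emptied', flush=True)
    return {'bucket': bucket, 'status': 0}

if __name__ == '__main__':
    project, bucket = 'idc-dev-etl', 'some-staging-bucket'   # hypothetical values
    if bucket == 'idc-open':
        print('Not allowed')
        sys.exit(1)        # exit() must be called; a bare `exit` is a no-op
    empty_bucket(project, bucket)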
-import argparse -import os -from google.cloud import storage, bigquery -import logging -from logging import INFO -from gcs.empty_bucket_mp.empty_bucket_mp import pre_delete - - -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.tcia_api_collection_id - FROM `idc-dev-etl.idc_v2.collection` AS c - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--bucket', default='idc-dev-v5-dicomstore-staging') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/empty_idc_dev_etl_v2_buckets') - parser.add_argument('--dones', default=f'{os.environ["PWD"]}/logs/dones.log') - - args = parser.parse_args() - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/bucket.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - dones = open(args.dones).read().splitlines() - - client = storage.Client(project=args.project) - - collections = [collection.lower().replace(' ','-').replace('_','-') for collection in get_collections_in_version(args)] - - # for collection in collections: - # found=False - # if client.bucket(f'idc-tcia-1-{collection}', user_project=args.project).exists(): - # print(f'{collection:32}: idc-tcia-1-{collection}') - # found=True - # if client.bucket(f'idc-tcia-2-{collection}', user_project=args.project).exists(): - # print(f'{collection:32}: idc-tcia-2-{collection}') - # found=True - # if not found: - # print(f'{collection}: ***No bucket***') - # - - - for collection in collections: - args.bucket = f"idc-tcia-2-{collection}" - if not args.bucket in dones: - bucket = client.bucket(args.bucket, user_project=args.project) - tried = 0 - tries = 2 - while tried < tries: - try: - if bucket.exists(): - pre_delete(args) - bucket.delete() - rootlogger.info(f'Deleted bucket %s',args.bucket) - break - else: - break - except Exception as exc: - print(f'p0: Delete bucket failed: {exc}') - tried += 1 - if tried == tries: - errlogger.error(f'Failed to delete bucket %s', args.bucket) - with open(args.dones, 'a') as f: - f.write(f'{args.bucket}\n') - - diff --git a/gcs/move_collection/copy_collection.py b/gcs/obsolete/move_collection/copy_collection.py similarity index 100% rename from gcs/move_collection/copy_collection.py rename to gcs/obsolete/move_collection/copy_collection.py diff --git a/gcs/move_collection/delete_collection.py b/gcs/obsolete/move_collection/delete_collection.py similarity index 100% rename from gcs/move_collection/delete_collection.py rename to gcs/obsolete/move_collection/delete_collection.py diff --git a/gcs/move_collection/move_apollo/copy_apollo_collections.py b/gcs/obsolete/move_collection/move_apollo/copy_apollo_collections.py similarity index 100% rename from 
gcs/move_collection/move_apollo/copy_apollo_collections.py rename to gcs/obsolete/move_collection/move_apollo/copy_apollo_collections.py diff --git a/gcs/move_collection/move_apollo/delete_apollo_collections.py b/gcs/obsolete/move_collection/move_apollo/delete_apollo_collections.py similarity index 100% rename from gcs/move_collection/move_apollo/delete_apollo_collections.py rename to gcs/obsolete/move_collection/move_apollo/delete_apollo_collections.py diff --git a/gcs/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py b/gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py similarity index 100% rename from gcs/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py rename to gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py diff --git a/gcs/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py b/gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py similarity index 100% rename from gcs/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py rename to gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py diff --git a/gcs/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py b/gcs/obsolete/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py similarity index 100% rename from gcs/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py rename to gcs/obsolete/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py diff --git a/gcs/obsolete/populate_buckets_with_collections/README.md b/gcs/obsolete/populate_buckets_with_collections/README.md deleted file mode 100644 index 760ced5..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/README.md +++ /dev/null @@ -1,2 +0,0 @@ -These scripts copied collections from idc_dev/idc_open to various buckets in support of redaction. -Essentially single-use. \ No newline at end of file diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py deleted file mode 100644 index 85287ef..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py +++ /dev/null @@ -1,361 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -NOTE: This script and scripts which call it should be restructured such -that the calling script passes in the list of collections to be copied. -""" - -""" -NOTE: The get_collections_in_version function is hardcoded for v5. -Also, probably shoud remove the test for 'where v1=False and v2=True' -in the SQL. That was a special case that should have been parameterized -""" - -""" -General purpose multiprocessing routine to copy the instances of some set of -collections from one bucket to another. 
- -A parameter, args.retired, controls whether retired instances (as listed -in the retired table) are copied. -In general args.retired should be False when populating a bucket that will -imported into a DICOM store, where (only current instances are wanted. -args.retired should be Tree when populating a bucket that will be public -(even if having Limited access) as well as the dev counterparts of these -buckets. - -The inital use of this routine was in splitting idc_dev into multiple -buckets in support of Google hosting and defacing. - -It can also be used to copy data among those buckets as needed, for example, -when the set of collections to be defaced/redacted changes. Note that, for -this purpose the module does not delete the source blob. That should be done -separately. -""" - -import argparse -import os -import logging - -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -import time -from multiprocessing import Process, Queue -from google.cloud import storage, bigquery - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured - -TRIES = 3 -""" -args paramaters -bqdataset_name: bq datas et from which to access tables -bq_collections_table': BQ table listing group of collections to be populate -retired: Copy retired instances in collection if True -src_bucket: Bucket from which to copy blobs -dst_bucket, Bucket to which to copy blobs -processes: Number of concurrent processes -batch: Size of batch of blobs to be copied -src_project: Project of destination bucket -dst_project: Project of source bucket -log_dir: Directory in which some log files are kept. -dones: File listing collections that have been copied -""" - - -def get_collections_in_version(args): - client = bigquery.Client() - query = "" - if 'excluded_tables' in args.__dict__ and args.excluded_tables: - query = f""" - WITH ex as - ({query} - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{args.excluded_tables[0]}` - """ - for table in args.excluded_tables[1:]: - query = f""" - {query} - UNION ALL - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{table}` - """ - query = f""" - {query}) - """ - query = f""" - {query} - SELECT c.* - FROM `idc-dev-etl.idc_v5.collection` AS c - LEFT JOIN ex - ON c.collection_id =ex.tcia_api_collection_id - where ex.tcia_api_collection_id is NULL - - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - else: - query = f""" - SELECT c.* - FROM `idc-dev-etl.idc_v5.{args.bq_collections_table}` AS c - where v1=False and v2=True - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - -def copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - src_blob = src_bucket.blob(blob_name) - dst_blob = dst_bucket.blob(blob_name) - retries = 0 - while True: - try: - rewrite_token = False - while True: - rewrite_token, bytes_rewritten, bytes_to_rewrite = dst_blob.rewrite( - src_blob, token=rewrite_token - ) - if not rewrite_token: - break - successlogger.info(f'{blob_name}') - break - - # blob_copy = 
src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - # # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - # successlogger.info(f'{blob_name}') - # break - - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, args.collection, - blob_name, retries, exc) - break - time.sleep(retries) - retries += 1 - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - # print(f'p{args.id}: Worker starting: args: {args}') - - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket) - # output.put(n) - - -def copy_all_instances(args): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - # We first copy the instances in the current IDC version, - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN `idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. 
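# The rewrite-token loop in copy_instances() above is the google-cloud-storage idiom for
# server-side copies: Blob.rewrite() returns a continuation token until the copy is
# complete, so it handles objects of any size (the commented-out copy_blob call is the
# simpler alternative). A condensed sketch of just that loop, with hypothetical buckets:

from google.cloud import storage

def copy_blob(client, src_bucket_name, dst_bucket_name, blob_name):
    src_blob = client.bucket(src_bucket_name).blob(blob_name)
    dst_blob = client.bucket(dst_bucket_name).blob(blob_name)
    token = None
    while True:
        # Each call copies another chunk server-side; a falsy token means the copy is done.
        token, bytes_rewritten, total_bytes = dst_blob.rewrite(src_blob, token=token)
        if not token:
            return bytes_rewritten, total_bytes

# Example (hypothetical names):
# copy_blob(storage.Client(), 'idc-dev-open', 'idc-dev-defaced', 'some-uuid.dcm')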
- destination = client.get_table(destination) - - prowcount = destination.num_rows - print(f'Copying collection {args.collection}; primary {prowcount} instances') - - num_processes = max(1, min(args.processes, int(prowcount / increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, prowcount)) - n += increment - print('Primary work distribution complete') - - # Next we copy retired instances - # Query to get the instances from the retired table - if args.retired: - query = f""" - SELECT r.instance_uuid - FROM `idc-dev-etl.idc_v{args.version}.retired` as r - WHERE r.collection_id = '{args.collection}' - ORDER by r.instance_uuid - """ - - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. 
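# The comments above describe how these scripts page through query results: every query
# writes to a destination table (an anonymous temporary table for plain SELECTs), which
# can then be read in batches with list_rows(). A compact sketch of that pattern, using
# an illustrative query modeled on the deleted code:

from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query(
    "SELECT uuid FROM `idc-dev-etl.idc_v5.instance` LIMIT 1000")   # illustrative query
query_job.result()                                 # wait for the query to finish
table = client.get_table(query_job.destination)    # temporary anonymous result table
print(f'{table.num_rows} rows')

batch = 100
for start in range(0, table.num_rows, batch):
    rows = [r.uuid for r in client.list_rows(table, max_results=batch, start_index=start)]
    # each `rows` batch would be put on the task queue here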
- destination = client.get_table(destination) - - rrowcount = destination.num_rows - if rrowcount: - print(f'Copying retired {args.collection}; primary {rrowcount} instances') - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.instance_uuid for r in - client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, rrowcount)) - n += increment - print('Retired work distribution complete') - else: - print(f'No retired instances in collection {args.collection}') - else: - rrowcount = 0 - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (prowcount + rrowcount) / delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - -def precopy(args): - client = bigquery.Client() - collections = get_collections_in_version(args) - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - st = os.stat('{}'.format(args.log_dir)) - os.chmod('{}'.format(args.log_dir), st.st_mode | 0o222) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. - for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - copy_all_instances(args) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat( - '{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size == 0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py deleted file mode 100644 index cc3be8a..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in commercial restricted collections from the dev bucket to idc-dev-cr. -This is/was used, among other things, for the initial population of the idc-dev-cr -bucket. -""" -""" -Note: This script should be restructured such to pass in the list of collections to be copied. -""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'cr' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py deleted file mode 100644 index f50e253..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. 
-""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. -""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py deleted file mode 100644 index c495cb2..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default=f'idc-dev-{group}') - parser.add_argument('--dst_bucket', default=f'idc-open-idc1') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='canceridc-data') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py deleted file mode 100644 index 1447c5e..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in excluded collections from the dev bucket to idc-dev-excluded. -This is/was used, among other things, for the initial population of the idc-dev-excluded -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'excluded' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=64, help="Number of concurrent processes") - parser.add_argument('--batch', default=64, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py deleted file mode 100644 index 2f81032..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in collections from the dev bucket to idc-dev-open. -This was used, among other things, for the initial population of the idc-dev-open -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'open' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--excluded_tables', default=[ - 'excluded_collections', - 'cr_collections', - 'redacted_collections', - 'defaced_collections' - ], help="Tables of lists of collections in other buckets to be excluded" - ) - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py deleted file mode 100644 index bad5e32..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'redacted' - version = 5 - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=f'{version}', help='Next version to generate') - parser.add_argument('--db', default=f'idc_v{version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py deleted file mode 100644 index 52c2def..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'redacted' - version = 5 - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=f'{version}', help='Next version to generate') - parser.add_argument('--db', default=f'idc_v{version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default=f'idc-dev-{group}') - parser.add_argument('--dst_bucket', default=f'idc-open-idc') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='canceridc-data') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/validate_opens_bucket.py b/gcs/obsolete/validate_opens_bucket.py deleted file mode 100644 index ca61483..0000000 --- a/gcs/obsolete/validate_opens_bucket.py +++ /dev/null @@ -1,66 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -""" -Multiprocess script to validate that the instances in a bucket are only -those in some set of collections -""" - -import argparse -import os -import logging -from logging import INFO -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -from gcs.validate_bucket.validate_bucket_mp import pre_validate - - -if __name__ == '__main__': - version = 5 - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default = version) - parser.add_argument('--project', default = 'idc-dev-etl') - parser.add_argument('--bqdataset', default=f'idc_v{version}') - parser.add_argument('--bucket', default='idc-open-pdp-staging') - parser.add_argument('--collection_table', default='open_collections', help='BQ table containing list of collections') - parser.add_argument('--blob_names', default='./logs/blobs.txt', help='List of blobs names in above collections') - parser.add_argument('--processes', default=16, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/validate_open_buckets') - - args = parser.parse_args() - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/bucket.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - - - pre_validate(args) diff --git a/gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py b/gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py new file mode 100644 index 0000000..30eb25c --- /dev/null +++ b/gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py @@ -0,0 +1,49 @@ +# +# Copyright 2015-2021, Institute for Systems Biology +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import argparse +from gcs.copy_bucket_mp import copy_all_instances +from utilities.logging_config import successlogger, progresslogger, errlogger + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--processes', default=1, help="Number of concurrent processes") + parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') + parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/copy_bucket_mp') + + args = parser.parse_args() + + try: + # Create a set of previously copied blobs + dones = set(open(successlogger.handlers[0].baseFilename).read().splitlines()) + except: + dones = set([]) + + + args.src_bucket = 'idc-open-idc1-staging' + args.dst_bucket = 'idc-open-idc1' + copy_all_instances(args, dones) + + args.src_bucket = 'idc-open-cr-staging' + args.dst_bucket = 'idc-open-cr' + copy_all_instances(args, dones) + + args.src_bucket = 'public-datasets-idc-staging' + args.dst_bucket = 'public-datasets-idc' + copy_all_instances(args, dones) + + diff --git a/gcs/validate_bucket/validate_bucket_mp.py b/gcs/release_gcs_data/validate_bucket/validate_bucket_mp.py similarity index 100% rename from gcs/validate_bucket/validate_bucket_mp.py rename to gcs/release_gcs_data/validate_bucket/validate_bucket_mp.py diff --git a/gcs/validate_bucket/validate_idc_dev_cr.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_cr.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_cr.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_cr.py diff --git a/gcs/validate_bucket/validate_idc_dev_defaced.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_defaced.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_defaced.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_defaced.py diff --git a/gcs/validate_bucket/validate_idc_dev_excluded.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_excluded.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_excluded.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_excluded.py diff --git a/gcs/validate_bucket/validate_idc_dev_open.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_open.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_open.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_open.py diff --git a/gcs/validate_bucket/validate_idc_dev_redacted.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_redacted.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_redacted.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_redacted.py diff --git a/gcs/validate_bucket/validate_idc_open_cr.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_cr.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_cr.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_cr.py diff --git a/gcs/validate_bucket/validate_idc_open_cr_staging.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_cr_staging.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_cr_staging.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_cr_staging.py diff --git a/gcs/validate_bucket/validate_idc_open_idc.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_idc.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_idc.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_idc.py diff 
--git a/gcs/validate_bucket/validate_idc_open_idc1.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_idc1.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1.py diff --git a/gcs/validate_bucket/validate_idc_open_idc1_staging.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1_staging.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_idc1_staging.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1_staging.py diff --git a/gcs/validate_bucket/validate_public_datasets_idc.py b/gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc.py similarity index 100% rename from gcs/validate_bucket/validate_public_datasets_idc.py rename to gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc.py diff --git a/gcs/validate_bucket/validate_public_datasets_idc_staging.py b/gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc_staging.py similarity index 100% rename from gcs/validate_bucket/validate_public_datasets_idc_staging.py rename to gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc_staging.py diff --git a/validation/compare_hashes.py b/validation/compare_hashes.py index bc8ac8e..a9b0b90 100644 --- a/validation/compare_hashes.py +++ b/validation/compare_hashes.py @@ -363,7 +363,7 @@ def compare_hashes(args): parser.add_argument('--ignore_differing_patient_counts', default=True) parser.add_argument('--log_level', default=("collection, patient, study, series, instance"), help='Levels at which to log') - parser.add_argument('--collections', default=['RIDER Lung CT'], \ + parser.add_argument('--collections', default=['RIDER Pilot'], \ help='List of collections to compare. If empty, compare all collections') parser.add_argument('--skips', default=[])
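# A note on the resume pattern used by the new copy_staging_buckets_to_public_buckets.py
# above: the set of previously copied blobs is rebuilt from the success log's own file
# (successlogger.handlers[0].baseFilename), so a rerun skips work that already completed.
# A minimal sketch of that idea, with a hypothetical log path and blob list:

import logging

successlogger = logging.getLogger('success')
successlogger.setLevel(logging.INFO)
successlogger.addHandler(logging.FileHandler('/tmp/copy_success.log'))   # hypothetical path

try:
    # The success log doubles as the "dones" list on a rerun.
    dones = set(open(successlogger.handlers[0].baseFilename).read().splitlines())
except FileNotFoundError:
    dones = set()

for blob_name in ['a.dcm', 'b.dcm']:          # stand-in for the real blob list
    if blob_name in dones:
        continue
    # ... copy the blob here ...
    successlogger.info(blob_name)             # record success so a rerun skips it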