From 702b120e7024fb179f6112540da55f684124d282 Mon Sep 17 00:00:00 2001
From: BillClifford
Date: Mon, 26 Jun 2023 11:53:46 -0700
Subject: [PATCH] ~More completion of v15 ETL, cleanup

---
 .../publish_idc_pdp_staging_datasets.py | 4 +-
 .../copy_apollo_blobs.py | 48 ---
 .../copy_pathology_blobs.py | 49 ---
 .../pathology_collections.py | 17 -
 gcs/{copy_bucket_mp => }/copy_bucket_mp.py | 2 -
 ..._to_staging.py => copy_premerge_to_dev.py} | 0
 gcs/misc/README.md | 1 -
 .../delete_idc_dev_defaced_pathology.py | 43 ---
 .../delete_list_of_blobs.py | 95 -----
 gcs/misc/move_blobs_between_buckets.py | 178 ---------
 gcs/obsolete/copy_blobs.py | 265 -------------
 .../collection_list.txt | 1 -
 .../copy_collections_bq.py | 268 -------------
 .../copy_collections_nlst.py | 75 ----
 .../copy_collections_psql.py | 263 -------------
 .../copy_collections_v5.py | 66 ----
 .../copy_prestaging_to_staging.py | 124 ------
 .../copy_staging_buckets.py | 94 -----
 .../copy_staging_buckets_v5.py | 38 --
 .../README.md | 2 -
 .../depopulate_collections_from_bucket.py | 346 -----------------
 ...ced_collections_from_open_bucket.v5.dev.py | 70 ----
 ...ced_collections_from_open_bucket.v5.pdp.py | 70 ----
 ...s_from_dicomstore_staging_bucket.v5.dev.py | 81 ----
 ...ted_collections_from_open_bucket.v5.dev.py | 69 ----
 ...ted_collections_from_open_bucket.v5.pdp.py | 69 ----
 .../depopulate_version_from_bucket.v5.dev.py | 89 -----
 .../depopulate_versions_from_bucket.py | 243 ------------
 gcs/obsolete/empty_and_delete_bucket.py | 51 ---
 gcs/obsolete/empty_bucket.py | 48 ---
 gcs/obsolete/empty_idc_dev_etl_v2_buckets.py | 107 ------
 .../move_collection/copy_collection.py | 0
 .../move_collection/delete_collection.py | 0
 .../move_apollo/copy_apollo_collections.py | 0
 .../move_apollo/delete_apollo_collections.py | 0
 .../copy_cptac_cm_and_lscc_collection.py | 0
 .../delete_collection.py | 0
 .../copy_prostate_diagnosis_collection.py | 0
 .../README.md | 2 -
 .../populate_bucket_with_collections.py | 361 ------------------
 .../populate_bucket_with_cr_collections.v5.py | 69 ----
 ..._bucket_with_defaced_collections.v5.dev.py | 70 ----
 ...bucket_with_defaced_collections.v5.prod.py | 70 ----
 ...ate_bucket_with_excluded_collections.v5.py | 70 ----
 ...opulate_bucket_with_open_collections.v5.py | 77 ----
 ...bucket_with_redacted_collections.v5.dev.py | 70 ----
 ...ucket_with_redacted_collections.v5.prod.py | 70 ----
 gcs/obsolete/validate_opens_bucket.py | 66 ----
 .../copy_staging_buckets_to_public_buckets.py | 49 +++
 .../validate_bucket/validate_bucket_mp.py | 0
 .../validate_bucket/validate_idc_dev_cr.py | 0
 .../validate_idc_dev_defaced.py | 0
 .../validate_idc_dev_excluded.py | 0
 .../validate_bucket/validate_idc_dev_open.py | 0
 .../validate_idc_dev_redacted.py | 0
 .../validate_bucket/validate_idc_open_cr.py | 0
 .../validate_idc_open_cr_staging.py | 0
 .../validate_bucket/validate_idc_open_idc.py | 0
 .../validate_bucket/validate_idc_open_idc1.py | 0
 .../validate_idc_open_idc1_staging.py | 0
 .../validate_public_datasets_idc.py | 0
 .../validate_public_datasets_idc_staging.py | 0
 validation/compare_hashes.py | 2 +-
 63 files changed, 52 insertions(+), 3800 deletions(-)
 delete mode 100644 gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py
 delete mode 100644 gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py
 delete mode 100644 gcs/copy_blobs_using_BQ_query/pathology_collections.py
 rename gcs/{copy_bucket_mp => }/copy_bucket_mp.py (99%)
 rename gcs/copy_premerge_to_dev_buckets/{copy_premerge_to_staging.py => copy_premerge_to_dev.py} (100%)
 delete mode 100644 gcs/misc/README.md
 delete mode 100644 gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py
 delete mode 100644 gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py
 delete mode 100644 gcs/misc/move_blobs_between_buckets.py
 delete mode 100644 gcs/obsolete/copy_blobs.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/collection_list.txt
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py
 delete mode 100644 gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py
 delete mode 100644 gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py
 delete mode 100644 gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py
 delete mode 100644 gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/README.md
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py
 delete mode 100644 gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py
 delete mode 100644 gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py
 delete mode 100644 gcs/obsolete/empty_and_delete_bucket.py
 delete mode 100644 gcs/obsolete/empty_bucket.py
 delete mode 100644 gcs/obsolete/empty_idc_dev_etl_v2_buckets.py
 rename gcs/{ => obsolete}/move_collection/copy_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/delete_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_apollo/copy_apollo_collections.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_apollo/delete_apollo_collections.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py (100%)
 rename gcs/{ => obsolete}/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py (100%)
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/README.md
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py
 delete mode 100644 gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py
 delete mode 100644 gcs/obsolete/validate_opens_bucket.py
 create mode 100644 gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_bucket_mp.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_cr.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_defaced.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_excluded.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_open.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_dev_redacted.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_cr.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_cr_staging.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_idc.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_idc1.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_idc_open_idc1_staging.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_public_datasets_idc.py (100%)
 rename gcs/{ => release_gcs_data}/validate_bucket/validate_public_datasets_idc_staging.py (100%)

diff --git a/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py b/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py
index 63520ed..4b85190 100644
--- a/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py
+++ b/bq/utils/publish_dataset/publish_idc_pdp_staging_datasets.py
@@ -33,7 +33,7 @@
 progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}')

 for src_dataset in (
-    'idc_v14',
-    'idc_v14_clinical',
+    f'idc_v{settings.CURRENT_VERSION}',
+    f'idc_v{settings.CURRENT_VERSION}_clinical',
 ):
     publish_dataset(args)
diff --git a/gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py b/gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py
deleted file mode 100644
index 6da1c0a..0000000
--- a/gcs/copy_blobs_using_BQ_query/copy_apollo_blobs.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#
-# Copyright 2015-2021, Institute for Systems Biology
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# One time script to copy all APOLLOxx blobs from idc-dev-redacted to idc-dev-open
-import json
-import os
-import argparse
-
-from copy_blobs_mp import copy_all_blobs
-from utilities.logging_config import successlogger, progresslogger, errlogger
-
-# Copy the blobs that are new to a version from dev pre-staging buckets
-# to dev staging buckets.
-import settings - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=settings.CURRENT_VERSION, help='Version to work on') - parser.add_argument('--src_bucket', default="idc-dev-redacted") - parser.add_argument('--dst_bucket', default="idc-dev-open") - parser.add_argument('--batch', default=1000) - parser.add_argument('--processes', default=1) - args = parser.parse_args() - args.id = 0 # Default process ID - - progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}') - - query=f""" - SELECT DISTINCT concat(i_uuid,'.dcm') blob - FROM `idc-dev-etl.idc_v{args.version}_dev.all_joined` aj - WHERE collection_id LIKE 'APOLLO%' - """ - - copy_all_blobs(args, query) \ No newline at end of file diff --git a/gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py b/gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py deleted file mode 100644 index 308d19c..0000000 --- a/gcs/copy_blobs_using_BQ_query/copy_pathology_blobs.py +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import json -import os -import argparse - -from copy_blobs_mp import copy_all_blobs -from pathology_collections import collection_list -from utilities.logging_config import successlogger, progresslogger, errlogger - -# Copy the blobs that are new to a version from dev pre-staging buckets -# to dev staging buckets. -import settings - - - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=12, help='Version to work on') - parser.add_argument('--src_bucket', default="idc-dev-open") - parser.add_argument('--dst_bucket', default="pathology_blobs_whc") - parser.add_argument('--batch', default=1000) - parser.add_argument('--processes', default=1) - args = parser.parse_args() - args.id = 0 # Default process ID - - progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}') - - query=f""" - select concat(i_uuid,'.dcm') blob - from `idc-dev-etl.idc_v{args.version}_dev.all_joined` aj - where idc_version={args.version} and i_source='path' - and collection_id in {tuple(collection_list)} """ - - copy_all_blobs(args, query) \ No newline at end of file diff --git a/gcs/copy_blobs_using_BQ_query/pathology_collections.py b/gcs/copy_blobs_using_BQ_query/pathology_collections.py deleted file mode 100644 index 780c49a..0000000 --- a/gcs/copy_blobs_using_BQ_query/pathology_collections.py +++ /dev/null @@ -1,17 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -collection_list = ['CPTAC-LSCC', 'CPTAC-LSCC'] \ No newline at end of file diff --git a/gcs/copy_bucket_mp/copy_bucket_mp.py b/gcs/copy_bucket_mp.py similarity index 99% rename from gcs/copy_bucket_mp/copy_bucket_mp.py rename to gcs/copy_bucket_mp.py index fd18a46..55cb86d 100644 --- a/gcs/copy_bucket_mp/copy_bucket_mp.py +++ b/gcs/copy_bucket_mp.py @@ -90,8 +90,6 @@ def worker(input, args, dones): def copy_all_instances(args, dones): client = storage.Client() src_bucket = storage.Bucket(client, args.src_bucket) - - n=len(dones) progresslogger.info(f"{len(dones)} blobs previously copied") diff --git a/gcs/copy_premerge_to_dev_buckets/copy_premerge_to_staging.py b/gcs/copy_premerge_to_dev_buckets/copy_premerge_to_dev.py similarity index 100% rename from gcs/copy_premerge_to_dev_buckets/copy_premerge_to_staging.py rename to gcs/copy_premerge_to_dev_buckets/copy_premerge_to_dev.py diff --git a/gcs/misc/README.md b/gcs/misc/README.md deleted file mode 100644 index 055dae6..0000000 --- a/gcs/misc/README.md +++ /dev/null @@ -1 +0,0 @@ -Mostly replaced by newer scripts/ \ No newline at end of file diff --git a/gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py b/gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py deleted file mode 100644 index 4f73a0b..0000000 --- a/gcs/misc/delete_list_of_blobs/delete_idc_dev_defaced_pathology.py +++ /dev/null @@ -1,43 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# One time script to delete CPTAC pathology from the idc-open-idc1 bucket -# It was previously moved from the idc-dev-defaced bucket to idc-dev-open -# and idc-open-pdpn-staging. 
-import argparse -from gcs.misc.delete_list_of_blobs.delete_list_of_blobs import del_all_instances -from google.cloud import bigquery -from utilities.logging_config import successlogger, progresslogger, errlogger -from python_settings import settings - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--bucket', default='idc-dev-defaced') - parser.add_argument('--processes', default=16, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--project', default='canceridc-data') - args = parser.parse_args() - - client = bigquery.Client() - query = f""" - SELECT distinct i_uuid FROM `idc-dev-etl.idc_v12_dev.all_joined_included` - where collection_id in ('CPTAC-CM', 'CPTAC-LSCC') - and i_source='idc' - and i_rev_idc_version<10 - order by i_uuid - """ - instances = [f'{row.i_uuid}.dcm' for row in client.query(query)] - del_all_instances (args, instances) diff --git a/gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py b/gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py deleted file mode 100644 index 0aab39f..0000000 --- a/gcs/misc/delete_list_of_blobs/delete_list_of_blobs.py +++ /dev/null @@ -1,95 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from utilities.logging_config import successlogger, progresslogger, errlogger - -import time -from multiprocessing import Process, Queue -from google.cloud import storage -from google.api_core.exceptions import ServiceUnavailable, NotFound - -def delete_instances(args, client, bucket, blobs, n): - try: - # with client.batch(): - # for blob in blobs: - # bucket.blob(blob).delete() - # # bucket.blob(blob[0], generation=blob[1]).delete() - for blob in blobs: - bucket.blob(blob).delete() - # bucket.blob(blob[0], generation=blob[1]).delete() - successlogger.info(f'{blob}') - except ServiceUnavailable: - errlogger.error('p%s Delete %s blob %s failed', args.id, args.bucket, blob) - except NotFound: - errlogger.error('p%s Delete %s blobs % failed, not found', args.id, args.bucket, blob) - except Exception as exc: - errlogger.error('p%s Exception on %s blob %s: %s', args.id, args.bucket, blob, exc) - - -def worker(input, args): - client = storage.Client() - bucket = storage.Bucket(client, args.bucket) - for blobs, n in iter(input.get, 'STOP'): - delete_instances(args, client, bucket, blobs, n) - - -def del_all_instances(args, instance_list): - bucket = args.bucket - client = storage.Client() - bucket = storage.Bucket(client, args.bucket) - - dones = set(open(successlogger.handlers[0].baseFilename).read().splitlines()) - - num_processes = args.processes - processes = [] - - task_queue = Queue() - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args))) - processes[-1].start() - - - # Distribute the work across the task_queues - n = 0 - n=0 - # Submit args.batch size chunks to process - while instance_list: - some_instances= list(set(instance_list[0:args.batch]) - dones) - instance_list = instance_list[args.batch:] - if some_instances: - task_queue.put((some_instances,n)) - n += args.batch - progresslogger.info('Primary work distribution complete; {} blobs'.format(n)) - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - - # Wait for process to terminate - for process in processes: - # print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (n)/delta - progresslogger.info(f'Completed bucket {args.bucket}, {rate} instances/sec, {num_processes} processes') diff --git a/gcs/misc/move_blobs_between_buckets.py b/gcs/misc/move_blobs_between_buckets.py deleted file mode 100644 index f41bd2a..0000000 --- a/gcs/misc/move_blobs_between_buckets.py +++ /dev/null @@ -1,178 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import json -import os -import argparse -import logging -from logging import INFO -from google.cloud import bigquery, storage -import time -from multiprocessing import Process, Queue -from utilities.logging_config import successlogger, progresslogger, errlogger - -# Copy the blobs that are new to a version from dev pre-staging buckets -# to dev staging buckets. -import settings - - -# Get a the dev_url and pub_url of all new instances. The dev_url is the url of the -# premerge bucket or staging bucket holding the new instance. The pub_url is the -# url of the bucket to which to copy it -def get_urls(args): - client = bigquery.Client() - query = f""" - SELECT - instance_uuid uuid - FROM - `idc-dev-etl.idc_v{args.version}_pub.auxiliary_metadata` - WHERE - instance_revised_idc_version = {args.version} - AND tcia_api_collection_id = '{args.collection}' - """ - # urls = list(client.query(query)) - query_job = client.query(query) # Make an API request. - query_job.result() # Wait for the query to complete. - destination = query_job.destination - destination = client.get_table(destination) - return destination - -def move_some_blobs(args, client, urls, n, dones): - done = 0 - copied = 0 - for blob_name in urls: - if not blob_name in dones: - src_bucket = client.bucket(args.src_bucket) - src_blob = src_bucket.blob(blob_name) - trg_bucket = client.bucket(args.trg_bucket) - trg_blob = trg_bucket.blob(blob_name) - for attempt in range(3): - try: - rewrite_token = False - while True: - rewrite_token, bytes_rewritten, bytes_to_rewrite = trg_blob.rewrite( - src_blob, token=rewrite_token - ) - if not rewrite_token: - break - src_blob.delete() - - successlogger.info('%s', blob_name) - progresslogger.info(f'p{args.id}: {done+n}of{len(urls)+n}: {args.src_bucket}/{blob_name} --> {args.trg_bucket}/{blob_name}') - break - except Exception as exc: - errlogger.error('p%s: Blob: %s, attempt: %s; %s', args.id, blob_name, attempt, exc) - - done += 1 - if copied == 0: - progresslogger.info(f'p{args.id}: Skipped {n}:{n+done-1}') - - -def worker(input, args, dones): - # proglogger.info('p%s: Worker starting: args: %s', args.id, args ) - # print(f'p{args.id}: Worker starting: args: {args}') - - # RETRIES = 3 - # try: - # dones = set(open(f'{successlogger.handlers[0].baseFilename}').read().splitlines()) - # except: - # dones = [] - - client = storage.Client() - for urls, n in iter(input.get, 'STOP'): - move_some_blobs(args, client, urls, n, dones) - - -def copy_all_blobs(args): - bq_client = bigquery.Client() - destination = get_urls(args) - - num_processes = args.processes - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - strt = time.time() - dones = set(open(f'{successlogger.handlers[0].baseFilename}').read().splitlines()) - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, dones))) - processes[-1].start() - - # Distribute the work across the task_queues - n = 0 - for page in bq_client.list_rows(destination, page_size=args.batch).pages: - uuids = [f'{row.uuid}.dcm' for row in page] - task_queue.put((uuids, n)) - # print(f'Queued {n}:{n+args.batch-1}') - n += page.num_items - print('Primary work distribution complete; {} blobs'.format(n)) - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, 
{process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (n)/delta - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=settings.CURRENT_VERSION, help='Version to work on') - # parser.add_argument('--log_dir', default=f'{settings.LOGGING_BASE}/{settings.BASE_NAME}') - parser.add_argument('--batch', default=100) - parser.add_argument('--processes', default=16) - parser.add_argument('--collection', default = 'CPTAC-LSCC') - parser.add_argument('--src_bucket', default = 'idc-open-idc1') - parser.add_argument('--trg_bucket', default = 'idc-open-pdp-staging') - args = parser.parse_args() - args.id = 0 # Default process ID - - progresslogger.info(f'args: {json.dumps(args.__dict__, indent=2)}') - - # if not os.path.exists(settings.LOGGING_BASE): - # os.mkdir(settings.LOGGING_BASE) - # if not os.path.exists(args.log_dir): - # os.mkdir(args.log_dir) - # - # successlogger = logging.getLogger('root.success') - # successlogger.setLevel(INFO) - # for hdlr in successlogger.handlers[:]: - # successlogger.removeHandler(hdlr) - # success_fh = logging.FileHandler('{}/success.log'.format(args.log_dir)) - # successlogger.addHandler(success_fh) - # successformatter = logging.Formatter('%(message)s') - # success_fh.setFormatter(successformatter) - # - # errlogger = logging.getLogger('root.err') - # for hdlr in errlogger.handlers[:]: - # errlogger.removeHandler(hdlr) - # err_fh = logging.FileHandler('{}/error.log'.format(args.log_dir)) - # errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - # errlogger.addHandler(err_fh) - # err_fh.setFormatter(errformatter) - - - copy_all_blobs(args) \ No newline at end of file diff --git a/gcs/obsolete/copy_blobs.py b/gcs/obsolete/copy_blobs.py deleted file mode 100644 index 5157993..0000000 --- a/gcs/obsolete/copy_blobs.py +++ /dev/null @@ -1,265 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy all blobs named in the DB to some bucket. -# This is specifically to copy blobs from the dev bucket -# to the open bucket. -# Since we multiprocess by collection, this depends on the -# a table that is the join of the version, collection,..., instance tables. 
- -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage - - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured -import psycopg2 -from psycopg2.extras import DictCursor - -TRIES=3 - -def val_collection(cur, args, dones, collection_index, tcia_api_collection_id): - if not tcia_api_collection_id in dones: - - # src_client = storage.Client(project=args.src_project) - # dst_client = storage.Client(project=args.dst_project) - # src_bucket = src_client.bucket(args.src_bucket) - # dst_bucket = dst_client.bucket(args.dst_bucket, user_project=args.dst_project) - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - n = 1 - - try: - done_instances = set(open(f'./logs/cb_{tcia_api_collection_id}_success.log').read().splitlines()) - except: - done_instances = [] - - increment = 5000 - query= f""" - SELECT * - FROM {args.all_table} - WHERE tcia_api_collection_id = '{tcia_api_collection_id}' - order by sop_instance_uid - """ - cur.execute(query) - rowcount=cur.rowcount - successes = open(f'./logs/cb_{tcia_api_collection_id}_success.log', 'a') - failures = open(f'./logs/cb_{tcia_api_collection_id}_failures.log', 'a') - failure_count=0 - while True: - rows = cur.fetchmany(increment) - if len(rows) == 0: - break - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row["instance_uuid"]}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - blob_copy = src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, tcia_api_collection_id, blob_name) - successes.write(f'{blob_name}\n') - break - except Exception as exc: - errlogger.error('%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, tcia_api_collection_id, - blob_name, retries, exc) - if retries == TRIES: - failures.write(f'{blob_name}; {exc}\n') - failure_count += 1 - break - time.sleep(retries) - retries += 1 - else: - if n % 10000 == 0: - rootlogger.info('%s %s: %s: skipping blob %s ', args.id, index, tcia_api_collection_id, blob_name) - n += 1 - - if failure_count == 0: - # with open(args.dones, 'a') as f: - # f.write(f"{tcia_api_collection_id}\n") - donelogger.info('%s', tcia_api_collection_id) - rootlogger.info('%s: Completed collection %s ', args.id, tcia_api_collection_id) - else: - errlogger.error('%s: Failed collection %s; %s failures ', args.id, tcia_api_collection_id, failure_count) - - else: - rootlogger.info("p%s: Collection %s, %s, previously built", args.id, tcia_api_collection_id, collection_index) - - - -def worker(input, output, args, dones): - rootlogger.debug('p%s: Worker starting: args: %s', args.id, args) - conn = psycopg2.connect(dbname=settings.DATABASE_NAME, user=settings.DATABASE_USERNAME, - password=settings.DATABASE_PASSWORD, host=settings.DATABASE_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - - for more_args in iter(input.get, 'STOP'): - validated = 0 - for attempt in range(TRIES): - try: - collection_index, tcia_api_collection_id = more_args - # copy_collection(args, dones, collection_index, tcia_api_collection_id) - val_collection(cur, args, dones, collection_index, 
tcia_api_collection_id) - break - except Exception as exc: - errlogger.error("p%s, exception %s; reattempt %s on collection %s", args.id, exc, attempt, tcia_api_collection_id) - - - if attempt == TRIES: - errlogger.error("p%s, Failed to process collection: %s", args.id, tcia_api_collection_id) - - output.put((tcia_api_collection_id)) - -def copy_collections(cur, args, version): - # Session = sessionmaker(bind= sql_engine) - # version = version_is_done(sess, args.version) - try: - skips = open(args.skips).read().splitlines() - except: - skips = [] - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - begin = time.time() - cur.execute(""" - SELECT * FROM collection - WHERE version_id = (%s)""", (version['id'],)) - collections = cur.fetchall() - - rootlogger.info("Version %s; %s collections", version['idc_version_number'], len(collections)) - if args.processes == 0: - args.id=0 - for collection in collections: - if not collection['tcia_api_collection_id'] in skips: - collection_index = f'{collections.index(collection)+1} of {len(collections)}' - val_collection(cur, args, dones, collection_index, collection['tcia_api_collection_id']) - - else: - processes = [] - # Create queues - task_queue = Queue() - done_queue = Queue() - - # List of patients enqueued - enqueued_collections = [] - - # Start worker processes - for process in range(args.processes): - args.id = process + 1 - processes.append( - Process(target=worker, args=(task_queue, done_queue, args, dones))) - processes[-1].start() - - # Enqueue each patient in the the task queue - for collection in collections: - if not collection['tcia_api_collection_id'] in skips: - collection_index = f'{collections.index(collection) + 1} of {len(collections)}' - task_queue.put((collection_index, collection['tcia_api_collection_id'])) - enqueued_collections.append(collection['tcia_api_collection_id']) - - # Collect the results for each patient - try: - while not enqueued_collections == []: - # Timeout if waiting too long - tcia_api_collection_id = done_queue.get(True) - enqueued_collections.remove(tcia_api_collection_id) - - # Tell child processes to stop - for process in processes: - task_queue.put('STOP') - - # Wait for them to stop - for process in processes: - process.join() - - duration = str(timedelta(seconds=(time.time() - begin))) - rootlogger.info("Collection %s, %s, completed in %s", collection['tcia_api_collection_id'], collection_index, - duration) - - - except Empty as e: - errlogger.error("Exception copy_collections__obsolete ") - for process in processes: - process.terminate() - process.join() - duration = str(timedelta(seconds=(time.time() - begin))) - rootlogger.info("Collection copying NOT completed") - - -def precopy(args): - conn = psycopg2.connect(dbname=settings.DATABASE_NAME, user=settings.DATABASE_USERNAME, - password=settings.DATABASE_PASSWORD, host=settings.DATABASE_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - cur.execute(""" - SELECT * - FROM version - WHERE idc_version_number = (%s)""", (args.version,)) - - version = cur.fetchone() - copy_collections(cur, args, version) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=2, help='Next version to generate') - parser.add_argument('--src_bucket', default='idc_dev', help='Bucket to validate') - parser.add_argument('--dst_bucket', default='idc-open', help='Bucket to validate') - parser.add_argument('--all_table', default='all_v2') - parser.add_argument('--processes', 
default=16, help="Number of concurrent processes") - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='canceridc-data') - parser.add_argument('--skips', default='./logs/copy_blobs_skips.log' ) - parser.add_argument('--dones', default='./logs/copy_blobs__dones.log' ) - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_blobs_log.log'.format(os.environ['PWD'])) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - donelogger = logging.getLogger('done') - done_fh = logging.FileHandler(args.dones) - doneformatter = logging.Formatter('%(message)s') - donelogger.addHandler(done_fh) - done_fh.setFormatter(doneformatter) - donelogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - err_fh = logging.FileHandler('{}/logs/copy_blobs_err.log'.format(os.environ['PWD'])) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - precopy(args) diff --git a/gcs/obsolete/copy_collections__obsolete/collection_list.txt b/gcs/obsolete/copy_collections__obsolete/collection_list.txt deleted file mode 100644 index 8e6c556..0000000 --- a/gcs/obsolete/copy_collections__obsolete/collection_list.txt +++ /dev/null @@ -1 +0,0 @@ -NLST \ No newline at end of file diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py deleted file mode 100644 index e5e34df..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_bq.py +++ /dev/null @@ -1,268 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all included collections (not in the excluded_collections table) to another bucket. -This is generally used to populate a bucket that can then be imported -into a DICOM store. 
-""" - -import argparse -import os -import queue -import time -from subprocess import run, PIPE -import logging -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage, bigquery - - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured -import psycopg2 -from psycopg2.extras import DictCursor - -TRIES=3 - -# Get all collections in some version that are not excluded -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.* - FROM `{args.src_project}.{args.bqdataset_name}.{args.bq_collection_table}` as c - LEFT JOIN `{args.src_project}.{args.bqdataset_name}.{args.bq_excluded_collections}` as ex - ON LOWER(c.collection_id) = LOWER(ex.tcia_api_collection_id) - WHERE ex.tcia_api_collection_id is NULL - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - return collection_ids - - -def copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - blob_copy = src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.info(f'{blob_name}') - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, args.collection, - blob_name, retries, exc) - break - time.sleep(retries) - retries += 1 - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - print(f'p{args.id}: Worker starting: args: {args}') - - conn = psycopg2.connect(dbname=args.db, user=settings.CLOUD_USERNAME, port=settings.CLOUD_PORT, - password=settings.CLOUD_PASSWORD, host=settings.CLOUD_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket) - # output.put(n) - - -def copy_all_instances(args, query): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/cc_{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. 
- destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - rowcount = destination.num_rows - print(f'Copying collection {args.collection}; {rowcount} instances') - - num_processes = max(1,min(args.processes, int(rowcount/increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - # List of patients enqueued - enqueued_batches = [] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n-1)] - if len(rows) == 0: - break - task_queue.put((rows, n, rowcount)) - # task_queues[q%num_processes].put((rows, n, rowcount)) - enqueued_batches.append(n) - # print(f'Enqueue {n} on queue {q%num_processes}') - n += increment - print('Work distribution complete') - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - # print(f'Stop queue {i}') - - # # Wait until all work is complete - # q = 0 - # while not enqueued_batches == []: - # # Timeout if waiting too long - # try: - # results = done_queues[q%num_processes].get(timeout=1) - # enqueued_batches.remove(results) - # except queue.Empty: - # pass - # q += 1 - # - # Close all the queues - # for q in task_queues: - # q.close() - # for q in done_queues: - # q.close() - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - # if process.is_alive(): - # rootlogger.info('Collection: %s, terminating process %s',args.collection, process.name) - # process.kill() - # print(f'Joined process {process.name.split("-")[-1]}, exitcode: {process.exitcode}') - - - delta = time.time() - strt - rate = rowcount/delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - - -def precopy(args): - client = bigquery.Client() - collections = get_collections_in_version(args) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/cc_{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/cc_{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN `idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - - copy_all_instances(args, query) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat('{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size==0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py deleted file mode 100644 index ea18038..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_nlst.py +++ /dev/null @@ -1,75 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs named in some collection from the dev bucket to some other bucket. -This is/was used, among other things, for the initial population of the idc_gch_staging -bucket from which Google Healthcare ingests our data. 
-""" - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage - - -from python_settings import settings -import settings as etl_settings - -# settings.configure(etl_settings) -# assert settings.configured -# import psycopg2 -# from psycopg2.extras import DictCursor -from gcs.copy_collections__obsolete.copy_collections_bq import precopy - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=4, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--src_bucket', default='idc_v5_nlst') - parser.add_argument('--dst_bucket', default='idc_dev') - parser.add_argument('--processes', default=96, help="Number of concurrent processes") - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default='/mnt/disks/idc-etl/logs/copy_collections__obsolete') - parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default='./logs/dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_collections__obsolete.log'.format(os.environ['PWD'])) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py deleted file mode 100644 index e2b76b7..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_psql.py +++ /dev/null @@ -1,263 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all included collections (not in the excluded_collections table) to another bucket. -This is generally used to populate a bucket that can then be imported -into a DICOM store. 
-""" - -import argparse -import os -import queue -import time -from subprocess import run, PIPE -import logging -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -from logging import INFO -import time -from datetime import timedelta -from multiprocessing import Process, Queue -from queue import Empty -from google.cloud import storage, bigquery - - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured -import psycopg2 -from psycopg2.extras import DictCursor - -TRIES=3 - -# Get all collections in some version that are not excluded -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.* - FROM `{args.src_project}.{args.bqdataset_name}.{args.bq_collection_table}` as c - LEFT JOIN `{args.src_project}.{args.bqdataset_name}.{args.bq_excluded_collections}` as ex - ON LOWER(c.collection_id) = LOWER(ex.tcia_api_collection_id) - WHERE ex.tcia_api_collection_id is NULL - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - return collection_ids - - -def copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row["uuid"]}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - blob_copy = src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.info(f'{blob_name}') - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, args.collection, - blob_name, retries, exc) - break - time.sleep(retries) - retries += 1 - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - print(f'p{args.id}: Worker starting: args: {args}') - - conn = psycopg2.connect(dbname=args.db, user=settings.CLOUD_USERNAME, port=settings.CLOUD_PORT, - password=settings.CLOUD_PASSWORD, host=settings.CLOUD_HOST) - with conn: - with conn.cursor(cursor_factory=DictCursor) as cur: - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket) - # output.put(n) - - -def copy_all_instances(args, cur, query): - - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/cc_{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - increment = args.batch - cur.execute(query) - rowcount = cur.rowcount - print(f'Copying collection {args.collection}; {rowcount} instances') - - strt = time.time() - num_processes = max(1,min(args.processes, int(rowcount/increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in 
range(num_processes)] - - # List of patients enqueued - enqueued_batches = [] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - q=0 - while True: - rows = cur.fetchmany(increment) - if len(rows) == 0: - break - task_queue.put((rows, n, rowcount)) - # task_queues[q%num_processes].put((rows, n, rowcount)) - enqueued_batches.append(n) - # print(f'Enqueue {n} on queue {q%num_processes}') - n += increment - q+=1 - print('Work distribution complete') - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - # print(f'Stop queue {i}') - - # # Wait until all work is complete - # q = 0 - # while not enqueued_batches == []: - # # Timeout if waiting too long - # try: - # results = done_queues[q%num_processes].get(timeout=1) - # enqueued_batches.remove(results) - # except queue.Empty: - # pass - # q += 1 - # - # Close all the queues - # for q in task_queues: - # q.close() - # for q in done_queues: - # q.close() - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - # if process.is_alive(): - # rootlogger.info('Collection: %s, terminating process %s',args.collection, process.name) - # process.kill() - # print(f'Joined process {process.name.split("-")[-1]}, exitcode: {process.exitcode}') - - - delta = time.time() - strt - rate = rowcount/delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - - -def precopy(args): - conn = psycopg2.connect(dbname=args.db, user=settings.CLOUD_USERNAME, port=settings.CLOUD_PORT, - password=settings.CLOUD_PASSWORD, host=settings.CLOUD_HOST) - - # Get excluded collections - - # collections = open(args.collection_list).read().splitlines() - - collections = get_collections_in_version(args) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - with conn: - if os.path.exists('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/cc_{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/cc_{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - # Query to get the instances in the collection - with conn.cursor(cursor_factory=DictCursor) as cur: - query = f""" - SELECT i.uuid - FROM collection as c - JOIN patient as p - ON c.collection_id = p.collection_id - JOIN study as st - ON p.submitter_case_id = st.submitter_case_id - JOIN series as se - ON st.study_instance_uid = se.study_instance_uid - JOIN instance as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - copy_all_instances(args, cur, query) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat('{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size==0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - diff --git a/gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py b/gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py deleted file mode 100644 index 1038682..0000000 --- a/gcs/obsolete/copy_collections__obsolete/copy_collections_v5.py +++ /dev/null @@ -1,66 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs named in some collections from the dev bucket to some other bucket. -This is/was used, among other things, for the initial population of the idc_gch_staging -bucket from which Google Healthcare ingests our data. 
-""" - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.copy_collections__obsolete.copy_collections_bq import precopy - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collection_table', default='collection') - parser.add_argument('--bq_excluded_collections', default='excluded_collections') - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc_dev_v{args.version}_dicomstore_staging') - parser.add_argument('--processes', default=96, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default='/mnt/disks/idc-etl/logs/copy_collections_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/copy_collections_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_collections__obsolete.log'.format(os.environ['PWD'])) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py b/gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py deleted file mode 100644 index dcadfbd..0000000 --- a/gcs/obsolete/copy_prestaging_to_staging/copy_prestaging_to_staging.py +++ /dev/null @@ -1,124 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy pre-staging buckets populated by ingestion to staging buckets: -# Ingestion copies data into prestaging buckets named by version, -# collection, and source e.g. idc_v8_path_tcga_brca. The data in these buckets must be -# copied to one of the idc-dev-etl staging buckets: -# idc-dev-open, idc-dev-cr, idc-dev-defaced, idc-dev-redacted, idc-dev-excluded. 
- -import os -import argparse -import logging -from logging import INFO - -from idc.models import Base, Collection, CR_Collections, Defaced_Collections, Excluded_Collections, Open_Collections, Redacted_Collections -import settings as etl_settings -from python_settings import settings -settings.configure(etl_settings) -from google.cloud import storage -from gcs.copy_bucket_mp.copy_bucket_mp import copy_all_instances - -from sqlalchemy import create_engine -from sqlalchemy_utils import register_composites -from sqlalchemy.orm import Session - - -def get_collection_groups(sess): - dev_staging_buckets = {} - pub_staging_buckets = {} - collections = sess.query(CR_Collections.tcia_api_collection_id, CR_Collections.dev_url, CR_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - collections = sess.query(Defaced_Collections.tcia_api_collection_id, Defaced_Collections.dev_url, Defaced_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - collections = sess.query(Excluded_Collections.tcia_api_collection_id) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = 'idc-dev-excluded' - collections = sess.query(Open_Collections.tcia_api_collection_id, Open_Collections.dev_url, Open_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - collections = sess.query(Redacted_Collections.tcia_api_collection_id, Redacted_Collections.dev_url, Redacted_Collections.pub_url) - for collection in collections: - dev_staging_buckets[collection.tcia_api_collection_id] = collection.dev_url - pub_staging_buckets[collection.tcia_api_collection_id] = collection.pub_url - return dev_staging_buckets, pub_staging_buckets - - -def copy_prestaging_to_staging(args, prestaging_bucket, staging_bucket): - print(f'Copying {prestaging_bucket} to {staging_bucket}') - args.src_bucket = prestaging_bucket - args.dst_bucket = staging_bucket - copy_all_instances(args) - - -def copy_dev_buckets(args): - sql_uri = f'postgresql+psycopg2://{settings.CLOUD_USERNAME}:{settings.CLOUD_PASSWORD}@{settings.CLOUD_HOST}:{settings.CLOUD_PORT}/{args.db}' - # sql_engine = create_engine(sql_uri, echo=True) # Use this to see the SQL being sent to PSQL - sql_engine = create_engine(sql_uri) - args.sql_uri = sql_uri # The subprocesses need this uri to create their own SQL engine - - # Create the tables if they do not already exist - Base.metadata.create_all(sql_engine) - - # Enable the underlying psycopg2 to deal with composites - conn = sql_engine.connect() - register_composites(conn) - - with Session(sql_engine) as sess: - dev_staging_buckets, pub_staging_buckets = get_collection_groups(sess) - pass - revised_collection_ids = sorted([row.collection_id for row in sess.query(Collection).filter(Collection.rev_idc_version == args.version).all()]) - for collection_id in revised_collection_ids: - prestaging_collection_id = collection_id.lower().replace('-','_').replace(' ','_') - prestaging_bucket = f"{args.prestaging_bucket_prefix}{prestaging_collection_id}" - staging_bucket = f'{args.staging_bucket_prefix}{dev_staging_buckets[collection_id]}' - 
copy_prestaging_to_staging(args, prestaging_bucket, staging_bucket) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=8, help='Version to work on') - parser.add_argument('--client', default=storage.Client()) - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v8', help='Database on which to operate') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--prestaging_bucket_prefix', default=f'idc_v{args.version}_', help='Copy instances here before forwarding to --staging_bucket') - parser.add_argument('--staging_bucket_prefix', default=f'', help='Copy instances here before forwarding to --staging_bucket') - parser.add_argument('--processes', default=8, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/copy_prestaging_to_staging_bucket_mp') - args = parser.parse_args() - args.id = 0 # Default process ID - - proglogger = logging.getLogger('root.prog') - prog_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/bucket.log') - progformatter = logging.Formatter('%(levelname)s:prog:%(message)s') - proglogger.addHandler(prog_fh) - prog_fh.setFormatter(progformatter) - proglogger.setLevel(INFO) - - successlogger = logging.getLogger('root.success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - copy_dev_buckets(args) \ No newline at end of file diff --git a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py b/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py deleted file mode 100644 index 545323a..0000000 --- a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy some set of BQ tables from one dataset to another. Used to populate public dataset -# Uses gsutil -m cp. Not continuable or performant. 
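# The bulk copy described above shells out to gsutil rather than using the GCS client
# library; a minimal sketch of that pattern, with placeholder bucket names, mirroring
# the copy_bucket() function below.
from subprocess import run

def copy_bucket_with_gsutil(src_bucket: str, dst_bucket: str) -> int:
    # 'gsutil -m cp' parallelizes the transfer but cannot resume a partial copy.
    result = run(['gsutil', '-m', 'cp', f'gs://{src_bucket}/*', f'gs://{dst_bucket}'])
    return result.returncode  # non-zero indicates the copy failed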
-import argparse -import sys -import os -import logging -from logging import INFO, DEBUG -from subprocess import run - -from idc.models import Version, Collection, Patient, Study, Series, Instance, Retired, WSI_metadata, instance_source -from sqlalchemy import select,delete -from sqlalchemy.orm import Session - -from python_settings import settings -import settings as etl_settings -settings.configure(etl_settings) - -from sqlalchemy import create_engine -from sqlalchemy_utils import register_composites - -rootlogger = logging.getLogger('root') -errlogger = logging.getLogger('root.err') - - -def copy_bucket(args, src_bucket): - print("Copying {}".format(src_bucket), flush=True) - try: - result = run(['gsutil', '-m', 'cp', f'gs://{src_bucket}/*', - f'gs://{args.dst_bucket}']) - print(" {} copied, results: {}".format(src_bucket, result), flush=True) - if result.returncode: - errlogger.error('Copy %s failed: %s', src_bucket, result.stderr) - return {"bucket": src_bucket, "status": -1} - rootlogger.info('%s',src_bucket) - return 0 - except: - errlogger.error("Error copying {}: {},{},{}".format(src_bucket, sys.exc_info()[0],sys.exc_info()[1],sys.exc_info()[2]), file=sys.stdout, flush=True) - raise - - -def copy_buckets(args): - # rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler('{}/logs/copy_bucket_v{}_log.log'.format(os.environ['PWD'], args.version)) - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - # errlogger = logging.getLogger('root.err') - err_fh = logging.FileHandler('{}/logs/copy_bucket_v{}_err.log'.format(os.environ['PWD'], args.version)) - errformatter = logging.Formatter('{%(pathname)s:%(lineno)d} %(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - rootlogger.debug('Args: %s', args) - - sql_uri = f'postgresql+psycopg2://{settings.CLOUD_USERNAME}:{settings.CLOUD_PASSWORD}@{settings.CLOUD_HOST}:{settings.CLOUD_PORT}/{args.db}' - # sql_engine = create_engine(sql_uri, echo=True) - sql_engine = create_engine(sql_uri) - args.sql_engine = sql_engine - - conn = sql_engine.connect() - register_composites(conn) - - dones = open('{}/logs/copy_bucket_v{}_log.log'.format(os.environ['PWD'], args.version)).read().splitlines() - - # Add a new Version with idc_version_number args.version, if it does not already exist - with Session(sql_engine) as sess: - idc_collections = [c.collection_id for c in sess.query(Collection).\ - filter(Collection.rev_idc_version==5 and Collection.done == True ).order_by('collection_id')] - for c in idc_collections: - src_bucket = f"{args.src_bucket_prefix}{c.lower().replace('-', '_').replace(' ', '_')}" - if not c in dones: - result = copy_bucket(args, src_bucket) - - - - diff --git a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py b/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py deleted file mode 100644 index 990c5f7..0000000 --- a/gcs/obsolete/copy_staging_buckets__obsolete/copy_staging_buckets_v5.py +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copy some set of BQ tables from one dataset to another. Used to populate public dataset -import argparse -import sys -from gcs.copy_staging_buckets__obsolete.copy_staging_buckets import copy_buckets -import logging - -if __name__ == '__main__': - - parser =argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='IDC version for which to build the table') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}', help='Database to access') - parser.add_argument('--src_bucket_prefix', default=f'idc_v{args.version}_') - parser.add_argument('--dst_bucket', default=f'idc_dev', help='Destination BQ dataset') - - args = parser.parse_args() - print("{}".format(args), file=sys.stdout) - - rootlogger = logging.getLogger('root') - errlogger = logging.getLogger('root.err') - - copy_buckets(args) \ No newline at end of file diff --git a/gcs/obsolete/depopulate_collections_from_bucket/README.md b/gcs/obsolete/depopulate_collections_from_bucket/README.md deleted file mode 100644 index d7eaf69..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/README.md +++ /dev/null @@ -1,2 +0,0 @@ -The scripts in this directory were used to depopulate to-be-redacted collextions from various buckets. -The base script, depopulated_collections_from_bucket.py, might be useful at some future time, but the calling scripts are probably not useful. diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py deleted file mode 100644 index c1985b2..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_collections_from_bucket.py +++ /dev/null @@ -1,346 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Note: This script and scripts which call it should be restructured such -that the calling script passes in the list of collections to be copied. -""" - -""" -NOTE: The get_collections_in_version function is hardcoded for v5. -Also, probably shoud remove the test for 'where v1=False and v2=True' -in the SQL. That was a special case that should have been parameterized -""" - -""" -General purpose multiprocessing routine to the instances of some set of -collections from one bucket to another. - -A parameter, args.retired, controls whether retired instances (as listed -in the retired table) are also deleted. 
-In general args.retired should be False when depopulating a bucket that will -imported into a DICOM store, where only current instances are wanted. -args.retired should be True when populating a bucket that will be public -(even if having Limited access) as well as the dev counterparts of these -buckets. - -""" - -import argparse -import os -import logging - -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -import time -from multiprocessing import Process, Queue -from google.cloud import storage, bigquery -from google.cloud.exceptions import NotFound - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured - -TRIES = 3 -""" -args paramaters -bqdataset_name: bq datas et from which to access tables -bq_collections_table': BQ table listing group of collections to be populate -retired: Copy retired instances in collection if True -src_bucket: Bucket from which to delete blobs -processes: Number of concurrent processes -batch: Size of batch of blobs to be copied -src_project: Project of destination bucket -dst_project: Project of source bucket -log_dir: Directory in which some log files are kept. -dones: File listing collections that have been copied -""" - - -def get_collections_in_version(args): - client = bigquery.Client() - query = "" - if 'excluded_tables' in args.__dict__ and args.excluded_tables: - query = f""" - WITH ex as - ({query} - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{args.excluded_tables[0]}` - """ - for table in args.excluded_tables[1:]: - query = f""" - {query} - UNION ALL - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{table}` - """ - query = f""" - {query}) - """ - query = f""" - {query} - SELECT c.* - FROM `idc-dev-etl.idc_v5.collection` AS c - LEFT JOIN ex - ON c.collection_id =ex.tcia_api_collection_id - where ex.tcia_api_collection_id is NULL - - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - else: - query = f""" - SELECT c.* - FROM `idc-dev-etl.idc_v5.{args.bq_collections_table}` AS c - where v1=False and v2=True - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - -def delete_instances(args, rows, n, rowcount, done_instances, src_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - src_bucket.delete_blob(blob_name) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.info(f'{blob_name}') - break - except NotFound: - errlogger.error('p%s %s: %s: Failed, not found %s\n', args.id, - index, args.collection, - blob_name) - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: Failed %s\n: %s', args.id, - index, args.collection, - blob_name, exc) - break - retries += 1 - - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - # 
print(f'p{args.id}: Worker starting: args: {args}') - - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - delete_instances(args, rows, n, rowcount, done_instances, src_bucket) - # output.put(n) - - -def delete_all_instances(args): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - # We first delete the instances in the current IDC version, - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN `idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - prowcount = destination.num_rows - print(f'Copying collection {args.collection}; primary {prowcount} instances') - - num_processes = max(1, min(args.processes, int(prowcount / increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, prowcount)) - n += increment - print('Primary work distribution complete') - - # Next we delete retired instances - # Query to get the instances from the retired table - if args.retired: - query = f""" - SELECT r.instance_uuid - FROM `idc-dev-etl.idc_v{args.version}.retired` as r - WHERE r.collection_id = '{args.collection}' - ORDER by r.instance_uuid - """ - - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. 
If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - rrowcount = destination.num_rows - if rrowcount: - print(f'Copying retired {args.collection}; primary {rrowcount} instances') - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.instance_uuid for r in - client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, rrowcount)) - n += increment - print('Retired work distribution complete') - else: - print(f'No retired instances in collection {args.collection}') - else: - rrowcount = 0 - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (prowcount + rrowcount) / delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - -def predelete(args, collections=None): - client = bigquery.Client() - if not collections: - collections = get_collections_in_version(args) - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - st = os.stat('{}'.format(args.log_dir)) - os.chmod('{}'.format(args.log_dir), st.st_mode | 0o222) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - delete_all_instances(args) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat( - '{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size == 0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py deleted file mode 100644 index 19401c6..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.dev.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--processes', default=50, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py deleted file mode 100644 index d0355e8..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_defaced_collections_from_open_bucket.v5.pdp.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-open-pdp-staging') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-pdp-staging') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py deleted file mode 100644 index bd5d633..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_dicomstore_staging_bucket.v5.dev.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Delete collections from the idc-dev-v5-dicomstore-staging bucket. -This is/was used for generating a bucket for dicom store import tocreate a dicomstore -without redacted collections. -Buckets for import into a dicomstore do not have retired instances, thus retired -instances are not deleted. 
-""" - - -import argparse -import os -from google.cloud import bigquery -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.* - FROM `idc-dev-etl.idc_v5.{args.bq_collections_table}` AS c - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - - -if __name__ == '__main__': - group = 'redacted' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=False, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-v5-dicomstore-staging', help='Bucket from which to delete instances') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_dicomstore_staging_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_dicomstore_staging_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - collections = get_collections_in_version(args) - - predelete(args, collections) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py deleted file mode 100644 index 1f03a9b..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.dev.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. 
-This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" -""" -Note: This script should be restructured such to pass in the list of collections to be copied. -""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'redacted' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py b/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py deleted file mode 100644 index 5c51902..0000000 --- a/gcs/obsolete/depopulate_collections_from_bucket/depopulate_redacted_collections_from_open_bucket.v5.pdp.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -from subprocess import run, PIPE -import logging -from logging import INFO - -from gcs.depopulate_collections_from_bucket.depopulate_collections_from_bucket import predelete - - -if __name__ == '__main__': - group = 'redacted' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-open-pdp-staging') - parser.add_argument('--processes', default=96, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-pdp-staging') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_collections_from_bucket') - args = parser.parse_args() - parser.add_argument('--dones', default=f'./logs/depopulate_{group}_bucket_from_{args.src_bucket}_v{args.version}_dones.txt') - args = parser.parse_args() - - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - predelete(args) - diff --git a/gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py b/gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py deleted file mode 100644 index 856244c..0000000 --- a/gcs/obsolete/depopulate_version_from_bucket/depopulate_version_from_bucket.v5.dev.py +++ /dev/null @@ -1,89 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import argparse -import os -from google.cloud import bigquery -import logging -from logging import INFO, DEBUG - -from gcs.depopulate_version_from_bucket.depopulate_versions_from_bucket import predelete - -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT o.tcia_api_collection_id as tcia_api_collection_id - FROM `idc-dev-etl.idc_v5.open_collections` AS o --- ORDER BY o.tcia_api_collection_id - UNION ALL - SELECT c.tcia_api_collection_id - FROM `idc-dev-etl.idc_v5.cr_collections` AS c --- ORDER BY c.tcia_api_collection_id - UNION ALL - SELECT d.tcia_api_collection_id - FROM `idc-dev-etl.idc_v5.defaced_collections` AS d --- UNION ALL --- SELECT d.tcia_api_collection_id --- FROM `idc-dev-etl.idc_v5.redacted_collections` AS d - ORDER BY tcia_api_collection_id --- ORDER BY d.tcia_api_collection_id - - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - - -if __name__ == '__main__': - bucket = 'idc-open' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bucket', default=f'{bucket}', help='Bucket from which to delete instances') - parser.add_argument('--processes', default=8, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--project', default='canceridc-data') - parser.add_argument('--deleted_version', default=3, help='Version whose instances are to be deleted') - args = parser.parse_args() - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/depopulate_version_{args.deleted_version}_from_{bucket}') - parser.add_argument('--dones', default=f'./logs/depopulate_v{args.deleted_version}_dones.txt') - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/log.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - proglogger = logging.getLogger('root.prog') - prog_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/prog.log') - progformatter = logging.Formatter('%(levelname)s:prog:%(message)s') - proglogger.addHandler(prog_fh) - prog_fh.setFormatter(progformatter) - proglogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(DEBUG) - - errlogger = logging.getLogger('root.err') - - collections = get_collections_in_version(args) - - predelete(args, collections) - diff --git a/gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py b/gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py deleted file mode 100644 index 41a4e9a..0000000 --- a/gcs/obsolete/depopulate_version_from_bucket/depopulate_versions_from_bucket.py +++ /dev/null @@ -1,243 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -""" -General purpose multiprocessing routine to delete all instances added -in some version from a specified bucker -""" - -import argparse -import os -import logging - -rootlogger = logging.getLogger('root') -proglogger = logging.getLogger('root.prog') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -import time -from multiprocessing import Process, Queue -from google.cloud import storage, bigquery -from google.cloud.exceptions import NotFound - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured - -TRIES = 3 -""" -args paramaters -bqdataset_name: bq datas et from which to access tables -bq_collections_table': BQ table listing group of collections to be populate -retired: Copy retired instances in collection if True -src_bucket: Bucket from which to delete blobs -processes: Number of concurrent processes -batch: Size of batch of blobs to be copied -src_project: Project of destination bucket -dst_project: Project of source bucket -log_dir: Directory in which some log files are kept. -dones: File listing collections that have been copied -""" - -def delete_instances(args, rows, n, rowcount, done_instances, bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - retries = 0 - while True: - try: - bucket.delete_blob(blob_name) - # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - successlogger.debug(f'{blob_name}') - break - except NotFound: - errlogger.error('p%s %s: %s: Failed, not found %s\n', args.id, - index, args.collection, - blob_name) - break - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: Failed %s\n: %s', args.id, - index, args.collection, - blob_name, exc) - break - retries += 1 - - if n % args.batch == 0: - proglogger.info('p%s %s: %s', args.id, index, args.collection) - # print('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - proglogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - # print(f'p{args.id}: Worker starting: args: {args}') - - client = storage.Client() - bucket = client.bucket(args.bucket, user_project=args.project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - delete_instances(args, rows, n, rowcount, done_instances, bucket) - # output.put(n) - - -def delete_all_instances(args): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - # We first delete the instances in the current IDC version, - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN 
`idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - AND i.rev_idc_version = {args.deleted_version} - ORDER by i.uuid - """ - args.id = 0 - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. - destination = client.get_table(destination) - - prowcount = destination.num_rows - if prowcount: - print(f'Deleting collection {args.collection}; primary {prowcount} instances') - - num_processes = max(1, min(args.processes, int(prowcount / increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, prowcount)) - n += increment - print('Primary work distribution complete') - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = prowcount / delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - else: - print(f'Collection {args.collection} has no new instances in version {args.deleted_version}') - with open(f'{args.dones}', 'a') as f: - f.write(f'{args.collection}\n') - # - # with open(f'{args.log_dir}/{args.collection}_success.log', 'w') as f: - # f.write(f'{args.collection}\n') - - -def predelete(args, collections): - client = bigquery.Client() - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - st = os.stat('{}'.format(args.log_dir)) - os.chmod('{}'.format(args.log_dir), st.st_mode | 0o222) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. 
- for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - delete_all_instances(args) - - - - - diff --git a/gcs/obsolete/empty_and_delete_bucket.py b/gcs/obsolete/empty_and_delete_bucket.py deleted file mode 100644 index 7be83e3..0000000 --- a/gcs/obsolete/empty_and_delete_bucket.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Delete all blobs from some bucket, delete the bucket -# empty_bucket_mp is faster. - -from google.cloud import storage -from subprocess import run, PIPE -from google.api_core.exceptions import Conflict -import sys -import argparse -from utilities.gcs_helpers import list_buckets - -def empty_and_delete_bucket(args): - try: - result = run(['gsutil', '-m', '-u', f'{args.project}', 'rm', '-r', f'gs://{args.bucket}']) - print(" {} emptied, results: {}".format(args.bucket, result), flush=True) - if result.returncode: - print('Copy {} failed: {}'.format(result.stderr), flush=True) - return {"bucket": args.src_bucket_name, "status": -1} - return {"bucket": args.bucket, "status": 0} - except Exception as exc: - print("Error in deleting {}: {}".format(args.bucket, exc)) - # raise - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--project', default='idc-dev-etl') - parser.add_argument('--bucket', default='idc-dev-v5-dicomstore-staging') - args = parser.parse_args() - print("{}".format(args), file=sys.stdout) - - if args.bucket == 'idc-open': - print("Not allowed") - exit - - empty_and_delete_bucket(args) diff --git a/gcs/obsolete/empty_bucket.py b/gcs/obsolete/empty_bucket.py deleted file mode 100644 index 00bb878..0000000 --- a/gcs/obsolete/empty_bucket.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Delete all blobs from some bucket -# empty_bucket_mp.py is faster - -from subprocess import run, PIPE -import sys -import argparse - -def empty_bucket(args): - try: - result = run(['gsutil', '-m', '-u', f'{args.project}', 'rm', f'gs://{args.bucket}/*']) - print(" {} emptied, results: {}".format(args.bucket, result), flush=True) - if result.returncode: - print('Copy {} failed: {}'.format(result.stderr), flush=True) - return {"bucket": args.src_bucket_name, "status": -1} - return {"bucket": args.bucket, "status": 0} - except Exception as exc: - print("Error in deleting {}: {}".format(args.bucket, exc)) - # raise - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--project', default='canceridc-data') - parser.add_argument('--bucket', default='idc-nlst-open') - args = parser.parse_args() - print("{}".format(args), file=sys.stdout) - - if args.bucket == 'idc-open': - print("Not allowed") - exit - - empty_bucket(args) diff --git a/gcs/obsolete/empty_idc_dev_etl_v2_buckets.py b/gcs/obsolete/empty_idc_dev_etl_v2_buckets.py deleted file mode 100644 index ce0955b..0000000 --- a/gcs/obsolete/empty_idc_dev_etl_v2_buckets.py +++ /dev/null @@ -1,107 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# One time script to delete the idc-tcia-v2-xxx buckets in idc-dev-etl. -# Could be adapted tp remove idc-tcia- buckets in canceridc-data. 
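# The deleted empty_bucket.py / empty_and_delete_bucket.py above shell out to gsutil.
# Note that their error path calls 'Copy {} failed: {}'.format(result.stderr) with one
# argument for two placeholders, returns args.src_bucket_name (never defined), and the
# bare `exit` in the main guard does not actually terminate the script. A corrected
# sketch of the same gsutil pattern, with hypothetical project/bucket values:

import sys
from subprocess import run

def empty_bucket(project, bucket):
    # Requester-pays ('-u') bulk delete of every object in the bucket.
    result = run(['gsutil', '-m', '-u', project, 'rm', f'gs://{bucket}/*'])
    if result.returncode:
        print(f'Emptying {bucket} failed with return code {result.returncode}', flush=True)
        return {'bucket': bucket, 'status': -1}
    print(f'{bucket} emptied', flush=True)
    return {'bucket': bucket, 'status': 0}

if __name__ == '__main__':
    project, bucket = 'idc-dev-etl', 'some-staging-bucket'   # hypothetical values
    if bucket == 'idc-open':
        print('Not allowed')
        sys.exit(1)        # exit() must be called; a bare `exit` is a no-op
    empty_bucket(project, bucket)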
-import argparse -import os -from google.cloud import storage, bigquery -import logging -from logging import INFO -from gcs.empty_bucket_mp.empty_bucket_mp import pre_delete - - -def get_collections_in_version(args): - client = bigquery.Client() - query = f""" - SELECT c.tcia_api_collection_id - FROM `idc-dev-etl.idc_v2.collection` AS c - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--bucket', default='idc-dev-v5-dicomstore-staging') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/empty_idc_dev_etl_v2_buckets') - parser.add_argument('--dones', default=f'{os.environ["PWD"]}/logs/dones.log') - - args = parser.parse_args() - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/bucket.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - dones = open(args.dones).read().splitlines() - - client = storage.Client(project=args.project) - - collections = [collection.lower().replace(' ','-').replace('_','-') for collection in get_collections_in_version(args)] - - # for collection in collections: - # found=False - # if client.bucket(f'idc-tcia-1-{collection}', user_project=args.project).exists(): - # print(f'{collection:32}: idc-tcia-1-{collection}') - # found=True - # if client.bucket(f'idc-tcia-2-{collection}', user_project=args.project).exists(): - # print(f'{collection:32}: idc-tcia-2-{collection}') - # found=True - # if not found: - # print(f'{collection}: ***No bucket***') - # - - - for collection in collections: - args.bucket = f"idc-tcia-2-{collection}" - if not args.bucket in dones: - bucket = client.bucket(args.bucket, user_project=args.project) - tried = 0 - tries = 2 - while tried < tries: - try: - if bucket.exists(): - pre_delete(args) - bucket.delete() - rootlogger.info(f'Deleted bucket %s',args.bucket) - break - else: - break - except Exception as exc: - print(f'p0: Delete bucket failed: {exc}') - tried += 1 - if tried == tries: - errlogger.error(f'Failed to delete bucket %s', args.bucket) - with open(args.dones, 'a') as f: - f.write(f'{args.bucket}\n') - - diff --git a/gcs/move_collection/copy_collection.py b/gcs/obsolete/move_collection/copy_collection.py similarity index 100% rename from gcs/move_collection/copy_collection.py rename to gcs/obsolete/move_collection/copy_collection.py diff --git a/gcs/move_collection/delete_collection.py b/gcs/obsolete/move_collection/delete_collection.py similarity index 100% rename from gcs/move_collection/delete_collection.py rename to gcs/obsolete/move_collection/delete_collection.py diff --git a/gcs/move_collection/move_apollo/copy_apollo_collections.py b/gcs/obsolete/move_collection/move_apollo/copy_apollo_collections.py similarity index 100% rename from 
gcs/move_collection/move_apollo/copy_apollo_collections.py rename to gcs/obsolete/move_collection/move_apollo/copy_apollo_collections.py diff --git a/gcs/move_collection/move_apollo/delete_apollo_collections.py b/gcs/obsolete/move_collection/move_apollo/delete_apollo_collections.py similarity index 100% rename from gcs/move_collection/move_apollo/delete_apollo_collections.py rename to gcs/obsolete/move_collection/move_apollo/delete_apollo_collections.py diff --git a/gcs/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py b/gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py similarity index 100% rename from gcs/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py rename to gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/copy_cptac_cm_and_lscc_collection.py diff --git a/gcs/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py b/gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py similarity index 100% rename from gcs/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py rename to gcs/obsolete/move_collection/move_cptac_cm_and_cptac_lscc/delete_collection.py diff --git a/gcs/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py b/gcs/obsolete/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py similarity index 100% rename from gcs/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py rename to gcs/obsolete/move_collection/move_prostate_diagnosis/copy_prostate_diagnosis_collection.py diff --git a/gcs/obsolete/populate_buckets_with_collections/README.md b/gcs/obsolete/populate_buckets_with_collections/README.md deleted file mode 100644 index 760ced5..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/README.md +++ /dev/null @@ -1,2 +0,0 @@ -These scripts copied collections from idc_dev/idc_open to various buckets in support of redaction. -Essentially single-use. \ No newline at end of file diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py deleted file mode 100644 index 85287ef..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_collections.py +++ /dev/null @@ -1,361 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -NOTE: This script and scripts which call it should be restructured such -that the calling script passes in the list of collections to be copied. -""" - -""" -NOTE: The get_collections_in_version function is hardcoded for v5. -Also, probably shoud remove the test for 'where v1=False and v2=True' -in the SQL. That was a special case that should have been parameterized -""" - -""" -General purpose multiprocessing routine to copy the instances of some set of -collections from one bucket to another. 
- -A parameter, args.retired, controls whether retired instances (as listed -in the retired table) are copied. -In general args.retired should be False when populating a bucket that will -imported into a DICOM store, where (only current instances are wanted. -args.retired should be Tree when populating a bucket that will be public -(even if having Limited access) as well as the dev counterparts of these -buckets. - -The inital use of this routine was in splitting idc_dev into multiple -buckets in support of Google hosting and defacing. - -It can also be used to copy data among those buckets as needed, for example, -when the set of collections to be defaced/redacted changes. Note that, for -this purpose the module does not delete the source blob. That should be done -separately. -""" - -import argparse -import os -import logging - -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -import time -from multiprocessing import Process, Queue -from google.cloud import storage, bigquery - -from python_settings import settings -import settings as etl_settings - -settings.configure(etl_settings) -assert settings.configured - -TRIES = 3 -""" -args paramaters -bqdataset_name: bq datas et from which to access tables -bq_collections_table': BQ table listing group of collections to be populate -retired: Copy retired instances in collection if True -src_bucket: Bucket from which to copy blobs -dst_bucket, Bucket to which to copy blobs -processes: Number of concurrent processes -batch: Size of batch of blobs to be copied -src_project: Project of destination bucket -dst_project: Project of source bucket -log_dir: Directory in which some log files are kept. -dones: File listing collections that have been copied -""" - - -def get_collections_in_version(args): - client = bigquery.Client() - query = "" - if 'excluded_tables' in args.__dict__ and args.excluded_tables: - query = f""" - WITH ex as - ({query} - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{args.excluded_tables[0]}` - """ - for table in args.excluded_tables[1:]: - query = f""" - {query} - UNION ALL - SELECT tcia_api_collection_id - FROM `{args.src_project}.{args.bqdataset_name}.{table}` - """ - query = f""" - {query}) - """ - query = f""" - {query} - SELECT c.* - FROM `idc-dev-etl.idc_v5.collection` AS c - LEFT JOIN ex - ON c.collection_id =ex.tcia_api_collection_id - where ex.tcia_api_collection_id is NULL - - ORDER BY c.collection_id - """ - result = client.query(query).result() - collection_ids = [collection['collection_id'] for collection in result] - else: - query = f""" - SELECT c.* - FROM `idc-dev-etl.idc_v5.{args.bq_collections_table}` AS c - where v1=False and v2=True - ORDER BY c.tcia_api_collection_id - """ - result = client.query(query).result() - collection_ids = [collection['tcia_api_collection_id'] for collection in result] - return collection_ids - - -def copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket): - for row in rows: - index = f'{n}/{rowcount}' - blob_name = f'{row}.dcm' - if not blob_name in done_instances: - src_blob = src_bucket.blob(blob_name) - dst_blob = dst_bucket.blob(blob_name) - retries = 0 - while True: - try: - rewrite_token = False - while True: - rewrite_token, bytes_rewritten, bytes_to_rewrite = dst_blob.rewrite( - src_blob, token=rewrite_token - ) - if not rewrite_token: - break - successlogger.info(f'{blob_name}') - break - - # blob_copy = 
src_bucket.copy_blob(src_bucket.blob(blob_name), dst_bucket) - # # rootlogger.info('%s %s: %s: copy succeeded %s', args.id, index, args.collection, blob_name) - # successlogger.info(f'{blob_name}') - # break - - except Exception as exc: - if retries == TRIES: - errlogger.error('p%s %s: %s: copy failed %s\n, retry %s; %s', args.id, - index, args.collection, - blob_name, retries, exc) - break - time.sleep(retries) - retries += 1 - if n % args.batch == 0: - rootlogger.info('p%s %s: %s', args.id, index, args.collection) - else: - if n % args.batch == 0: - rootlogger.info('p%s %s: %s: skipping blob %s ', args.id, index, args.collection, blob_name) - n += 1 - - -def worker(input, args, done_instances): - # rootlogger.info('p%s: Worker starting: args: %s', args.id, args ) - # print(f'p{args.id}: Worker starting: args: {args}') - - client = storage.Client() - src_bucket = client.bucket(args.src_bucket, user_project=args.src_project) - dst_bucket = client.bucket(args.dst_bucket, user_project=args.dst_project) - - for rows, n, rowcount in iter(input.get, 'STOP'): - copy_instances(args, rows, n, rowcount, done_instances, src_bucket, dst_bucket) - # output.put(n) - - -def copy_all_instances(args): - client = bigquery.Client() - try: - # Create a set of previously copied blobs - done_instances = set(open(f'{args.log_dir}/{args.collection}_success.log').read().splitlines()) - except: - done_instances = [] - - # We first copy the instances in the current IDC version, - - # Query to get the instances in the collection - query = f""" - SELECT i.uuid - FROM `idc-dev-etl.idc_v{args.version}.collection` as c - JOIN `idc-dev-etl.idc_v{args.version}.patient` as p - ON c.collection_id = p.collection_id - JOIN `idc-dev-etl.idc_v{args.version}.study` as st - ON p.submitter_case_id = st.submitter_case_id - JOIN `idc-dev-etl.idc_v{args.version}.series` as se - ON st.study_instance_uid = se.study_instance_uid - JOIN `idc-dev-etl.idc_v{args.version}.instance` as i - ON se.series_instance_uid = i.series_instance_uid - WHERE c.collection_id = '{args.collection}' - ORDER by i.uuid - """ - args.id = 0 - - increment = args.batch - # cur.execute(query) - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. 
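# The rewrite-token loop in copy_instances() above is the google-cloud-storage idiom for
# server-side copies: Blob.rewrite() returns a continuation token until the copy is
# complete, so it handles objects of any size (the commented-out copy_blob call is the
# simpler alternative). A condensed sketch of just that loop, with hypothetical buckets:

from google.cloud import storage

def copy_blob(client, src_bucket_name, dst_bucket_name, blob_name):
    src_blob = client.bucket(src_bucket_name).blob(blob_name)
    dst_blob = client.bucket(dst_bucket_name).blob(blob_name)
    token = None
    while True:
        # Each call copies another chunk server-side; a falsy token means the copy is done.
        token, bytes_rewritten, total_bytes = dst_blob.rewrite(src_blob, token=token)
        if not token:
            return bytes_rewritten, total_bytes

# Example (hypothetical names):
# copy_blob(storage.Client(), 'idc-dev-open', 'idc-dev-defaced', 'some-uuid.dcm')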
- destination = client.get_table(destination) - - prowcount = destination.num_rows - print(f'Copying collection {args.collection}; primary {prowcount} instances') - - num_processes = max(1, min(args.processes, int(prowcount / increment))) - processes = [] - # Create a pair of queue for each process - - task_queue = Queue() - - # task_queues = [Queue() for p in range(num_processes)] - # done_queues = [Queue() for p in range(num_processes)] - - strt = time.time() - - # Start worker processes - for process in range(num_processes): - args.id = process + 1 - processes.append( - Process(group=None, target=worker, args=(task_queue, args, done_instances))) - # processes.append( - # Process(group=None, target=worker, args=(task_queues[process], args, done_instances))) - # print(f'Started process {args.id}: {processes[-1]}') - processes[-1].start() - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.uuid for r in client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, prowcount)) - n += increment - print('Primary work distribution complete') - - # Next we copy retired instances - # Query to get the instances from the retired table - if args.retired: - query = f""" - SELECT r.instance_uuid - FROM `idc-dev-etl.idc_v{args.version}.retired` as r - WHERE r.collection_id = '{args.collection}' - ORDER by r.instance_uuid - """ - - query_job = client.query((query)) - query_job.result() - # Get the destination table for the query results. - # - # All queries write to a destination table. If a destination table is not - # specified, the BigQuery populates it with a reference to a temporary - # anonymous table after the query completes. - destination = query_job.destination - - # Get the schema (and other properties) for the destination table. - # - # A schema is useful for converting from BigQuery types to Python types. 
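# The comments above describe how these scripts page through query results: every query
# writes to a destination table (an anonymous temporary table for plain SELECTs), which
# can then be read in batches with list_rows(). A compact sketch of that pattern, using
# an illustrative query modeled on the deleted code:

from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query(
    "SELECT uuid FROM `idc-dev-etl.idc_v5.instance` LIMIT 1000")   # illustrative query
query_job.result()                                 # wait for the query to finish
table = client.get_table(query_job.destination)    # temporary anonymous result table
print(f'{table.num_rows} rows')

batch = 100
for start in range(0, table.num_rows, batch):
    rows = [r.uuid for r in client.list_rows(table, max_results=batch, start_index=start)]
    # each `rows` batch would be put on the task queue here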
- destination = client.get_table(destination) - - rrowcount = destination.num_rows - if rrowcount: - print(f'Copying retired {args.collection}; primary {rrowcount} instances') - - # Distribute the work across the task_queues - n = 1 - while True: - # rows = cur.fetchmany(increment) - rows = [r.instance_uuid for r in - client.list_rows(destination, max_results=increment, start_index=n - 1)] - if len(rows) == 0: - break - task_queue.put((rows, n, rrowcount)) - n += increment - print('Retired work distribution complete') - else: - print(f'No retired instances in collection {args.collection}') - else: - rrowcount = 0 - - # Tell child processes to stop - for i in range(num_processes): - task_queue.put('STOP') - - # Wait for process to terminate - for process in processes: - print(f'Joining process: {process.name}, {process.is_alive()}') - process.join() - - delta = time.time() - strt - rate = (prowcount + rrowcount) / delta - print(f'Completed collection {args.collection}, {rate} instances/sec, {num_processes} processes') - - -def precopy(args): - client = bigquery.Client() - collections = get_collections_in_version(args) - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - st = os.stat('{}'.format(args.log_dir)) - os.chmod('{}'.format(args.log_dir), st.st_mode | 0o222) - - try: - dones = open(args.dones).read().splitlines() - except: - dones = [] - for collection in collections: - if not collection in dones: - args.collection = collection - if os.path.exists('{}/logs/{}_error.log'.format(args.log_dir, collection)): - os.remove('{}/logs/{}_error.log'.format(args.log_dir, collection)) - - # Change logging file. File name includes collection ID. - for hdlr in successlogger.handlers[:]: - successlogger.removeHandler(hdlr) - success_fh = logging.FileHandler('{}/{}_success.log'.format(args.log_dir, collection)) - successlogger.addHandler(success_fh) - successformatter = logging.Formatter('%(message)s') - success_fh.setFormatter(successformatter) - - for hdlr in errlogger.handlers[:]: - errlogger.removeHandler(hdlr) - err_fh = logging.FileHandler('{}/{}_error.log'.format(args.log_dir, collection)) - errformatter = logging.Formatter('%(levelname)s:err:%(message)s') - errlogger.addHandler(err_fh) - err_fh.setFormatter(errformatter) - - copy_all_instances(args) - - if not os.path.isfile('{}/logs/cc_{}_error.log'.format(args.log_dir, collection)) or os.stat( - '{}/logs/cc_{}_error.log'.format(os.environ['PWD'], collection)).st_size == 0: - # If no errors, then we are done with this collection - with open(args.dones, 'a') as f: - f.write(f'{collection}\n') - - - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py deleted file mode 100644 index cc3be8a..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_cr_collections.v5.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in commercial restricted collections from the dev bucket to idc-dev-cr. -This is/was used, among other things, for the initial population of the idc-dev-cr -bucket. -""" -""" -Note: This script should be restructured such to pass in the list of collections to be copied. -""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'cr' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py deleted file mode 100644 index f50e253..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.dev.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. 
-""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. -""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py deleted file mode 100644 index c495cb2..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_defaced_collections.v5.prod.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'defaced' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default=f'idc-dev-{group}') - parser.add_argument('--dst_bucket', default=f'idc-open-idc1') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='canceridc-data') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py deleted file mode 100644 index 1447c5e..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_excluded_collections.v5.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in excluded collections from the dev bucket to idc-dev-excluded. -This is/was used, among other things, for the initial population of the idc-dev-excluded -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'excluded' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=64, help="Number of concurrent processes") - parser.add_argument('--batch', default=64, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py deleted file mode 100644 index 2f81032..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_open_collections.v5.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in collections from the dev bucket to idc-dev-open. -This was used, among other things, for the initial population of the idc-dev-open -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'open' - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=5, help='Next version to generate') - args = parser.parse_args() - parser.add_argument('--db', default=f'idc_v{args.version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{args.version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc_dev') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--excluded_tables', default=[ - 'excluded_collections', - 'cr_collections', - 'redacted_collections', - 'defaced_collections' - ], help="Tables of lists of collections in other buckets to be excluded" - ) - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{args.version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py deleted file mode 100644 index bad5e32..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.dev.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'redacted' - version = 5 - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=f'{version}', help='Next version to generate') - parser.add_argument('--db', default=f'idc_v{version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default='idc-dev-open') - parser.add_argument('--dst_bucket', default=f'idc-dev-{group}') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='idc-dev-etl') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py b/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py deleted file mode 100644 index 52c2def..0000000 --- a/gcs/obsolete/populate_buckets_with_collections/populate_bucket_with_redacted_collections.v5.prod.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Copy all blobs in redacted collections from the dev bucket to idc-dev-redacted. -This is/was used, among other things, for the initial population of the idc-dev-redacted -bucket. -""" - -""" -Note: This script should be restructured such to pass in the list of collections to be copied. 
-""" - - -import argparse -import os -import logging -from logging import INFO - -from gcs.obsolete.populate_buckets_with_collections.populate_bucket_with_collections import precopy - - -if __name__ == '__main__': - group = 'redacted' - version = 5 - parser = argparse.ArgumentParser() - parser.add_argument('--version', default=f'{version}', help='Next version to generate') - parser.add_argument('--db', default=f'idc_v{version}') - parser.add_argument('--bqdataset_name', default=f'idc_v{version}') - parser.add_argument('--bq_collections_table', default=f'{group}_collections', help='Table listing collections in group') - parser.add_argument('--retired', default=True, help="Copy retired instances in collection if True") - parser.add_argument('--src_bucket', default=f'idc-dev-{group}') - parser.add_argument('--dst_bucket', default=f'idc-open-idc') - parser.add_argument('--processes', default=128, help="Number of concurrent processes") - parser.add_argument('--batch', default=1000, help='Size of batch assigned to each process') - parser.add_argument('--src_project', default='idc-dev-etl') - parser.add_argument('--dst_project', default='canceridc-data') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/populate_{group}_bucket_v5_dicomstore_staging') - # parser.add_argument('--collection_list', default='./collection_list.txt') - parser.add_argument('--dones', default=f'./logs/populate_{group}_bucket_v{version}_dicomstore_staging_dones.txt') - - args = parser.parse_args() - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/{group}_buckets.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - precopy(args) - diff --git a/gcs/obsolete/validate_opens_bucket.py b/gcs/obsolete/validate_opens_bucket.py deleted file mode 100644 index ca61483..0000000 --- a/gcs/obsolete/validate_opens_bucket.py +++ /dev/null @@ -1,66 +0,0 @@ -# -# Copyright 2015-2021, Institute for Systems Biology -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -""" -Multiprocess script to validate that the instances in a bucket are only -those in some set of collections -""" - -import argparse -import os -import logging -from logging import INFO -rootlogger = logging.getLogger('root') -successlogger = logging.getLogger('success') -errlogger = logging.getLogger('root.err') - -from gcs.validate_bucket.validate_bucket_mp import pre_validate - - -if __name__ == '__main__': - version = 5 - - parser = argparse.ArgumentParser() - parser.add_argument('--version', default = version) - parser.add_argument('--project', default = 'idc-dev-etl') - parser.add_argument('--bqdataset', default=f'idc_v{version}') - parser.add_argument('--bucket', default='idc-open-pdp-staging') - parser.add_argument('--collection_table', default='open_collections', help='BQ table containing list of collections') - parser.add_argument('--blob_names', default='./logs/blobs.txt', help='List of blobs names in above collections') - parser.add_argument('--processes', default=16, help="Number of concurrent processes") - parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') - parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/validate_open_buckets') - - args = parser.parse_args() - - if not os.path.exists('{}'.format(args.log_dir)): - os.mkdir('{}'.format(args.log_dir)) - - rootlogger = logging.getLogger('root') - root_fh = logging.FileHandler(f'{os.environ["PWD"]}/logs/bucket.log') - rootformatter = logging.Formatter('%(levelname)s:root:%(message)s') - rootlogger.addHandler(root_fh) - root_fh.setFormatter(rootformatter) - rootlogger.setLevel(INFO) - - successlogger = logging.getLogger('success') - successlogger.setLevel(INFO) - - errlogger = logging.getLogger('root.err') - - - - pre_validate(args) diff --git a/gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py b/gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py new file mode 100644 index 0000000..30eb25c --- /dev/null +++ b/gcs/release_gcs_data/copy_staging_buckets_to_public_buckets.py @@ -0,0 +1,49 @@ +# +# Copyright 2015-2021, Institute for Systems Biology +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import argparse +from gcs.copy_bucket_mp import copy_all_instances +from utilities.logging_config import successlogger, progresslogger, errlogger + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--processes', default=1, help="Number of concurrent processes") + parser.add_argument('--batch', default=100, help='Size of batch assigned to each process') + parser.add_argument('--log_dir', default=f'/mnt/disks/idc-etl/logs/copy_bucket_mp') + + args = parser.parse_args() + + try: + # Create a set of previously copied blobs + dones = set(open(successlogger.handlers[0].baseFilename).read().splitlines()) + except: + dones = set([]) + + + args.src_bucket = 'idc-open-idc1-staging' + args.dst_bucket = 'idc-open-idc1' + copy_all_instances(args, dones) + + args.src_bucket = 'idc-open-cr-staging' + args.dst_bucket = 'idc-open-cr' + copy_all_instances(args, dones) + + args.src_bucket = 'public-datasets-idc-staging' + args.dst_bucket = 'public-datasets-idc' + copy_all_instances(args, dones) + + diff --git a/gcs/validate_bucket/validate_bucket_mp.py b/gcs/release_gcs_data/validate_bucket/validate_bucket_mp.py similarity index 100% rename from gcs/validate_bucket/validate_bucket_mp.py rename to gcs/release_gcs_data/validate_bucket/validate_bucket_mp.py diff --git a/gcs/validate_bucket/validate_idc_dev_cr.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_cr.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_cr.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_cr.py diff --git a/gcs/validate_bucket/validate_idc_dev_defaced.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_defaced.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_defaced.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_defaced.py diff --git a/gcs/validate_bucket/validate_idc_dev_excluded.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_excluded.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_excluded.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_excluded.py diff --git a/gcs/validate_bucket/validate_idc_dev_open.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_open.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_open.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_open.py diff --git a/gcs/validate_bucket/validate_idc_dev_redacted.py b/gcs/release_gcs_data/validate_bucket/validate_idc_dev_redacted.py similarity index 100% rename from gcs/validate_bucket/validate_idc_dev_redacted.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_dev_redacted.py diff --git a/gcs/validate_bucket/validate_idc_open_cr.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_cr.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_cr.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_cr.py diff --git a/gcs/validate_bucket/validate_idc_open_cr_staging.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_cr_staging.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_cr_staging.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_cr_staging.py diff --git a/gcs/validate_bucket/validate_idc_open_idc.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_idc.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_idc.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_idc.py diff 
--git a/gcs/validate_bucket/validate_idc_open_idc1.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_idc1.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1.py diff --git a/gcs/validate_bucket/validate_idc_open_idc1_staging.py b/gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1_staging.py similarity index 100% rename from gcs/validate_bucket/validate_idc_open_idc1_staging.py rename to gcs/release_gcs_data/validate_bucket/validate_idc_open_idc1_staging.py diff --git a/gcs/validate_bucket/validate_public_datasets_idc.py b/gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc.py similarity index 100% rename from gcs/validate_bucket/validate_public_datasets_idc.py rename to gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc.py diff --git a/gcs/validate_bucket/validate_public_datasets_idc_staging.py b/gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc_staging.py similarity index 100% rename from gcs/validate_bucket/validate_public_datasets_idc_staging.py rename to gcs/release_gcs_data/validate_bucket/validate_public_datasets_idc_staging.py diff --git a/validation/compare_hashes.py b/validation/compare_hashes.py index bc8ac8e..a9b0b90 100644 --- a/validation/compare_hashes.py +++ b/validation/compare_hashes.py @@ -363,7 +363,7 @@ def compare_hashes(args): parser.add_argument('--ignore_differing_patient_counts', default=True) parser.add_argument('--log_level', default=("collection, patient, study, series, instance"), help='Levels at which to log') - parser.add_argument('--collections', default=['RIDER Lung CT'], \ + parser.add_argument('--collections', default=['RIDER Pilot'], \ help='List of collections to compare. If empty, compare all collections') parser.add_argument('--skips', default=[])
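# A note on the resume pattern used by the new copy_staging_buckets_to_public_buckets.py
# above: the set of previously copied blobs is rebuilt from the success log's own file
# (successlogger.handlers[0].baseFilename), so a rerun skips work that already completed.
# A minimal sketch of that idea, with a hypothetical log path and blob list:

import logging

successlogger = logging.getLogger('success')
successlogger.setLevel(logging.INFO)
successlogger.addHandler(logging.FileHandler('/tmp/copy_success.log'))   # hypothetical path

try:
    # The success log doubles as the "dones" list on a rerun.
    dones = set(open(successlogger.handlers[0].baseFilename).read().splitlines())
except FileNotFoundError:
    dones = set()

for blob_name in ['a.dcm', 'b.dcm']:          # stand-in for the real blob list
    if blob_name in dones:
        continue
    # ... copy the blob here ...
    successlogger.info(blob_name)             # record success so a rerun skips it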